Example #1
def load_transifex_source_terms(glossary_file):
    """Loads a list of source terms with their comments and word classes as a
    defaultdict(GlossaryEntry) from the given Transifex glossary csv file."""
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    term_comment_index = 0
    wordclass_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            for column_counter, header in enumerate(row):
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header == 'pos':
                    wordclass_index = column_counter
        # Parse the entry
        else:
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.term_comment = row[term_comment_index].strip()
            entry.wordclass = row[wordclass_index].strip()
            result[entry.term] = entry
        counter = counter + 1
    return result
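
Every example on this page assumes a GlossaryEntry record and a read_csv_file helper that are defined elsewhere in the project. The following is a minimal sketch that would make Example #1 runnable; the attribute names are inferred from the usage above, and the CSV-handling details are an assumption:

from collections import defaultdict
import csv


class GlossaryEntry:
    """Hypothetical record; the attributes mirror the assignments above."""

    def __init__(self):
        self.term = ''
        self.term_comment = ''
        self.wordclass = ''
        self.translation = ''
        self.translation_comment = ''
        # Examples #3 and #6 store several accepted variants per entry.
        self.terms = []
        self.translations = []


def read_csv_file(filepath):
    """Read a csv file and return its rows as a list of string lists."""
    with open(filepath, newline='', encoding='utf-8') as csv_file:
        return list(csv.reader(csv_file))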
Example #2
def load_extracted_glossary(glossary_file, locale):
    """Build a defaultdict(GlossaryEntry) glossary from the given extracted
    glossary csv file for the given locale, raising an error for entries that
    have no translation."""
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    translation_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            for column_counter, header in enumerate(row):
                if header == 'source':
                    term_index = column_counter
                elif header == 'target':
                    translation_index = column_counter
        # If there is a translation, parse the entry
        elif row[translation_index].strip() != '':
            if translation_index == 0:
                raise Exception(
                    'Glossary extracted for %s contains no translations.' % locale)
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.translation = row[translation_index].strip()
            # Remove source information appended to fuzzy matches
            regex = re.compile(r'(.+)( \{.*\})(.*)')
            match = regex.match(entry.translation)
            while match:
                entry.translation = match.group(1) + match.group(3)
                match = regex.match(entry.translation)
            result[entry.term] = entry
        counter = counter + 1
    return result
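
The while loop above strips one trailing ' {...}' block per pass, which is why a single substitution is not enough when several blocks are appended. A small illustration with a hypothetical fuzzy entry:

import re

regex = re.compile(r'(.+)( \{.*\})(.*)')
translation = 'Burg {castle} {fortress}'  # hypothetical csv cell
match = regex.match(translation)
while match:
    translation = match.group(1) + match.group(3)
    match = regex.match(translation)
print(translation)  # prints 'Burg'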
Example #3
def check_file(csv_file, glossaries, locale, po_file):
    """Run the actual check."""
    translations = read_csv_file(csv_file)
    source_index = 0
    target_index = 0
    location_index = 0
    hits = []
    counter = 0
    hunspell_locale = get_hunspell_locale(locale)
    for row in translations:
        # Detect the column indices
        if counter == 0:
            for column_counter, header in enumerate(row):
                if header == 'source':
                    source_index = column_counter
                elif header == 'target':
                    target_index = column_counter
                elif header == 'location':
                    location_index = column_counter
        else:
            for entry in glossaries[locale][0]:
                # Check if the source text contains the glossary term.
                # Filter out superstrings, e.g. we don't want to check
                # "arena" against "battle arena"
                if source_contains_term(row[source_index], entry,
                                        glossaries[locale][0]):
                    # Skip empty translations
                    if row[target_index] == '':
                        continue
                    # Now verify the translation against all translation
                    # variations from the glossary
                    term_found = translation_has_term(entry, row[target_index])
                    # Add Hunspell stems for better matches and try again
                    # We do it here because the Hunspell manipulation is slow.
                    if not term_found and hunspell_locale != '':
                        target_to_check = append_hunspell_stems(
                            hunspell_locale, row[target_index])
                        term_found = translation_has_term(
                            entry, target_to_check)
                    if not term_found:
                        hit = FailedTranslation()
                        hit.source = row[source_index]
                        hit.target = row[target_index]
                        hit.location = row[location_index]
                        hit.term = entry.terms[0]
                        hit.translation = entry.translations[0]
                        hit.locale = locale
                        hit.po_file = po_file
                        hits.append(hit)
        counter = counter + 1
    return hits
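
check_file additionally assumes a FailedTranslation record and expects glossaries[locale][0] to hold the entry list built by load_glossary (Example #6). A minimal sketch of the record, with the fields inferred from the assignments above:

class FailedTranslation:
    """Hypothetical record for a single failed glossary check."""

    def __init__(self):
        self.source = ''       # source text of the translation unit
        self.target = ''       # translation that is missing the term
        self.location = ''     # location reference from the csv row
        self.term = ''         # glossary term that triggered the check
        self.translation = ''  # expected glossary translation
        self.locale = ''
        self.po_file = ''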
Example #4
def load_transifex_glossary(glossary_file, locale):
    """Build a defaultdict(GlossaryEntry) glossary from the given Transifex
    glossary csv file for the given locale.

    Include empty translations in the result.
    """
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    term_comment_index = 0
    translation_index = 0
    comment_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            for column_counter, header in enumerate(row):
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header == 'translation_' + locale or header == locale:
                    translation_index = column_counter
                elif header == 'comment_' + locale:
                    comment_index = column_counter
        # Parse the entry
        else:
            if translation_index == 0:
                raise Exception('Locale %s is missing from glossary file.' %
                                locale)
            if comment_index == 0:
                raise Exception(
                    'Comment field for locale %s is missing from glossary file.'
                    % locale)
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.term_comment = row[term_comment_index].strip()
            entry.translation = row[translation_index].strip()
            entry.translation_comment = row[comment_index].strip()
            result[entry.term] = entry
        counter = counter + 1
    return result
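
The translation_index == 0 and comment_index == 0 guards work because column 0 of a Transifex glossary export is the term column, so an index that is still 0 means the expected header was never found. For a hypothetical export covering German and French, the header row would look roughly like this:

term,pos,comment,translation_de,comment_de,translation_fr,comment_fr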
Example #5
def generate_glossary(po_dir, output_path, input_glossary, output_glossary,
                      only_locale):
    """Main loop.

    Uses poterminology from the Translate Toolkit to collect glossary entries for all files in 'po_dir' for the given 'only_locale'. If 'only_locale' = "all", processes all locales. Then reads the <input_glossary>, adds new entries that were obtained by the glossary generation if there are any gaps, and then writes the results to <output_glossary>.
    """

    # Find the locale files to process
    print('Locale: ' + only_locale)
    locales = []
    glossaries = defaultdict(list)

    if only_locale != 'all':
        locales.append(only_locale)
    else:
        # Get locales from the Transifex glossary file
        header_row = read_csv_file(input_glossary)[0]
        regex = re.compile('^(translation_)(.+)$')
        for header in header_row:
            match = regex.match(header)
            if match:
                locales.append(match.group(2))

    temp_path = make_path(output_path, 'temp_glossary')

    for locale in locales:
        print('Processing locale: ' + locale)
        # Generate the pot glossary
        input_path = po_dir + '/*/' + locale + '.po'
        pot_path = os.path.join(temp_path, 'glossary_' + locale + '.po')

        try:
            # We need shell=True for the wildcards.
            poterminology_result = check_output(
                'poterminology ' + input_path + ' -o ' + pot_path,
                stderr=subprocess.STDOUT,
                shell=True,
                universal_newlines=True)
            if 'Error' in poterminology_result:
                print('Error running poterminology:\n  FILE: ' + input_path +
                      '\n  OUTPUT PATH: ' + output_path + '\n  ' +
                      poterminology_result.split('\n', 1)[1])
                return False

        except CalledProcessError as error:
            print('Failed to run poterminology:\n  FILE: ' + input_path +
                  '\n  OUTPUT PATH: ' + output_path + '\n  ' +
                  error.output.split('\n', 1)[1])
            return False

        # Convert to csv for easy parsing
        csv_file = os.path.join(temp_path, 'glossary_' + locale + '.csv')
        call(['po2csv', '--progress=none', pot_path, csv_file])
        # The po file is no longer needed, delete it.
        os.remove(pot_path)

        transifex_glossary = load_transifex_glossary(input_glossary, locale)
        extracted_glossary = load_extracted_glossary(csv_file, locale)

        # Add the generated translation if necessary
        for key in transifex_glossary.keys():
            if (transifex_glossary[key].translation == ''
                    and key in extracted_glossary):
                extracted_entry = extracted_glossary[key]
                if extracted_entry.translation != '':
                    transifex_entry = transifex_glossary[key]
                    transifex_entry.translation = extracted_entry.translation
                    transifex_entry.translation_comment = 'AUTOGENERATED - PLEASE PROOFREAD!'
                    transifex_glossary[key] = transifex_entry
        glossaries[locale] = transifex_glossary

    # Now collect the data for the global csv file
    # Write header
    print('Writing results to ' + output_glossary)
    result = 'term,pos,comment,'
    for locale in locales:
        result = result + 'translation_' + locale + ','
        result = result + 'comment_' + locale + ','
    result = result[0:-1] + '\n'

    source_terms = load_transifex_source_terms(input_glossary)
    # Collect all translations for each source term
    for key in source_terms:
        result = result + '"%s","%s","%s",' % (source_terms[key].term.replace(
            '"', '""'), source_terms[key].wordclass.replace(
                '"', '""'), source_terms[key].term_comment.replace('"', '""'))
        for locale in locales:
            glossary = glossaries[locale]
            translation = ''
            translation_comment = ''
            if key in glossary:
                translation = glossary[key].translation.replace('"', '""')
                translation_comment = glossary[key].translation_comment.replace(
                    '"', '""')
            result = result + '"%s","%s",' % (translation, translation_comment)
        result = result[0:-1] + '\n'

    # Now write the file.
    with open(output_glossary, 'wt') as dest_file:
        dest_file.write(result)

    # Cleanup.
    delete_path(temp_path)
    if not os.listdir(output_path):
        os.rmdir(output_path)
    print('Done.')
    return 0
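
A usage sketch under stated assumptions: the paths are hypothetical, and the real script presumably wires these arguments up from a command-line parser that is not shown on this page.

if __name__ == '__main__':
    # Regenerate the merged glossary for German only (hypothetical paths).
    generate_glossary(po_dir='po',
                      output_path='glossary_temp',
                      input_glossary='glossary.csv',
                      output_glossary='glossary_new.csv',
                      only_locale='de')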
Example #6
def load_glossary(glossary_file, locale):
    """Build a glossary from the given Transifex glossary csv file for the
    given locale."""
    result = []
    counter = 0
    term_index = 0
    term_comment_index = 0
    wordclass_index = 0
    translation_index = 0
    comment_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            for column_counter, header in enumerate(row):
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header == 'pos':
                    wordclass_index = column_counter
                elif header == 'translation_' + locale or header == locale:
                    translation_index = column_counter
                elif header == 'comment_' + locale:
                    comment_index = column_counter
        # If there is a translation, parse the entry
        # We also have some obsolete terms in the glossary that we want to
        # filter out.
        elif (len(row[translation_index].strip()) > 0
              and not row[term_comment_index].startswith('OBSOLETE')):
            if translation_index == 0:
                raise Exception(
                    'Locale %s is missing from glossary file.' % locale)
            if comment_index == 0:
                raise Exception(
                    'Comment field for locale %s is missing from glossary file.' % locale)
            entry = GlossaryEntry()
            entry.terms.append(row[term_index].strip())
            if row[wordclass_index] == 'Noun':
                plural = make_english_plural(entry.terms[0])
                if len(plural) > 0:
                    entry.terms.append(plural)
            elif row[wordclass_index] == 'Verb':
                verb_forms = make_english_verb_forms(entry.terms[0])
                for verb_form in verb_forms:
                    entry.terms.append(verb_form)

            entry.translations.append(row[translation_index].strip())

            # Misuse the comment field to provide a list of inflected forms.
            # Otherwise, we would get tons of false positive hits in the checks
            # later on and the translators would have our heads on a platter.
            delimiter = '|'
            if len(row[comment_index].strip()) > 1 and delimiter in row[comment_index]:
                inflections = row[comment_index].split(delimiter)
                for inflection in inflections:
                    entry.translations.append(inflection.strip())

            result.append(entry)
        counter = counter + 1
    return result
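
To illustrate the comment-field convention: with a hypothetical German row whose comment cell lists inflected forms separated by '|', the entry ends up matching any of the variants.

translation_cell = 'Burg'        # hypothetical translation column
comment_cell = 'Burgen|Festung'  # hypothetical comment column
translations = [translation_cell.strip()]
if len(comment_cell.strip()) > 1 and '|' in comment_cell:
    for inflection in comment_cell.split('|'):
        translations.append(inflection.strip())
print(translations)  # prints ['Burg', 'Burgen', 'Festung']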