def load_transifex_source_terms(glossary_file):
    """Load the source terms with their comments and word classes as a
    defaultdict(GlossaryEntry) from the given Transifex glossary csv file."""
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    term_comment_index = 0
    wordclass_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header == 'pos':
                    wordclass_index = column_counter
                column_counter = column_counter + 1
        # Parse the entry
        else:
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.term_comment = row[term_comment_index].strip()
            entry.wordclass = row[wordclass_index].strip()
            result[entry.term] = entry
        counter = counter + 1
    return result
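# Illustrative sketch (not part of the tool): the loaders in this file locate
# columns by header name rather than by fixed position, so the column order of
# the Transifex export may vary. The sample data below is invented; only the
# header names 'term', 'pos' and 'comment' come from the loaders above.
import csv
import io

_sample = 'term,pos,comment,translation_de\nbarbarians,Noun,A tribe,Barbaren\n'
_rows = list(csv.reader(io.StringIO(_sample)))
_indices = {header: column for column, header in enumerate(_rows[0])}
assert _indices['pos'] == 1  # found by name, independent of column order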
def load_extracted_glossary(glossary_file, locale):
    """Build a defaultdict(GlossaryEntry) glossary from the given extracted
    glossary csv file for the given locale, raising an error if the file
    contains no translations at all."""
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    translation_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'source':
                    term_index = column_counter
                elif header == 'target':
                    translation_index = column_counter
                column_counter = column_counter + 1
        # If there is a translation, parse the entry
        elif row[translation_index].strip() != '':
            if translation_index == 0:
                raise Exception(
                    'Glossary extracted for %s contains no translations.' % locale)
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.translation = row[translation_index].strip()
            # Remove source information from fuzzy matches
            regex = re.compile(r'(.+)( \{.*\})(.*)')
            match = regex.match(entry.translation)
            while match:
                entry.translation = match.group(1) + match.group(3)
                match = regex.match(entry.translation)
            result[entry.term] = entry
        counter = counter + 1
    return result
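# Minimal sketch of the fuzzy-match cleanup above, with an invented sample
# string. The exact brace format is an assumption inferred from the regex:
# each trailing ' {...}' block is stripped from the translation, one per pass.
import re

_fuzzy = 'Burg {castle} {fuzzy}'
_regex = re.compile(r'(.+)( \{.*\})(.*)')
_match = _regex.match(_fuzzy)
while _match:
    _fuzzy = _match.group(1) + _match.group(3)
    _match = _regex.match(_fuzzy)
assert _fuzzy == 'Burg'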
def check_file(csv_file, glossaries, locale, po_file):
    """Run the actual check."""
    translations = read_csv_file(csv_file)
    source_index = 0
    target_index = 0
    location_index = 0
    hits = []
    counter = 0
    hunspell_locale = get_hunspell_locale(locale)
    for row in translations:
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'source':
                    source_index = column_counter
                elif header == 'target':
                    target_index = column_counter
                elif header == 'location':
                    location_index = column_counter
                column_counter = column_counter + 1
        else:
            for entry in glossaries[locale][0]:
                # Check if the source text contains the glossary term.
                # Filter out superstrings, e.g. we don't want to check
                # 'arena' against 'battle arena'.
                if source_contains_term(row[source_index], entry,
                                        glossaries[locale][0]):
                    # Skip empty translations
                    if row[target_index] == '':
                        continue
                    # Now verify the translation against all translation
                    # variations from the glossary
                    term_found = translation_has_term(entry, row[target_index])
                    # Add Hunspell stems for better matches and try again.
                    # We do it here because the Hunspell manipulation is slow.
                    if not term_found and hunspell_locale != '':
                        target_to_check = append_hunspell_stems(
                            hunspell_locale, row[target_index])
                        term_found = translation_has_term(entry, target_to_check)
                    if not term_found:
                        hit = FailedTranslation()
                        hit.source = row[source_index]
                        hit.target = row[target_index]
                        hit.location = row[location_index]
                        hit.term = entry.terms[0]
                        hit.translation = entry.translations[0]
                        hit.locale = locale
                        hit.po_file = po_file
                        hits.append(hit)
        counter = counter + 1
    return hits
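# Illustrative sketch only (an assumption, not the real source_contains_term
# logic): it shows why the superstring filter above exists. A plain substring
# test would make the term 'arena' fire inside the unrelated, longer glossary
# term 'battle arena', so such sources have to be skipped for the short term.
_source = 'Fight in the battle arena'
assert 'arena' in _source                       # naive substring: false hit
_longer_terms = ['battle arena']                # other glossary entries
assert any(term in _source for term in _longer_terms)  # so skip this source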
def load_transifex_glossary(glossary_file, locale):
    """Build a defaultdict(GlossaryEntry) glossary from the given Transifex
    glossary csv file for the given locale.

    Empty translations are included in the result.
    """
    result = defaultdict(GlossaryEntry)
    counter = 0
    term_index = 0
    term_comment_index = 0
    translation_index = 0
    comment_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header in ('translation_' + locale, locale):
                    translation_index = column_counter
                elif header == 'comment_' + locale:
                    comment_index = column_counter
                column_counter = column_counter + 1
        # Parse the entry
        else:
            if translation_index == 0:
                raise Exception(
                    'Locale %s is missing from glossary file.' % locale)
            if comment_index == 0:
                raise Exception(
                    'Comment field for locale %s is missing from glossary file.'
                    % locale)
            entry = GlossaryEntry()
            entry.term = row[term_index].strip()
            entry.term_comment = row[term_comment_index].strip()
            entry.translation = row[translation_index].strip()
            entry.translation_comment = row[comment_index].strip()
            result[entry.term] = entry
        counter = counter + 1
    return result
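# Self-contained sketch of why the loaders return defaultdict(GlossaryEntry):
# looking up a term that is not in the glossary yields a fresh, empty entry
# instead of raising KeyError. The stub class below is an assumption that
# mirrors the attributes used above; the real GlossaryEntry lives elsewhere.
from collections import defaultdict


class _GlossaryEntryStub:
    def __init__(self):
        self.term = ''
        self.translation = ''


_glossary = defaultdict(_GlossaryEntryStub)
assert _glossary['no such term'].translation == ''  # no KeyError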
def generate_glossary(po_dir, output_path, input_glossary, output_glossary,
                      only_locale):
    """Main loop.

    Uses poterminology from the Translate Toolkit to collect glossary
    entries for all files in 'po_dir' for the given 'only_locale'. If
    'only_locale' = 'all', processes all locales. Then reads the
    <input_glossary>, adds new entries that were obtained by the glossary
    generation if there are any gaps, and writes the results to
    <output_glossary>.
    """
    # Find the locale files to process
    print('Locale: ' + only_locale)
    locales = []
    glossaries = defaultdict(list)
    if only_locale != 'all':
        locales.append(only_locale)
    else:
        # Get locales from the Transifex glossary file
        header_row = read_csv_file(input_glossary)[0]
        regex = re.compile('^(translation_)(.+)$')
        for header in header_row:
            match = regex.match(header)
            if match:
                locales.append(match.group(2))

    temp_path = make_path(output_path, 'temp_glossary')
    for locale in locales:
        print('Processing locale: ' + locale)
        # Generate the pot glossary
        input_path = po_dir + '/*/' + locale + '.po'
        pot_path = os.path.join(temp_path, 'glossary_' + locale + '.po')
        try:
            # We need shell=True for the wildcards.
            poterminology_result = check_output(
                ['poterminology ' + input_path + ' -o ' + pot_path],
                stderr=subprocess.STDOUT,
                shell=True,
                universal_newlines=True)
            if 'Error' in poterminology_result:
                print('Error running poterminology:\n FILE: ' + input_path +
                      '\n OUTPUT PATH: ' + output_path + '\n ' +
                      poterminology_result.split('\n', 1)[1])
                return False
        except CalledProcessError as err:
            print('Failed to run poterminology:\n FILE: ' + input_path +
                  '\n OUTPUT PATH: ' + output_path + '\n ' +
                  err.output.split('\n', 1)[1])
            return False

        # Convert to csv for easy parsing
        csv_file = os.path.join(temp_path, 'glossary_' + locale + '.csv')
        call(['po2csv', '--progress=none', pot_path, csv_file])

        # The po file is no longer needed; delete it.
        os.remove(pot_path)

        transifex_glossary = load_transifex_glossary(input_glossary, locale)
        extracted_glossary = load_extracted_glossary(csv_file, locale)

        # Add generated translations where the Transifex glossary has gaps
        for key in transifex_glossary.keys():
            if transifex_glossary[key].translation == '' and key in extracted_glossary:
                extracted_entry = extracted_glossary[key]
                if extracted_entry.translation != '':
                    transifex_entry = transifex_glossary[key]
                    transifex_entry.translation = extracted_entry.translation
                    transifex_entry.translation_comment = 'AUTOGENERATED - PLEASE PROOFREAD!'
                    transifex_glossary[key] = transifex_entry
        glossaries[locale] = transifex_glossary

    # Now collect the data for the global csv file
    # Write header
    print('Writing results to ' + output_glossary)
    result = 'term,pos,comment,'
    for locale in locales:
        result = result + 'translation_' + locale + ','
        result = result + 'comment_' + locale + ','
    result = result[0:-1] + '\n'

    source_terms = load_transifex_source_terms(input_glossary)
    # Collect all translations for each source term
    for key in source_terms:
        result = result + '"%s","%s","%s",' % (
            source_terms[key].term.replace('"', '""'),
            source_terms[key].wordclass.replace('"', '""'),
            source_terms[key].term_comment.replace('"', '""'))
        for locale in locales:
            glossary = glossaries[locale]
            translation = ''
            translation_comment = ''
            if key in glossary:
                translation = glossary[key].translation.replace('"', '""')
                translation_comment = glossary[key].translation_comment.replace(
                    '"', '""')
            result = result + '"%s","%s",' % (translation, translation_comment)
        result = result[0:-1] + '\n'

    # Now write the file.
    with open(output_glossary, 'wt') as dest_file:
        dest_file.write(result)

    # Cleanup.
    delete_path(temp_path)
    if not os.listdir(output_path):
        os.rmdir(output_path)

    print('Done.')
    return 0
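# Aside (a sketch, not part of the tool): generate_glossary escapes values by
# doubling any double quote, which is the standard csv quoting rule. The
# stdlib csv module applies the same rule automatically:
import csv
import io

_buffer = io.StringIO()
csv.writer(_buffer, quoting=csv.QUOTE_ALL).writerow(['say "hi"', 'ok'])
assert _buffer.getvalue().strip() == '"say ""hi""","ok"'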
def load_glossary(glossary_file, locale):
    """Build a glossary from the given Transifex glossary csv file for the
    given locale."""
    result = []
    counter = 0
    term_index = 0
    term_comment_index = 0
    wordclass_index = 0
    translation_index = 0
    comment_index = 0
    for row in read_csv_file(glossary_file):
        # Detect the column indices
        if counter == 0:
            column_counter = 0
            for header in row:
                if header == 'term':
                    term_index = column_counter
                elif header == 'comment':
                    term_comment_index = column_counter
                elif header == 'pos':
                    wordclass_index = column_counter
                elif header in ('translation_' + locale, locale):
                    translation_index = column_counter
                elif header == 'comment_' + locale:
                    comment_index = column_counter
                column_counter = column_counter + 1
        # If there is a translation, parse the entry.
        # We also have some obsolete terms in the glossary that we want to
        # filter out.
        elif len(row[translation_index].strip()) > 0 and not row[term_comment_index].startswith('OBSOLETE'):
            if translation_index == 0:
                raise Exception(
                    'Locale %s is missing from glossary file.' % locale)
            if comment_index == 0:
                raise Exception(
                    'Comment field for locale %s is missing from glossary file.'
                    % locale)
            entry = GlossaryEntry()
            entry.terms.append(row[term_index].strip())
            if row[wordclass_index] == 'Noun':
                plural = make_english_plural(entry.terms[0])
                if len(plural) > 0:
                    entry.terms.append(plural)
            elif row[wordclass_index] == 'Verb':
                verb_forms = make_english_verb_forms(entry.terms[0])
                for verb_form in verb_forms:
                    entry.terms.append(verb_form)
            entry.translations.append(row[translation_index].strip())
            # Misuse the comment field to provide a list of inflected forms.
            # Otherwise, we would get tons of false positive hits in the
            # checks later on and the translators would have our heads on a
            # platter.
            delimiter = '|'
            if len(row[comment_index].strip()) > 1 and delimiter in row[comment_index]:
                inflections = row[comment_index].split(delimiter)
                for inflection in inflections:
                    entry.translations.append(inflection.strip())
            result.append(entry)
        counter = counter + 1
    return result
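# Minimal sketch of the comment-field convention above, with invented data:
# translators list inflected forms in the per-locale comment, separated by
# '|', and each form becomes an accepted translation variant in the check.
_comment_field = 'Burgen | Bürgen'
_variants = ['Burg'] + [form.strip() for form in _comment_field.split('|')]
assert _variants == ['Burg', 'Burgen', 'Bürgen']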