def test_addtranslation(self):
    """Round-trip one translation through ``str()`` serialisation.

    Adds a single en -> af pair, serialises the store, re-parses it with
    ``self.tmxparse`` and checks that the parsed copy still translates the
    source string.
    """
    store = tmx.tmxfile()
    store.addtranslation(
        "A string of characters", "en", "'n String karakters", "af"
    )
    serialised = str(store)
    reparsed = self.tmxparse(serialised)
    # Echo the serialised store after parsing, as the test always did.
    print(serialised)
    expected = "'n String karakters"
    assert reparsed.translate("A string of characters") == expected
def test_translate(self):
    """Check ``translate()`` on an empty store and after adding one pair."""
    store = tmx.tmxfile()
    # Nothing has been added yet, so the lookup must come back empty.
    assert store.translate("Anything") is None

    source = "A string of characters"
    target = "'n String karakters"
    store.addtranslation(source, "en", target, "af")
    assert store.translate(source) == target
def export(self, rotate=False):
    """Export every translated unit of the context as a zipped TMX file.

    Units in state ``TRANSLATED`` from all live stores are added to a new
    TMX store, which is serialised and written as the single member of a
    zip archive at ``self.abs_filepath``.

    :param rotate: when true, delete files in ``self.directory`` that pass
        ``self.check_tp`` and are neither the new export nor the previously
        exported file.
    :return: tuple ``(self.abs_filepath, removed)`` where ``removed`` is the
        list of file paths deleted by the rotation (empty if ``rotate`` is
        false).
    """
    source_language = self.context.project.source_language.code
    target_language = self.context.language.code

    if not os.path.exists(self.directory):
        os.makedirs(self.directory)

    tmxfile = tmx.tmxfile()
    for store in self.context.stores.live().iterator():
        for unit in store.units.filter(state=TRANSLATED):
            tmxfile.addtranslation(unit.source, source_language,
                                   unit.target, target_language,
                                   unit.developer_comment)

    bs = BytesIO()
    tmxfile.serialize(bs)

    # Drop only a literal ".zip" suffix for the archive member name.
    # str.rstrip('.zip') would strip any trailing run of the characters
    # ".", "z", "i" and "p", which can eat into the base name.
    member_name = self.filename
    if member_name.endswith('.zip'):
        member_name = member_name[:-len('.zip')]

    with open(self.abs_filepath, "wb") as f:
        with ZipFile(f, "w") as zf:
            zf.writestr(member_name, bs.getvalue())

    # Read the previous export path *before* the revision is updated.
    last_exported_filepath = self.last_exported_file_path
    self.update_exported_revision()

    removed = []
    if rotate:
        keep = [self.abs_filepath, last_exported_filepath]
        for fn in os.listdir(self.directory):
            # Skip files from other projects.
            if not self.check_tp(fn):
                continue
            filepath = os.path.join(self.directory, fn)
            if filepath not in keep:
                removed.append(filepath)
                os.remove(filepath)

    return self.abs_filepath, removed
def export_BilingualCorpus2File(file_url, sentences, file_type, s_lang, t_lang):
    """Write a bilingual corpus to ``file_url`` in the requested format.

    :param file_url: path of the file to write.
    :param sentences: iterable of objects with ``source`` and ``target``
        attributes.
    :param file_type: one of ``'txt'``, ``'csv'``, ``'tmx'`` or ``'xlsx'``;
        any other value writes nothing.
    :param s_lang: source language code (only used for ``'tmx'``).
    :param t_lang: target language code (only used for ``'tmx'``).
    :return: the base name of ``file_url``.
    """
    if file_type == 'txt':
        with open(file_url, 'w', encoding='utf-8') as fout:
            for sentence in sentences:
                fout.write(sentence.source + '|' + sentence.target + '\n')
    elif file_type == 'csv':
        # newline='' lets the csv module control line endings itself;
        # without it extra blank rows appear on platforms that translate
        # "\n" on write.
        # NOTE(review): the file is opened in append mode, unlike the other
        # formats which overwrite - kept as-is, confirm this is intended.
        with open(file_url, 'a', encoding='utf-8', newline='') as csvfile:
            writeCSV = csv.writer(csvfile, delimiter=',', dialect='excel')
            for sentence in sentences:
                writeCSV.writerow([sentence.source, sentence.target])
    elif file_type == 'tmx':
        tmx_file = tmxfile()
        for sentence in sentences:
            tmx_file.addtranslation(sentence.source, s_lang,
                                    sentence.target, t_lang)
        tmx_file.savefile(file_url)
    elif file_type == 'xlsx':
        # Fill the freshly created sheet directly; saving the empty workbook
        # and loading it back first is unnecessary.
        dst_wb = openpyxl.Workbook()
        dst_ws = dst_wb['Sheet']
        dst_ws.title = 'transmem'
        for row, sentence in enumerate(sentences, start=1):
            dst_ws.cell(row, 1).value = sentence.source
            dst_ws.cell(row, 2).value = sentence.target
        dst_wb.save(file_url)
    return os.path.basename(file_url)
def test_withnewlines(self):
    """Round-trip a translation containing newlines via ``bytes()``."""
    source = "First line\nSecond line"
    target = "Eerste lyn\nTweede lyn"

    store = tmx.tmxfile()
    store.addtranslation(source, "en", target, "af")

    serialised = bytes(store)
    reparsed = self.tmxparse(serialised)
    print(serialised)
    # The embedded newline must survive serialisation and parsing.
    assert reparsed.translate(source) == target
def test_withnewlines(self):
    """Round-trip a translation containing newlines via ``str()``."""
    source = "First line\nSecond line"
    target = "Eerste lyn\nTweede lyn"

    store = tmx.tmxfile()
    store.addtranslation(source, "en", target, "af")

    serialised = str(store)
    reparsed = self.tmxparse(serialised)
    print(serialised)
    # The embedded newline must survive serialisation and parsing.
    assert reparsed.translate(source) == target
def po2tmx(self, posource, sourcelanguage='en', targetlanguage='af'):
    """Convert PO source text to a TMX store without touching the disk.

    The PO text is wrapped in an in-memory file, run through
    ``po2tmx.convertpo`` and the ``tmxfile`` attached to the in-memory
    output object is returned.
    """
    po_input = wStringIO.StringIO(posource)
    tmx_output = wStringIO.StringIO()
    # The store is attached to the output object before the conversion and
    # handed back afterwards; presumably convertpo() fills it in.
    tmx_output.tmxfile = tmx.tmxfile(inputfile=None,
                                     sourcelanguage=sourcelanguage)
    po2tmx.convertpo(
        po_input,
        tmx_output,
        templatefile=None,
        sourcelanguage=sourcelanguage,
        targetlanguage=targetlanguage,
    )
    return tmx_output.tmxfile
def test_addtranslation(self):
    """Round-trip one translation through ``bytes()`` serialisation.

    Adds a single en -> af pair, serialises the store, re-parses it with
    ``self.tmxparse`` and checks that the parsed copy still translates the
    source string.
    """
    store = tmx.tmxfile()
    store.addtranslation(
        "A string of characters", "en", "'n String karakters", "af"
    )
    serialised = bytes(store)
    reparsed = self.tmxparse(serialised)
    # Echo the serialised store after parsing, as the test always did.
    print(serialised)
    expected = "'n String karakters"
    assert reparsed.translate("A string of characters") == expected
def split(f_input, orig_lang, dest_lang, f_output, num_entries):
    """Split the units of a TMX file across numbered PO files.

    The units are grouped with ``chunks(units, num_entries)`` and each group
    is written with ``generate_po_from_tmx`` to ``f_output + N + ".po"``.
    When the file holds fewer than ``num_entries`` units the first output
    has no number (``f_output + ".po"``); otherwise numbering starts at 1.
    The size of every group is printed.
    """
    with open(f_input, 'rb') as handle:
        tmx_file = tmxfile(handle, orig_lang, dest_lang)

    units = tmx_file.units
    # 0 stands for "no numeric suffix" on the first output file.
    counter = 0 if len(units) < num_entries else 1
    for batch in chunks(units, num_entries):
        print(len(batch))
        suffix = str(counter) if counter else ""
        generate_po_from_tmx(f_output + suffix + ".po", batch)
        counter += 1
def convert_Text2Tmx(self, src, dst):
    """Convert a delimited text file into an en -> th TMX file.

    Each non-blank line of ``src`` is split on the externally defined
    ``delimiter``; the first field becomes the English source and the second
    the Thai target. The result is saved to ``dst`` and a confirmation
    dialog is shown.

    :param src: path of the UTF-8 text file to read.
    :param dst: path of the TMX file to write.
    :raises IndexError: if a non-blank line does not contain ``delimiter``.
    """
    tmx_file = tmxfile()
    # Use a context manager so the input handle is always closed.
    with open(src, encoding='utf-8') as fin:
        lines = fin.read().strip().split('\n')
    for line in lines:
        # An empty file (or a blank line) has no fields to convert.
        if not line:
            continue
        s = line.split(delimiter)
        tmx_file.addtranslation(s[0], "en", s[1], "th")
    tmx_file.savefile(dst)
    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def convert_Tmx2Text(self, src, dst):
    """Convert an en -> th TMX file into a delimited text file.

    Every unit of ``src`` is written to ``dst`` as
    ``source + delimiter + target`` on its own line (``delimiter`` is
    defined outside this method), then a confirmation dialog is shown.

    :param src: path of the TMX file to read.
    :param dst: path of the UTF-8 text file to write.
    """
    # Both handles are managed by ``with`` so they are closed even if
    # parsing or writing raises.
    with open(dst, 'w', encoding='utf-8') as dst_file:
        with open(src, 'rb') as fin:
            tmx_file = tmxfile(fin, 'en', 'th')
            for node in tmx_file.unit_iter():
                dst_file.write(
                    node.getsource() + delimiter + node.gettarget() + '\n')
    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def test_withcomment(self):
    """Check that a comment passed to addtranslation() survives a round trip."""
    source = "A string of chars"

    store = tmx.tmxfile()
    store.addtranslation(source, "en", "'n String karakters", "af", "comment")

    serialised = str(store)
    reparsed = self.tmxparse(serialised)
    print(serialised)

    unit = reparsed.findunit(source)
    assert unit.getnotes() == "comment"
def test_withcomment(self):
    """tests that addtranslation() stores string's comments correctly"""
    tmxfile = tmx.tmxfile()
    tmxfile.addtranslation("A string of chars", "en",
                           "'n String karakters", "af", "comment")
    newfile = self.tmxparse(str(tmxfile))
    # Call form of print: with a single argument it behaves the same on
    # Python 2 and is valid syntax on Python 3, unlike the print statement.
    print(str(tmxfile))
    assert newfile.findunit("A string of chars").getnotes() == "comment"
def convert_Xliff2Tmx(self, src, dst):
    """Convert an XLIFF file into an en -> th TMX file.

    The XLIFF text is read from ``src`` and parsed with
    ``xlifffile.parsestring``; each unit's source/target pair is added to a
    new TMX store which is saved to ``dst``. A confirmation dialog is shown
    afterwards.

    :param src: path of the UTF-8 XLIFF file to read.
    :param dst: path of the TMX file to write.
    """
    # Context manager so the input handle is closed once the text is read.
    with open(src, 'r', encoding="utf-8") as fin:
        data = fin.read()
    xliff_file = xlifffile.parsestring(data)
    tmx_file = tmxfile()
    for node in xliff_file.unit_iter():
        tmx_file.addtranslation(node.source, "en", node.target, "th")
    tmx_file.savefile(dst)
    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def po2tmx(self, posource, sourcelanguage='en', targetlanguage='af', comment=None):
    """Convert PO source text to a TMX store without touching the disk.

    The PO text is UTF-8 encoded into an in-memory file, run through
    ``po2tmx.convertpo`` (forwarding ``comment``) and the ``tmxfile``
    attached to the in-memory output object is returned.
    """
    po_input = BytesIO(posource.encode('utf-8'))
    tmx_output = BytesIO()
    # The store is attached to the output object before the conversion and
    # handed back afterwards; presumably convertpo() fills it in.
    tmx_output.tmxfile = tmx.tmxfile(inputfile=None,
                                     sourcelanguage=sourcelanguage)
    po2tmx.convertpo(
        po_input,
        tmx_output,
        templatefile=None,
        sourcelanguage=sourcelanguage,
        targetlanguage=targetlanguage,
        comment=comment,
    )
    return tmx_output.tmxfile
def test_controls_cleaning(self):
    """Check how control characters in a source string survive a round trip.

    After serialising and re-parsing, the unit added with a ``\\x03``
    character is expected to be found without it, while the unit containing
    a newline is found unchanged.
    """
    store = tmx.tmxfile()
    store.addtranslation("Client Version:\x0314 %s", "en", "test one", "ar")
    store.addtranslation("Client Version:\n%s", "en", "test two", "ar")

    serialised = bytes(store)
    reparsed = self.tmxparse(serialised)
    print(serialised)

    expectations = (
        ("Client Version:14 %s", "test one"),
        ("Client Version:\n%s", "test two"),
    )
    for source, target in expectations:
        assert reparsed.translate(source) == target
def get_parallel_corpus():
    """Load the en/he TMX corpus stored next to this module.

    The file named by ``WIKIPEDIA_FILE`` is resolved relative to this
    module's real directory.

    :return: list of ``MultiLingualSentence`` objects, each built from a
        unit's target followed by its source.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    corpus_path = os.path.join(module_dir, WIKIPEDIA_FILE)
    with open(corpus_path, 'rb') as handle:
        corpus = tmxfile(handle, 'en', 'he')
        # Note the argument order: target first, then source.
        return [
            MultiLingualSentence(unit.gettarget(), unit.getsource())
            for unit in corpus.unit_iter()
        ]
def __init__(self, filename, mode=None):
    """Set up the multifile state for ``filename``.

    When ``mode`` is not given it defaults to ``'r'`` if ``filename`` already
    exists and to ``'w'`` otherwise. ``multifilename`` is ``filename``
    without its extension, and an empty ``tmx.tmxfile`` is created.
    """
    if mode is None:
        mode = 'r' if os.path.exists(filename) else 'w'
    self.filename = filename
    self.mode = mode
    self.multifilename = os.path.splitext(filename)[0]
    # NOTE: the file itself is not opened here; only an empty store is made.
    self.tmxfile = tmx.tmxfile()
def test_xmlentities(self):
    """Test that the xml entities '&' and '<' are escaped correctly"""
    # Derive the escaped spellings with the standard library so the expected
    # markup is unambiguous.
    from xml.sax.saxutils import escape

    tmxfile = tmx.tmxfile()
    tmxfile.addtranslation("Mail & News", "en", "Nuus & pos", "af")
    tmxfile.addtranslation("Five < ten", "en", "Vyf < tien", "af")
    xmltext = bytes(tmxfile).decode("utf-8")
    print("The generated xml:")
    print(xmltext)

    for source, target in (("Mail & News", "Nuus & pos"),
                           ("Five < ten", "Vyf < tien")):
        # The API hands back the plain, unescaped text...
        assert tmxfile.translate(source) == target
        # ...while the serialised XML must contain the escaped form and
        # must not contain the raw one. (The old assertions required the
        # same raw string to be both present and absent.)
        assert escape(source) in xmltext
        assert source not in xmltext
def test_xmlentities(self):
    """Test that the xml entities '&' and '<' are escaped correctly"""
    # Derive the escaped spellings with the standard library so the expected
    # markup is unambiguous.
    from xml.sax.saxutils import escape

    tmxfile = tmx.tmxfile()
    tmxfile.addtranslation("Mail & News", "en", "Nuus & pos", "af")
    tmxfile.addtranslation("Five < ten", "en", "Vyf < tien", "af")
    xmltext = str(tmxfile)
    print("The generated xml:")
    print(xmltext)

    for source, target in (('Mail & News', 'Nuus & pos'),
                           ('Five < ten', 'Vyf < tien')):
        # The API hands back the plain, unescaped text...
        assert tmxfile.translate(source) == target
        # ...while the serialised XML must contain the escaped form and
        # must not contain the raw one. (The old assertions required the
        # same raw string to be both present and absent.)
        assert escape(source) in xmltext
        assert source not in xmltext
def load_tmx_file(file, source_language=None, target_language=None):
    """
    Loads the tmx file

    :param file: The tmx memory file to open
    :param source_language: The source language we are translating;
        defaults to 'en-GB' when not given
    :param target_language: The target language we are translating to;
        defaults to 'fr-FR' when not given
    :return: The tmx file XML file as a translation.storage.tmx object
    """
    # Honour the caller's languages instead of always passing the
    # hard-coded pair; the old values remain the defaults.
    source_language = source_language or 'en-GB'
    target_language = target_language or 'fr-FR'
    with open(file, 'rb') as tmx_handle:
        # TODO This does not affect what is loaded
        tmx_file = tmxfile(tmx_handle, source_language, target_language)
    return tmx_file
def test_xmlentities(self):
    """Test that the xml entities '&' and '<' are escaped correctly"""
    # Derive the escaped spellings with the standard library so the expected
    # markup is unambiguous.
    from xml.sax.saxutils import escape

    tmxfile = tmx.tmxfile()
    tmxfile.addtranslation("Mail & News", "en", "Nuus & pos", "af")
    tmxfile.addtranslation("Five < ten", "en", "Vyf < tien", "af")
    xmltext = str(tmxfile)
    # Call form of print: with a single argument it behaves the same on
    # Python 2 and is valid syntax on Python 3, unlike the print statement.
    print("The generated xml:")
    print(xmltext)

    for source, target in (('Mail & News', 'Nuus & pos'),
                           ('Five < ten', 'Vyf < tien')):
        # The API hands back the plain, unescaped text...
        assert tmxfile.translate(source) == target
        # ...while the serialised XML must contain the escaped form and
        # must not contain the raw one. (The old assertions required the
        # same raw string to be both present and absent.)
        assert escape(source) in xmltext
        assert source not in xmltext
def test_load_tmx_file(self, source_language=None, target_language=None):
    """
    Checks that the sample en/es TMX file parses and that the id of its
    first unit is the expected heading.

    :param source_language: Not used by this test
    :param target_language: Not used by this test
    """
    sample_path = 'Tests/Data/en_es.tmx'
    expected_first_id = "CONVENTION ON A COMMON TRANSIT PROCEDURE"
    with open(sample_path, 'rb') as handle:
        # TODO This does not affect what is loaded
        tmx_file = tmxfile(handle, 'en-GB', 'es-ES')
    first_unit = tmx_file.getunits()[0]
    assert str(first_unit.getid()) == expected_first_id
def convert_Excel2Tmx(self, src, dst):
    """Convert the first worksheet of an Excel workbook into an en -> th TMX.

    Row 1 is treated as a header: a cell whose lower-cased text is ``'en'``
    or ``'th'`` selects the column for that language (defaults: column 1 for
    English, column 2 for Thai). Every following row becomes one translation
    unit. The store is saved to ``dst`` and a confirmation dialog is shown.

    :param src: path of the workbook to read.
    :param dst: path of the TMX file to write.
    """
    src_wb = openpyxl.load_workbook(src)
    src_ws = src_wb.worksheets[0]

    en_col = 1
    th_col = 2
    # Columns are 1-based and max_column is inclusive, so the range must end
    # at max_column + 1 (matching the row loop below); stopping one short
    # never inspected the last header cell.
    for col in range(1, src_ws.max_column + 1):
        header = src_ws.cell(1, col).value
        # Empty or non-text header cells cannot name a language.
        if not isinstance(header, str):
            continue
        cell_value = header.lower()
        if 'en' == cell_value:
            en_col = col
        if 'th' == cell_value:
            th_col = col

    tmx_file = tmxfile()
    for row in range(2, src_ws.max_row + 1):
        tmx_file.addtranslation(src_ws.cell(row, en_col).value, "en",
                                src_ws.cell(row, th_col).value, "th")
    tmx_file.savefile(dst)
    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def concordance_search(tm_objects, searchCon, matchRate, search_lang):
    """Search translation memories for units resembling ``searchCon``.

    The query and every unit source are passed through ``removeStopwords``
    and split into tokens. Each query token contributes 1 when it occurs in
    the source tokens, or the ``textdistance.ratcliff_obershelp`` score of
    its closest source token (difflib cutoff 0.85) when it does not. The sum
    is divided by the longer of the two token lists and expressed as an
    integer percentage.

    :param tm_objects: iterable of objects exposing ``file_url`` (with a
        ``name``), ``s_lang``, ``t_lang`` and ``name``.
    :param searchCon: the text to search for.
    :param matchRate: minimum percentage a unit must reach to be returned.
    :param search_lang: not used by this function.
    :return: list of dicts with keys ``source``, ``target``, ``tm_name`` and
        ``match_rate``, sorted in descending order by ``compare_matchrate``.
    """
    out_sequences = []
    q_tokens = removeStopwords(searchCon).split()

    for tm_object in tm_objects:
        tm_url = os.path.join(settings.MEDIA_ROOT,
                              getattr(tm_object, 'file_url').name)
        tm_s_lang = getattr(tm_object, 's_lang')
        tm_t_lang = getattr(tm_object, 't_lang')
        tm_name = getattr(tm_object, 'name')
        if not os.path.isfile(tm_url):
            continue

        # Context manager so the TM file handle is closed after each memory.
        with open(tm_url, 'rb') as fin:
            tmx_file = tmxfile(fin, tm_s_lang, tm_t_lang)
            for node in tmx_file.unit_iter():
                sequence = node.getsource()
                s_tokens = removeStopwords(sequence).split()

                score = 0
                for q_token in q_tokens:
                    if q_token in s_tokens:
                        # Exact token hit counts in full.
                        score += 1
                        continue
                    matched = difflib.get_close_matches(
                        q_token, s_tokens, n=1, cutoff=0.85)
                    if matched:
                        score += float(textdistance.ratcliff_obershelp(
                            q_token, matched[0]))

                # Both token lists can be empty (e.g. stopwords only); treat
                # that as a 0% match instead of dividing by zero.
                longest = max(len(s_tokens), len(q_tokens))
                average_rate = int(score / longest * 100) if longest else 0

                if average_rate >= matchRate:
                    out_sequences.append({
                        'source': sequence,
                        'target': node.gettarget(),
                        'tm_name': tm_name,
                        'match_rate': average_rate
                    })

    out_sequences.sort(key=compare_matchrate, reverse=True)
    return out_sequences
def po2tmx(posource, sourcelanguage="en", targetlanguage="af", comment=None):
    """Convert PO source text to a TMX store without touching the disk.

    The PO text is UTF-8 encoded into an in-memory file, run through
    ``po2tmx.convertpo`` (forwarding ``comment``) and the ``tmxfile``
    attached to the in-memory output object is returned.
    """
    po_input = BytesIO(posource.encode("utf-8"))
    tmx_output = BytesIO()
    # The store is attached to the output object before the conversion and
    # handed back afterwards; presumably convertpo() fills it in.
    store = tmx.tmxfile(inputfile=None, sourcelanguage=sourcelanguage)
    tmx_output.tmxfile = store
    conversion_options = {
        "templatefile": None,
        "sourcelanguage": sourcelanguage,
        "targetlanguage": targetlanguage,
        "comment": comment,
    }
    po2tmx.convertpo(po_input, tmx_output, **conversion_options)
    return tmx_output.tmxfile
def convert_Tmx2Xliff(self, src, dst):
    """Convert an en -> th TMX file into an XLIFF file.

    Each TMX unit becomes an ``xliffunit``; the XLIFF store is saved to
    ``dst`` and the saved text is then rewritten so that the 1.1 root
    element declaration reads 1.2. A confirmation dialog is shown at the
    end.

    :param src: path of the TMX file to read.
    :param dst: path of the XLIFF file to write.
    """
    xliff_file = xlifffile()
    xliff_file.setsourcelanguage('en')
    xliff_file.settargetlanguage('th')

    with open(src, 'rb') as fin:
        tmx_file = tmxfile(fin, 'en', 'th')
        for node in tmx_file.unit_iter():
            new_node = xliffunit(node.getsource())
            new_node.settarget(node.gettarget())
            xliff_file.addunit(new_node)
    xliff_file.savefile(dst)

    # Patch the version declaration in the saved file. Context managers make
    # sure both handles are closed even if reading or writing fails.
    with open(dst, "r", encoding='utf-8') as fin:
        data = fin.read()
    data = data.replace(
        '<xliff xmlns="urn:oasis:names:tc:xliff:document:1.1" version="1.1">',
        '<xliff xmlns="urn:oasis:names:tc:xliff:document:1.2" version="1.2">')
    with open(dst, 'w', encoding='utf-8') as fout:
        fout.write(data)

    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def convert_Tmx2Excel(self, src, dst):
    """Convert an en -> th TMX file into an Excel workbook.

    If ``dst`` does not exist, a workbook with a single ``transmem`` sheet is
    created first. Row 1 of that sheet receives the headers ``en``/``th`` and
    each TMX unit is written to the following rows (source in column 1,
    target in column 2). A confirmation dialog is shown at the end.

    :param src: path of the TMX file to read.
    :param dst: path of the workbook to write.
    """
    if not os.path.exists(dst):
        dst_wb = openpyxl.Workbook()
        ss_sheet = dst_wb['Sheet']
        ss_sheet.title = 'transmem'
        dst_wb.save(dst)

    dst_wb = openpyxl.load_workbook(dst)
    dst_ws = dst_wb['transmem']

    # The TMX is parsed from the binary handle below. The former extra text
    # read of ``src`` was never used, leaked a file handle and failed on TMX
    # files that are not UTF-8, so it has been dropped.
    dst_ws.cell(1, 1).value = 'en'
    dst_ws.cell(1, 2).value = 'th'

    with open(src, 'rb') as fin:
        tmx_file = tmxfile(fin, 'en', 'th')
        for row, node in enumerate(tmx_file.unit_iter(), start=2):
            dst_ws.cell(row, 1).value = node.getsource()
            dst_ws.cell(row, 2).value = node.gettarget()

    dst_wb.save(dst)
    QMessageBox.information(self, "Information",
                            "Converting was done successfully")
def get_translation(translation_file):
    """Extract the translation segments from a user-specified tmx file.

    If the file does not exist the error is printed and the program exits
    via ``sys.exit()``.

    :param translation_file: path of the tmx file to read.
    :return: list of ``Segment`` objects, one per unit, each built from the
        unit's source and target plus two empty dicts.
    """
    try:
        with open(translation_file, 'rb') as handle:
            tmx_file = tmxfile(handle)
    except FileNotFoundError as fnf_error:
        print(fnf_error)
        sys.exit()

    # Only reached when the file was opened and parsed successfully.
    return [
        Segment(node.source, node.target, {}, {})
        for node in tmx_file.unit_iter()
    ]
def load_sentence(*typ):
    """Build unit iterators for the bundled en/cn TMX corpora.

    :param typ: corpus names to load (``'talks'`` and/or ``'dictexams'``);
        with no arguments every corpus is loaded.
    :return: dict mapping each selected corpus name to one iterator that
        chains the units of all of that corpus's files.
    """
    files = {
        'talks': [
            '日常口语_20190906111009_1.tmx',
            '日常口语_20190906111009_2.tmx',
            '日常口语_20190906111009_3.tmx',
        ],
        'dictexams': [
            '词典例句汇集1.tmx',
            '词典例句汇集3.tmx',
            '词典例句汇集5.tmx',
            '词典例句汇集7.tmx',
            '词典例句汇集2.tmx',
            '词典例句汇集4.tmx',
            '词典例句汇集6.tmx',
            '词典例句汇集8.tmx',
        ],
    }

    iters = {}
    for corpus, filenames in files.items():
        # An explicit selection skips every corpus that was not asked for.
        if typ and corpus not in typ:
            continue
        unit_iters = []
        for filename in filenames:
            with open(data_file(filename), 'rb') as fin:
                parsed = tmxfile(fin, 'en', 'cn')
                unit_iters.append(parsed.unit_iter())
        iters[corpus] = chain(*unit_iters)
    return iters
def export(self):
    """Export every translated unit of the context as a zipped TMX file.

    Units in state ``TRANSLATED`` from all live stores are collected into a
    TMX store, serialised, and written as the member ``self.filename`` of a
    zip archive at ``self.abs_filepath``. The exported revision is updated
    afterwards.

    :return: ``self.abs_filepath``.
    """
    src_code = self.context.project.source_language.code
    tgt_code = self.context.language.code

    if not os.path.exists(self.directory):
        os.makedirs(self.directory)

    memory = tmx.tmxfile()
    for store in self.context.stores.live().iterator():
        translated_units = store.units.filter(state=TRANSLATED)
        for unit in translated_units:
            memory.addtranslation(
                unit.source,
                src_code,
                unit.target,
                tgt_code,
                unit.developer_comment,
            )

    buffer = BytesIO()
    memory.serialize(buffer)
    payload = buffer.getvalue()

    with open(self.abs_filepath, "wb") as raw, ZipFile(raw, "w") as archive:
        archive.writestr(self.filename, payload)

    self.update_exported_revision()
    return self.abs_filepath
def tmxparse(self, tmxsource):
    """helper that parses tmx source without requiring files"""
    dummyfile = wStringIO.StringIO(tmxsource)
    # Call form of print: with a single argument it behaves the same on
    # Python 2 and is valid syntax on Python 3, unlike the print statement.
    print(tmxsource)
    tmxfile = tmx.tmxfile(dummyfile)
    return tmxfile
def get_storage(self):
    """Return a new ``tmxfile`` created with no arguments."""
    storage = tmxfile()
    return storage
def tmxparse(self, tmxsource):
    """Parse in-memory TMX source and return the resulting ``tmxfile``.

    The source is printed before it is parsed.
    """
    in_memory = wStringIO.StringIO(tmxsource)
    print(tmxsource)
    return tmx.tmxfile(in_memory)
print(test.head()) test_source_sentences = test["source"].astype(str).tolist() test_target_sentences = test["target"].astype(str).tolist() source_sentences = list() target_sentences = list() retrived_target_sentences = list() bleu_scores = list() meteor_scores = list() tmx_file_path = "result/ES-ES/unapproved.tmx" with open(tmx_file_path, 'rb') as fin: tmx_file = tmxfile(fin, source, target) i = 0 for node in tmx_file.unit_iter(): i = i + 1 source_sentence = node.getsource().strip() retrieved_target_sentence = node.gettarget().strip() index = test_source_sentences.index( source_sentence ) if source_sentence in test_source_sentences else -1 if index > -1: target_sentence = test_target_sentences[index] bleu_score = calculate_bleu_score(target_sentence, retrieved_target_sentence) meteor_score = calculate_meteor_score(target_sentence,
def test_translate(self):
    """Check ``translate()`` on an empty store and after adding one pair."""
    store = tmx.tmxfile()
    # Nothing has been added yet, so the lookup must come back empty.
    assert store.translate("Anything") is None

    source = "A string of characters"
    target = "'n String karakters"
    store.addtranslation(source, "en", target, "af")
    assert store.translate(source) == target
def inittmx(inputfile, columnorder=None):
    """Create a ``tmx.tmxfile`` from ``inputfile``.

    ``columnorder`` is accepted but not used by this function.
    """
    store = tmx.tmxfile(inputfile)
    return store
file_as_dict[key] = value return file_as_dict if __name__ == '__main__': #################################################################################################################### # USE THIS BLOCK TO PREPROCESS THE RAW tmx-files AND CREATE THIS FILE: ID \t italianSent \t germanSent opensub_it_de = dict() ID = 9000000 # the starting ID - this was chosen as all sentence IDs from tatoeba do NOT exceed 9,000,000 # this way opensubtitle senences are identifiable by an ID <= 9,000,000 in_path = "/home/pia/cluwll/de-it.tmx" with open(in_path, 'rb') as fin: tmx_file = tmxfile(fin, 'de', 'it') for node in tmx_file.unit_iter(): opensub_it_de[ID] = [node.gettarget(), node.getsource()] ID += 1 print(len(opensub_it_de)) print("done reading tmx - dict was generated") out_path = "/home/pia/cluwll/opensubt_id_it_de.txt" with open(out_path, 'wt') as fo: csv_writer = csv.writer(fo, delimiter='\t') for id, sent in opensub_it_de.items(): csv_writer.writerow([id, sent[0], sent[1]]) print("finished writing to file: ", out_path) ####################################################################################################################