def createBitext(pairs):
    """Build an NLTK bitext from (source, target) sentence-string pairs.

    Each pair's first two elements are whitespace-tokenized and wrapped
    in an AlignedSent (source words first, target words second).

    :param pairs: iterable of indexable pairs of sentence strings
    :return: list of AlignedSent objects, one per input pair
    """
    # List comprehension replaces the manual append loop (same order, same result).
    return [AlignedSent(p[0].split(), p[1].split()) for p in pairs]
def bitext_from_documents(db, doc1, lang1, doc2, lang2):
    """Sentence-align two documents stored in the DB and return a bitext.

    Paragraphs are matched first (Gale-Church style, via align_blocks on
    paragraph lengths), then sentences are matched within each aligned
    paragraph pair. The collected sentence-ID pairs are resolved to text
    through the DB and wrapped as AlignedSent objects.

    :param db: database connection/path handed to izwiDB
    :param doc1, lang1: name and language of the source document
    :param doc2, lang2: name and language of the target document
    :return: list of AlignedSent built from the aligned sentence pairs
    """
    store = database.izwiDB(db, "")
    id1 = store.get_docid(doc1, lang1)
    id2 = store.get_docid(doc2, lang2)
    paras1 = store.get_all_paragraph_lengths(id1)
    paras2 = store.get_all_paragraph_lengths(id2)

    sent_id_pairs = []
    for i, j in align_blocks([p[1] for p in paras1],
                             [p[1] for p in paras2]):
        # align_blocks yields list positions; map them back to DB paragraph IDs.
        pid1 = paras1[i][0]
        pid2 = paras2[j][0]
        sents1 = store.get_sentence_lengths(id1, pid1)
        sents2 = store.get_sentence_lengths(id2, pid2)
        for a, b in align_blocks([s[1] for s in sents1],
                                 [s[1] for s in sents2]):
            # Same position-to-ID mapping at the sentence level.
            sent_id_pairs.append((sents1[a][0], sents2[b][0]))

    return [AlignedSent(src.split(), tgt.split())
            for src, tgt in store.get_sentence_pairs(sent_id_pairs)]
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.api import AlignedSent

# Tiny toy corpus for demonstrating IBM Model 1 lexical translation training.
Foreign = ['das Haus', 'das Buch', 'ein Buch']
English = ['the house', 'the book', 'a book']

# Tokenize each parallel pair into an AlignedSent.
text = [AlignedSent(f.split(), e.split()) for f, e in zip(Foreign, English)]

# A single EM iteration is enough to show non-trivial probabilities here.
ibm1 = IBMModel1(text, 1)

# Dump the learned translation table: source word, target word, probability.
for src_word, targets in ibm1.translation_table.items():
    for tgt_word, prob in targets.items():
        print(src_word, '\t', tgt_word, '\t', prob)
vocab[n] = toks[1] # NULL word index is 0 vocab[0] = None return vocab # takes a single file (tab-separated parallel sentences) as input from stdin if __name__ == '__main__': bitext = [] with sys.stdin as f: for line in f: line = line.rstrip() paral = re.split('\t', line) srcwords = re.split(' ', paral[0]) tgtwords = re.split(' ', paral[1]) bitext.append(AlignedSent(srcwords, tgtwords)) if len(sys.argv) > 1: mywa = wordAligner(bitext, 5) ttablefile = sys.argv[1] atablefile = sys.argv[2] mywa.define_model_from_tables(ttablefile, atablefile) else: mywa = wordAligner(bitext, 5) mywa.train_model() for sentpair in mywa.aligned_corpus: aligninfo = mywa.align(sentpair) #print str(sentpair.words)+"\t"+str(sentpair.mots)+"\t"+str(sentpair.alignment) print str(aligninfo.src_sentence) + "\t" + str( aligninfo.trg_sentence) + "\t" + str(aligninfo.alignment)
def get_phrase_table(self, mybwa, sent_bitext, src, tgt):
    """Word-align each sentence pair in both directions, symmetrize with
    grow-diag-final-and (GDFA), extract phrases, and return them as a
    mapping from source token range to the set of target token ranges.

    NOTE(review): Python 2 code (print statements, .encode in log concat).

    :param mybwa: bilingual word-alignment holder exposing wamodel and
        reverse_wamodel, each with an align(sentence_pair) method
    :param sent_bitext: sentence-aligned bitext exposing bi_sent and
        get_bitext()
    :param src, tgt: language identifiers (unused in this body)
    :return: dict mapping phrase-pair key pp[0] -> set of pp[1] values
        as produced by self.extract_phrase_table
    """
    # align each document at sentence, then word level
    # clearly not optimal since word alignment would benefit from training on all the new data
    # but that's a beginning
    if debug:
        for b in sent_bitext.bi_sent:
            print str(b[0].offset) + "\t" + str(b[1].offset)
    # from there, we can take the symmetrized alignments instead of a unidirectionnal one
    PT = []
    logging.debug("PHRASE LEVEL ALIGNMENT")
    sentence_count = 0
    bitext = sent_bitext.get_bitext()
    for bisent, sentpair in zip(sent_bitext.bi_sent, bitext):
        #print "SENTPAIR ="+str(sentpair)
        # Forward alignment; skip the pair on any failure (best-effort policy).
        try:
            aligninfo = mybwa.wamodel.align(sentpair)
        except Exception as e:
            logging.warning("FAILED WORD ALIGNMENT: " + str(e) +
                            " FOR SENTENCE PAIR" + str(sentpair))
            continue
        # Reverse alignment on the swapped (target, source) pair.
        inv_sentpair = AlignedSent(sentpair.mots, sentpair.words)
        try:
            raligninfo = mybwa.reverse_wamodel.align(inv_sentpair)
        except Exception as e:
            logging.warning("FAILED REVERSE WORD ALIGNMENT: " + str(e) +
                            " FOR SENTENCE PAIR" + str(inv_sentpair))
            continue
        #print "aligninfo::"+str(aligninfo.src_sentence)+"\t"+str(aligninfo.trg_sentence)+"\t"+str(aligninfo.alignment)
        #print "raligninfo::"+str(raligninfo.src_sentence)+"\t"+str(raligninfo.trg_sentence)+"\t"+str(raligninfo.alignment)
        # Render both alignments in Moses "i-j" format for the GDFA wrapper.
        f2e = ["%d-%d" % x for x in enumerate(aligninfo.alignment)]
        e2f = ["%d-%d" % x for x in enumerate(raligninfo.alignment)]
        srclen = len(aligninfo.src_sentence)
        tgtlen = len(aligninfo.trg_sentence)
        f2e_str = ' '.join(f2e)
        e2f_str = ' '.join(e2f)
        logging.debug("SRC LEN : " + str(srclen))
        logging.debug("TGT LEN : " + str(tgtlen))
        logging.debug("WORD ALIGNMENT E2F : " + e2f_str)
        logging.debug("WORD ALIGNMENT F2E : " + f2e_str)
        try:
            # conventions are inverted
            gdfa = self.gdfa_wrapper(tgtlen, srclen, f2e_str, e2f_str)
            logging.debug("BISENT =" + bisent[0].text.encode('utf-8') +
                          " --> " + bisent[1].text.encode('utf-8') +
                          " ::: GDFA: srclen=" + str(srclen) +
                          ", tgtlen=" + str(tgtlen) +
                          ", RESULT = " + str(gdfa))
        except timeout.TimeoutError:
            logging.warning("TIMEOUT in GDFA")
            logging.debug("GDFA ARGUMENTS WERE: srclen=" + str(srclen) +
                          ", tgtlen=" + str(tgtlen) +
                          " E2F=" + e2f_str + " F2E=" + f2e_str)
            continue
        # GDFA indices are 1-based with 0 reserved for the NULL word;
        # shift to 0-based and drop NULL links.
        alignment_without_null_word = set([])
        for i, j in gdfa:
            if i > 0 and j > 0:
                alignment_without_null_word.add((i - 1, j - 1))
        try:
            PT.extend(
                self.extract_phrase_table(bisent,
                                          alignment_without_null_word))
        except timeout.TimeoutError:
            logging.warning("TIMEOUT in extract phrase table")
            continue
        sentence_count += 1
    # store in a dictionary of dictionaries...of set
    logging.debug("STORING TOKEN RANGE MAPPINGS")
    dictPT = {}
    for pp in PT:
        if pp[0] not in dictPT:
            dictPT[pp[0]] = set([])
        dictPT[pp[0]].add(pp[1])
    return dictPT
def get_inverted_bitext(self):
    """Return the bitext with translation direction reversed.

    For every (src, tgt) pair in self.bi_sent, builds an AlignedSent with
    the target tokens first and the source tokens second.

    :return: list of AlignedSent, one per pair in self.bi_sent, same order
    """
    # Comprehension replaces the manual append loop (identical output).
    return [AlignedSent(pair[1].tokens, pair[0].tokens)
            for pair in self.bi_sent]
# (Earlier val/test split writes are disabled; kept for reference.)
# with open(path+'ru-val.txt', 'wb') as f:
#     f.write(join(ru_val))
# ru_test = russian[int(0.995*len(russian)):]
# with open(path+'ru-test.txt', 'wb') as f:
#     f.write(join(ru_test))
# print('Done.')

print('Length of cyrillic', len(cyrillic))
print('Length of russian', len(russian))

# Pair the two corpora sentence-by-sentence: russian is the source side,
# cyrillic the target side of each AlignedSent.
aligned_text = [AlignedSent(ru, cy) for ru, cy in zip(russian, cyrillic)]

print(" \nTraining SMT model")
# 10 EM iterations of IBM Model 2 over the paired corpus.
ibm_model = IBMModel2(aligned_text, 10)
print("Training complete")

print('Saving Model...')
# BUG FIX: pickle.dump writes bytes, so the file must be opened in binary
# mode ('wb'); text mode ('w') raises TypeError under Python 3.
with open(path + 'translation model.pkl', 'wb') as tr_io:
    pickle.dump(ibm_model, tr_io)
print('Done.')

# Sample one random pair so the model's output can be compared against
# the actual translation later.
n_random = random.randint(0, len(cyrillic) - 1)
russian_sentence = russian[n_random]
cyrillic_actual_translation = cyrillic[n_random]