def createBitext(pairs):
    """Turn raw sentence pairs into a list of ``AlignedSent`` objects.

    Each item of ``pairs`` is indexed at 0 and 1; both strings are split on
    whitespace and the resulting token lists are passed, in that order, to
    ``AlignedSent``.

    Returns:
        A list with one ``AlignedSent`` per item of ``pairs`` (empty when
        ``pairs`` is empty).
    """
    return [
        AlignedSent(item[0].split(), item[1].split())
        for item in pairs
    ]
Пример #2
0
def bitext_from_documents(db, doc1, lang1, doc2, lang2):
    """Build a sentence-aligned bitext for two documents stored in ``db``.

    Paragraphs of the two documents are aligned first with ``align_blocks``,
    then the sentences inside every aligned paragraph pair are aligned the
    same way. The texts of the aligned sentences are finally fetched from
    the database and wrapped as ``AlignedSent`` objects.

    Args:
        db: passed as first argument to ``database.izwiDB``.
        doc1, lang1: identify the first document via ``get_docid``.
        doc2, lang2: identify the second document via ``get_docid``.

    Returns:
        A list of ``AlignedSent``, one per pair yielded by
        ``get_sentence_pairs``.
    """
    store = database.izwiDB(db, "")
    first_doc = store.get_docid(doc1, lang1)
    second_doc = store.get_docid(doc2, lang2)
    para_info1 = store.get_all_paragraph_lengths(first_doc)
    para_info2 = store.get_all_paragraph_lengths(second_doc)

    sentence_id_pairs = []
    para_alignment = align_blocks([entry[1] for entry in para_info1],
                                  [entry[1] for entry in para_info2])
    for para_idx1, para_idx2 in para_alignment:
        # align_blocks works on list positions; element 0 of the matching
        # entry is the paragraph ID used by the DB.
        para_id1 = para_info1[para_idx1][0]
        para_id2 = para_info2[para_idx2][0]
        sent_info1 = store.get_sentence_lengths(first_doc, para_id1)
        sent_info2 = store.get_sentence_lengths(second_doc, para_id2)
        sent_alignment = align_blocks([entry[1] for entry in sent_info1],
                                      [entry[1] for entry in sent_info2])
        for sent_idx1, sent_idx2 in sent_alignment:
            # Same position -> DB ID conversion, this time for sentences.
            sentence_id_pairs.append(
                (sent_info1[sent_idx1][0], sent_info2[sent_idx2][0]))

    aligned = []
    for source_text, target_text in store.get_sentence_pairs(
            sentence_id_pairs):
        aligned.append(AlignedSent(source_text.split(), target_text.split()))
    return aligned
Пример #3
0
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.api import AlignedSent

# english = 'let ud all strive to live'.split()
# french = 'employons-nous tous à vivre et à laisser vivre'.split()

# Toy parallel corpus: entry k of Foreign is paired with entry k of English.
Foreign = ['das Haus', 'das Buch', 'ein Buch']
English = ['the house', 'the book', 'a book']

# One AlignedSent per sentence pair, both sides split on whitespace.
text = [
    AlignedSent(foreign_sent.split(), english_sent.split())
    for foreign_sent, english_sent in zip(Foreign, English)
]

# Train IBM Model 1 on the toy corpus (second argument: iteration count).
ibm1 = IBMModel1(text, 1)

# Dump the nested translation table as "outer key <TAB> inner key <TAB> value".
for outer_word, row in ibm1.translation_table.items():
    for inner_word, prob in row.items():
        print(outer_word, '\t', inner_word, '\t', prob)
Пример #4
0
                vocab[n] = toks[1]
            # NULL word index is 0
            vocab[0] = None
        return vocab


# Script entry point.
#
# Reads a single file of tab-separated parallel sentences from stdin (source
# sentence, TAB, target sentence; tokens separated by single spaces), then
# either loads a model from the two table files given on the command line or
# trains one, and prints the alignment of every sentence pair.
#
# Usage:
#     script < bitext.tsv                        # train a model
#     script TTABLE_FILE ATABLE_FILE < bitext.tsv  # load a model from tables
if __name__ == '__main__':
    # Both table files are required together. Check this before consuming
    # stdin so a lone argument fails immediately with a readable message
    # instead of an IndexError on sys.argv[2] after all input has been read.
    if len(sys.argv) == 2:
        sys.exit("usage: " + sys.argv[0] + " [TTABLE_FILE ATABLE_FILE]")

    bitext = []
    with sys.stdin as f:
        for line in f:
            line = line.rstrip()
            paral = line.split('\t')
            if len(paral) < 2:
                # Blank line or missing target side: nothing to align.
                sys.stderr.write(
                    "skipping line without tab-separated target: %r\n" % line)
                continue
            # Split on single spaces (not arbitrary whitespace), exactly as
            # the tokenised input is expected to be delimited.
            srcwords = paral[0].split(' ')
            tgtwords = paral[1].split(' ')
            bitext.append(AlignedSent(srcwords, tgtwords))

    mywa = wordAligner(bitext, 5)
    if len(sys.argv) > 2:
        ttablefile = sys.argv[1]
        atablefile = sys.argv[2]
        mywa.define_model_from_tables(ttablefile, atablefile)
    else:
        mywa.train_model()

    for sentpair in mywa.aligned_corpus:
        aligninfo = mywa.align(sentpair)
        # Single parenthesised argument: prints the same text whether print
        # is a statement or a function.
        print(str(aligninfo.src_sentence) + "\t" +
              str(aligninfo.trg_sentence) + "\t" +
              str(aligninfo.alignment))
Пример #5
0
    def get_phrase_table(self, mybwa, sent_bitext, src, tgt):
        """Extract a phrase table from the sentence pairs of ``sent_bitext``.

        Every sentence pair is word-aligned in both directions, the two
        alignments are merged through ``self.gdfa_wrapper`` and the merged
        alignment is handed to ``self.extract_phrase_table``. A pair is
        logged and skipped as soon as one step fails: any exception for the
        two word alignments, ``timeout.TimeoutError`` for the merge and the
        phrase extraction.

        Word alignment is run pair by pair with the models held by ``mybwa``.
        This is known to be sub-optimal (the aligner would benefit from being
        trained on all the new data) but serves as a starting point.

        Args:
            mybwa: object exposing ``wamodel`` and ``reverse_wamodel``, each
                providing ``align(sentpair)``.
            sent_bitext: object exposing ``bi_sent`` and ``get_bitext()``;
                the two are iterated in lockstep.
            src: not used by this method.
            tgt: not used by this method.

        Returns:
            A dict mapping ``pp[0]`` to the set of all ``pp[1]`` values, over
            every extracted phrase pair ``pp``.
        """
        if debug:
            for pair in sent_bitext.bi_sent:
                print(str(pair[0].offset) + "\t" + str(pair[1].offset))

        # Both directions are aligned so that the symmetrized alignment can
        # be used instead of a unidirectional one.
        PT = []
        logging.debug("PHRASE LEVEL ALIGNMENT")

        bitext = sent_bitext.get_bitext()
        for bisent, sentpair in zip(sent_bitext.bi_sent, bitext):
            try:
                aligninfo = mybwa.wamodel.align(sentpair)
            except Exception as e:
                logging.warning("FAILED WORD ALIGNMENT: " + str(e) +
                                " FOR SENTENCE PAIR" + str(sentpair))
                continue

            # Same sentence pair with the two sides swapped, for the
            # reverse-direction model.
            inv_sentpair = AlignedSent(sentpair.mots, sentpair.words)
            try:
                raligninfo = mybwa.reverse_wamodel.align(inv_sentpair)
            except Exception as e:
                logging.warning("FAILED REVERSE WORD ALIGNMENT: " + str(e) +
                                " FOR SENTENCE PAIR" + str(inv_sentpair))
                continue

            # Serialize each alignment as space-separated "position-value"
            # tokens, the string form passed to gdfa_wrapper.
            f2e_str = ' '.join(
                "%d-%d" % x for x in enumerate(aligninfo.alignment))
            e2f_str = ' '.join(
                "%d-%d" % x for x in enumerate(raligninfo.alignment))

            srclen = len(aligninfo.src_sentence)
            tgtlen = len(aligninfo.trg_sentence)

            logging.debug("SRC LEN : " + str(srclen))
            logging.debug("TGT LEN : " + str(tgtlen))
            logging.debug("WORD ALIGNMENT E2F : " + e2f_str)
            logging.debug("WORD ALIGNMENT F2E : " + f2e_str)

            try:
                # conventions are inverted: target length goes first
                gdfa = self.gdfa_wrapper(tgtlen, srclen, f2e_str, e2f_str)
                logging.debug("BISENT =" + bisent[0].text.encode('utf-8') +
                              " --> " + bisent[1].text.encode('utf-8') +
                              " ::: GDFA: srclen=" + str(srclen) +
                              ", tgtlen=" + str(tgtlen) + ", RESULT = " +
                              str(gdfa))
            except timeout.TimeoutError:
                logging.warning("TIMEOUT in GDFA")
                logging.debug("GDFA ARGUMENTS WERE: srclen=" + str(srclen) +
                              ", tgtlen=" + str(tgtlen) + " E2F=" + e2f_str +
                              " F2E=" + f2e_str)
                continue

            # Drop every link that touches index 0 (the NULL word, going by
            # the variable name) and shift the remaining indices down by one.
            alignment_without_null_word = {
                (i - 1, j - 1) for i, j in gdfa if i > 0 and j > 0
            }
            try:
                PT.extend(
                    self.extract_phrase_table(bisent,
                                              alignment_without_null_word))
            except timeout.TimeoutError:
                logging.warning("TIMEOUT in extract phrase table")
                continue

        # Group the extracted pairs: first element -> set of second elements.
        logging.debug("STORING TOKEN RANGE MAPPINGS")
        dictPT = {}
        for pp in PT:
            dictPT.setdefault(pp[0], set()).add(pp[1])
        return dictPT
Пример #6
0
 def get_inverted_bitext(self):
     """Return the corpus as ``AlignedSent`` pairs with both sides swapped.

     For every entry of ``self.bi_sent`` the tokens of element 1 become the
     first ``AlignedSent`` argument and the tokens of element 0 the second.
     """
     return [
         AlignedSent(entry[1].tokens, entry[0].tokens)
         for entry in self.bi_sent
     ]
# with open(path+'ru-val.txt', 'wb') as f:
#     f.write(join(ru_val))
#
# ru_test = russian[int(0.995*len(russian)):]
# with open(path+'ru-test.txt', 'wb') as f:
#     f.write(join(ru_test))
#
# print('Done.')

print('Length of cyrillic', len(cyrillic))
print('Length of russian', len(russian))

# Pair entry k of `russian` with entry k of `cyrillic`. Indexing `russian`
# explicitly (rather than zipping) keeps a length mismatch loud: a shorter
# `russian` still raises IndexError instead of silently truncating.
aligned_text = [
    AlignedSent(russian[idx], cyrillic_sentence)
    for idx, cyrillic_sentence in enumerate(cyrillic)
]

print(" \nTraining SMT model")
ibm_model = IBMModel2(aligned_text, 10)
print("Training complete")

print('Saving Model...')
# pickle writes bytes, so the file must be opened in binary mode; text mode
# ('w') makes pickle.dump raise TypeError on Python 3.
with open(path + 'translation model.pkl', 'wb') as tr_io:
    pickle.dump(ibm_model, tr_io)
print('Done.')

# Pick one random sentence pair (same index on both sides) as a sample.
n_random = random.randint(0, len(cyrillic) - 1)
russian_sentence = russian[n_random]
cyrillic_actual_translation = cyrillic[n_random]