def createBitext(pairs):
    """Build an NLTK bitext from (source, target) sentence-string pairs.

    Each pair's first two elements are whitespace-tokenized and wrapped
    in an AlignedSent (source words first, target words second).

    :param pairs: iterable of indexable pairs of sentence strings
    :return: list of AlignedSent objects, one per input pair
    """
    # List comprehension replaces the manual append loop (same order, same result).
    return [AlignedSent(p[0].split(), p[1].split()) for p in pairs]
def bitext_from_documents(db, doc1, lang1, doc2, lang2):
    """Sentence-align two documents stored in the DB and return a bitext.

    Paragraphs are matched first (Gale-Church style, via align_blocks on
    paragraph lengths), then sentences are matched within each aligned
    paragraph pair. The collected sentence-ID pairs are resolved to text
    through the DB and wrapped as AlignedSent objects.

    :param db: database connection/path handed to izwiDB
    :param doc1, lang1: name and language of the source document
    :param doc2, lang2: name and language of the target document
    :return: list of AlignedSent built from the aligned sentence pairs
    """
    store = database.izwiDB(db, "")
    id1 = store.get_docid(doc1, lang1)
    id2 = store.get_docid(doc2, lang2)
    paras1 = store.get_all_paragraph_lengths(id1)
    paras2 = store.get_all_paragraph_lengths(id2)

    sent_id_pairs = []
    for i, j in align_blocks([p[1] for p in paras1],
                             [p[1] for p in paras2]):
        # align_blocks yields list positions; map them back to DB paragraph IDs.
        pid1 = paras1[i][0]
        pid2 = paras2[j][0]
        sents1 = store.get_sentence_lengths(id1, pid1)
        sents2 = store.get_sentence_lengths(id2, pid2)
        for a, b in align_blocks([s[1] for s in sents1],
                                 [s[1] for s in sents2]):
            # Same position-to-ID mapping at the sentence level.
            sent_id_pairs.append((sents1[a][0], sents2[b][0]))

    return [AlignedSent(src.split(), tgt.split())
            for src, tgt in store.get_sentence_pairs(sent_id_pairs)]
from nltk.translate.ibm1 import IBMModel1
from nltk.translate.api import AlignedSent

# Tiny toy corpus for demonstrating IBM Model 1 lexical translation training.
Foreign = ['das Haus', 'das Buch', 'ein Buch']
English = ['the house', 'the book', 'a book']

# Tokenize each parallel pair into an AlignedSent.
text = [AlignedSent(f.split(), e.split()) for f, e in zip(Foreign, English)]

# A single EM iteration is enough to show non-trivial probabilities here.
ibm1 = IBMModel1(text, 1)

# Dump the learned translation table: source word, target word, probability.
for src_word, targets in ibm1.translation_table.items():
    for tgt_word, prob in targets.items():
        print(src_word, '\t', tgt_word, '\t', prob)
vocab[n] = toks[1] # NULL word index is 0 vocab[0] = None return vocab # takes a single file (tab-separated parallel sentences) as input from stdin if __name__ == '__main__': bitext = [] with sys.stdin as f: for line in f: line = line.rstrip() paral = re.split('\t', line) srcwords = re.split(' ', paral[0]) tgtwords = re.split(' ', paral[1]) bitext.append(AlignedSent(srcwords, tgtwords)) if len(sys.argv) > 1: mywa = wordAligner(bitext, 5) ttablefile = sys.argv[1] atablefile = sys.argv[2] mywa.define_model_from_tables(ttablefile, atablefile) else: mywa = wordAligner(bitext, 5) mywa.train_model() for sentpair in mywa.aligned_corpus: aligninfo = mywa.align(sentpair) #print str(sentpair.words)+"\t"+str(sentpair.mots)+"\t"+str(sentpair.alignment) print str(aligninfo.src_sentence) + "\t" + str( aligninfo.trg_sentence) + "\t" + str(aligninfo.alignment)
def get_phrase_table(self, mybwa, sent_bitext, src, tgt):
    """Word-align each sentence pair in both directions, symmetrize with
    grow-diag-final-and (GDFA), extract phrases, and return them as a
    mapping from source token range to the set of target token ranges.

    NOTE(review): Python 2 code (print statements, .encode in log concat).

    :param mybwa: bilingual word-alignment holder exposing wamodel and
        reverse_wamodel, each with an align(sentence_pair) method
    :param sent_bitext: sentence-aligned bitext exposing bi_sent and
        get_bitext()
    :param src, tgt: language identifiers (unused in this body)
    :return: dict mapping phrase-pair key pp[0] -> set of pp[1] values
        as produced by self.extract_phrase_table
    """
    # align each document at sentence, then word level
    # clearly not optimal since word alignment would benefit from training on all the new data
    # but that's a beginning
    if debug:
        for b in sent_bitext.bi_sent:
            print str(b[0].offset) + "\t" + str(b[1].offset)
    # from there, we can take the symmetrized alignments instead of a unidirectionnal one
    PT = []
    logging.debug("PHRASE LEVEL ALIGNMENT")
    sentence_count = 0
    bitext = sent_bitext.get_bitext()
    for bisent, sentpair in zip(sent_bitext.bi_sent, bitext):
        #print "SENTPAIR ="+str(sentpair)
        # Forward alignment; skip the pair on any failure (best-effort policy).
        try:
            aligninfo = mybwa.wamodel.align(sentpair)
        except Exception as e:
            logging.warning("FAILED WORD ALIGNMENT: " + str(e) +
                            " FOR SENTENCE PAIR" + str(sentpair))
            continue
        # Reverse alignment on the swapped (target, source) pair.
        inv_sentpair = AlignedSent(sentpair.mots, sentpair.words)
        try:
            raligninfo = mybwa.reverse_wamodel.align(inv_sentpair)
        except Exception as e:
            logging.warning("FAILED REVERSE WORD ALIGNMENT: " + str(e) +
                            " FOR SENTENCE PAIR" + str(inv_sentpair))
            continue
        #print "aligninfo::"+str(aligninfo.src_sentence)+"\t"+str(aligninfo.trg_sentence)+"\t"+str(aligninfo.alignment)
        #print "raligninfo::"+str(raligninfo.src_sentence)+"\t"+str(raligninfo.trg_sentence)+"\t"+str(raligninfo.alignment)
        # Render both alignments in Moses "i-j" format for the GDFA wrapper.
        f2e = ["%d-%d" % x for x in enumerate(aligninfo.alignment)]
        e2f = ["%d-%d" % x for x in enumerate(raligninfo.alignment)]
        srclen = len(aligninfo.src_sentence)
        tgtlen = len(aligninfo.trg_sentence)
        f2e_str = ' '.join(f2e)
        e2f_str = ' '.join(e2f)
        logging.debug("SRC LEN : " + str(srclen))
        logging.debug("TGT LEN : " + str(tgtlen))
        logging.debug("WORD ALIGNMENT E2F : " + e2f_str)
        logging.debug("WORD ALIGNMENT F2E : " + f2e_str)
        try:
            # conventions are inverted
            gdfa = self.gdfa_wrapper(tgtlen, srclen, f2e_str, e2f_str)
            logging.debug("BISENT =" + bisent[0].text.encode('utf-8') +
                          " --> " + bisent[1].text.encode('utf-8') +
                          " ::: GDFA: srclen=" + str(srclen) +
                          ", tgtlen=" + str(tgtlen) +
                          ", RESULT = " + str(gdfa))
        except timeout.TimeoutError:
            logging.warning("TIMEOUT in GDFA")
            logging.debug("GDFA ARGUMENTS WERE: srclen=" + str(srclen) +
                          ", tgtlen=" + str(tgtlen) +
                          " E2F=" + e2f_str + " F2E=" + f2e_str)
            continue
        # GDFA indices are 1-based with 0 reserved for the NULL word;
        # shift to 0-based and drop NULL links.
        alignment_without_null_word = set([])
        for i, j in gdfa:
            if i > 0 and j > 0:
                alignment_without_null_word.add((i - 1, j - 1))
        try:
            PT.extend(
                self.extract_phrase_table(bisent,
                                          alignment_without_null_word))
        except timeout.TimeoutError:
            logging.warning("TIMEOUT in extract phrase table")
            continue
        sentence_count += 1
    # store in a dictionary of dictionaries...of set
    logging.debug("STORING TOKEN RANGE MAPPINGS")
    dictPT = {}
    for pp in PT:
        if pp[0] not in dictPT:
            dictPT[pp[0]] = set([])
        dictPT[pp[0]].add(pp[1])
    return dictPT
def get_inverted_bitext(self):
    """Return the bitext with translation direction reversed.

    For every (src, tgt) pair in self.bi_sent, builds an AlignedSent with
    the target tokens first and the source tokens second.

    :return: list of AlignedSent, one per pair in self.bi_sent, same order
    """
    # Comprehension replaces the manual append loop (identical output).
    return [AlignedSent(pair[1].tokens, pair[0].tokens)
            for pair in self.bi_sent]
# (Earlier val/test split writes are disabled; kept for reference.)
# with open(path+'ru-val.txt', 'wb') as f:
#     f.write(join(ru_val))
# ru_test = russian[int(0.995*len(russian)):]
# with open(path+'ru-test.txt', 'wb') as f:
#     f.write(join(ru_test))
# print('Done.')

print('Length of cyrillic', len(cyrillic))
print('Length of russian', len(russian))

# Pair the two corpora sentence-by-sentence: russian is the source side,
# cyrillic the target side of each AlignedSent.
aligned_text = [AlignedSent(ru, cy) for ru, cy in zip(russian, cyrillic)]

print(" \nTraining SMT model")
# 10 EM iterations of IBM Model 2 over the paired corpus.
ibm_model = IBMModel2(aligned_text, 10)
print("Training complete")

print('Saving Model...')
# BUG FIX: pickle.dump writes bytes, so the file must be opened in binary
# mode ('wb'); text mode ('w') raises TypeError under Python 3.
with open(path + 'translation model.pkl', 'wb') as tr_io:
    pickle.dump(ibm_model, tr_io)
print('Done.')

# Sample one random pair so the model's output can be compared against
# the actual translation later.
n_random = random.randint(0, len(cyrillic) - 1)
russian_sentence = russian[n_random]
cyrillic_actual_translation = cyrillic[n_random]