示例#1
0
def symmetrization(es, fs, corpus):
    '''
    Align es and fs in both directions and combine the two results.

    corpus
        training data for translation from fs to es; it is handed to
        ibmmodel2._train as given, and once more with the fields of
        every entry reversed for the opposite direction
    es, fs
        the two sentences to align (presumably token sequences --
        confirm against ibmmodel2.viterbi_alignment)
    return
        the result of alignment(es, fs, e2f, f2e), described by the
        original author as the alignment **from fs to es**
    '''
    # fs -> es direction: train on the corpus exactly as supplied.
    forward_params = ibmmodel2._train(corpus, loop_count=10)
    forward_links = ibmmodel2.viterbi_alignment(es, fs, *forward_params)
    f2e = forward_links.items()

    # es -> fs direction: transpose the corpus into columns, flip the
    # column order, then transpose back so each entry is reversed.
    columns = list(zip(*corpus))
    swapped_corpus = list(zip(*reversed(columns)))
    backward_params = ibmmodel2._train(swapped_corpus, loop_count=10)
    backward_links = ibmmodel2.viterbi_alignment(fs, es, *backward_params)
    e2f = backward_links.items()

    return alignment(es, fs, e2f, f2e)
示例#2
0
def symmetrization(es, fs, corpus):
    '''
    Align es and fs in both directions and combine the two results.

    corpus
        training data for translation from fs to es; it is handed to
        ibmmodel2._train as given, and once more with the fields of
        every entry reversed for the opposite direction
    es, fs
        the two sentences to align (presumably token sequences --
        confirm against ibmmodel2.viterbi_alignment)
    return
        the result of alignment(es, fs, e2f, f2e), described by the
        original author as the alignment **from fs to es**
    '''
    f2e_train = ibmmodel2._train(corpus, loop_count=1000)
    f2e = ibmmodel2.viterbi_alignment(es, fs, *f2e_train).items()

    # zip() returns a one-shot iterator on Python 3, which reversed()
    # rejects with TypeError.  Materialise the transposed corpus before
    # reversing it, and materialise the result as well so the training
    # routine receives a real list (as it did on Python 2).
    transposed = list(zip(*corpus))
    e2f_corpus = list(zip(*reversed(transposed)))
    e2f_train = ibmmodel2._train(e2f_corpus, loop_count=1000)
    e2f = ibmmodel2.viterbi_alignment(fs, es, *e2f_train).items()

    return alignment(es, fs, e2f, f2e)
示例#3
0
 def test_viterbi_alignment(self):
     # Both tables are empty defaultdict(int) instances, so any key
     # that is looked up in t or a reads as 0.
     es = [1, 2, 1]
     fs = [2, 3, 2]
     t = collections.defaultdict(int)
     a = collections.defaultdict(int)

     x = viterbi_alignment(es, fs, t, a)

     # Viterbi_alignment selects the first token
     # if t or a doesn't contain the key.
     # This means it returns NULL token
     # in such a situation.
     expected = {1: 1, 2: 1, 3: 1}
     self.assertEqual(x, expected)
示例#4
0
 def test_viterbi_alignment(self):
     # Empty defaultdict(int) tables: every missing key reads as 0,
     # so neither t nor a contains any of the keys being looked up.
     empty_t = collections.defaultdict(int)
     empty_a = collections.defaultdict(int)
     first, second = [1, 2, 1], [2, 3, 2]

     x = viterbi_alignment(first, second, empty_t, empty_a)

     # Viterbi_alignment selects the first token
     # if t or a doesn't contain the key.
     # This means it returns NULL token
     # in such a situation.
     want = {1: 1, 2: 1, 3: 1}
     self.assertEqual(x, want)
示例#5
0
    delimiter = ","
    # Load the training file named by the first command-line argument.
    # The context manager closes the handle as soon as the lines have
    # been read, instead of leaving it open until the process exits.
    with open(sys.argv[1]) as modelfd:
        sentenses = [line.rstrip().split(delimiter) for line in modelfd]
    # make corpus
    corpus = mkcorpus(sentenses)

    # Train one model per direction.  The second corpus is the first
    # one with the fields of every entry reversed (transpose, flip the
    # column order, transpose back).
    f2e_train = ibmmodel2._train(corpus, loop_count=10)
    e2f_corpus = list(zip(*reversed(list(zip(*corpus)))))
    e2f_train = ibmmodel2._train(e2f_corpus, loop_count=10)

    # Phrase extraction: each stdin line must contain exactly one
    # delimiter separating the two sentences, otherwise the unpacking
    # below raises ValueError.
    for line in sys.stdin:
        _es, _fs = line.rstrip().split(delimiter)
        es = _es.split()
        fs = _fs.split()

        f2e = ibmmodel2.viterbi_alignment(es, fs, *f2e_train).items()
        e2f = ibmmodel2.viterbi_alignment(fs, es, *e2f_train).items()
        align = alignment(es, fs, e2f, f2e)  # symmetrized alignment

        # output matrix
        #from smt.utils.utility import matrix
        #print(matrix(len(es), len(fs), align, es, fs))

        ext = phrase_extract(es, fs, align)
        for e, f in ext:
            # The items of each phrase are concatenated with no
            # separator between them.
            print("{}{}{}".format(''.join(e), delimiter, ''.join(f)))
示例#6
0
    # Load the training file named by the first command-line argument.
    # The context manager closes the handle as soon as the lines have
    # been read, instead of leaving it open until the process exits.
    with open(sys.argv[1]) as modelfd:
        sentenses = [
            line.rstrip().split(delimiter) for line in modelfd
        ]
    # make corpus
    corpus = mkcorpus(sentenses)

    # Train one model per direction.  The second corpus is the first
    # one with the fields of every entry reversed (transpose, flip the
    # column order, transpose back).
    f2e_train = ibmmodel2._train(corpus, loop_count=10)
    e2f_corpus = list(zip(*reversed(list(zip(*corpus)))))
    e2f_train = ibmmodel2._train(e2f_corpus, loop_count=10)

    # Phrase extraction: each stdin line must contain exactly one
    # delimiter separating the two sentences, otherwise the unpacking
    # below raises ValueError.
    for line in sys.stdin:
        _es, _fs = line.rstrip().split(delimiter)
        es = _es.split()
        fs = _fs.split()

        f2e = ibmmodel2.viterbi_alignment(es, fs, *f2e_train).items()
        e2f = ibmmodel2.viterbi_alignment(fs, es, *e2f_train).items()
        align = alignment(es, fs, e2f, f2e)  # symmetrized alignment

        # output matrix
        #from smt.utils.utility import matrix
        #print(matrix(len(es), len(fs), align, es, fs))

        ext = phrase_extract(es, fs, align)
        for e, f in ext:
            # The items of each phrase are concatenated with no
            # separator between them.
            print("{}{}{}".format(''.join(e), delimiter, ''.join(f)))