예제 #1
0
 def predict(self, output_file):
     """Decode each old-spelling test line and write the predictions.

     Composes the language model with the typo model once, then, per
     line, composes the result with that line's word model and takes
     the Viterbi best path.

     output_file: path the predicted lines are written to.
     """
     self.num_lines = len(self.old_test)  # for the status bar
     with open(output_file, 'w') as f:
         # lm o tm is line-independent, so compose it once outside the loop
         fst_m = fst.compose(self.fst_mlm, self.fst_mtm)
         for line_no, (old_line, new_line) in enumerate(
                 zip(self.old_test, self.new_test)):
             fst_mw = fst_wrapper.get_fst_mw(old_line)
             # compose the lm, tm, and wm, and find the best path
             _fst = fst.compose(fst_m, fst_mw)
             path = viterbi.viterbi_path(_fst)
             # rebuild the predicted line from the best path, keeping
             # only symbols in the lm's input alphabet (drops epsilons)
             predicted_line = ''
             for p in path[0]:
                 if p[0][0][0] in self.fst_mlm.input_alphabet:
                     predicted_line += p[0][0][0]
             # print out the first 10 lines with their log probability
             if line_no < 10:
                 sys.stdout.write(predicted_line)
                 print(path[1])
             else:  # show the progress bar
                 self.status_bar(line_no)
             # write the prediction to the file
             f.write(predicted_line)
     # print out the transitions with weight of at least 0.1
     for state, transitions in self.fst_mtm.transitions_to.items():
         for transition, weight in transitions.items():
             if weight >= 0.1:
                 print(str(transition) + ' = ' + str(weight))
예제 #2
0
 def train(self, output_file, iterations):
     """Train the typo model by iterative reweighting over parallel text.

     After each pass, predictions are written to output_file and the
     character error rate against the modern test text is printed.

     output_file: path predictions are written to each iteration.
     iterations: number of passes over the training data.
     """
     # construct the initial unweighted typo model
     self.fst_mtm = fst_wrapper.get_fst_mtm(self.old_train, self.new_train,
                                            False)
     for _ in range(iterations):  # iterate
         self.num_lines = len(self.old_train)  # for the status bar
         self.status_bar(0)  # track progress with a status bar
         # train on parallel text
         for line_no, (old_line, new_line) in enumerate(
                 zip(self.old_train, self.new_train), start=1):
             # construct the fst models for modern and old lines
             fst_mm = fst_wrapper.get_fst_mw(new_line)
             fst_me = fst_wrapper.get_fst_mw(old_line)
             # compose the models and find the shortest path
             _fst = fst.compose(fst.compose(fst_mm, self.fst_mtm), fst_me)
             viterbi.viterbi_path(fst=_fst, get_counts=True)
             # reweight the tm with the new counts and renormalize
             for t, count in viterbi.counts.items():
                 self.fst_mtm.reweight_transition(t, count)
             self.fst_mtm.normalize_cond(.01)
             self.status_bar(line_no)
         print()  # add a line after the status bar
         self.predict(output_file)
         # print the overall score (character error rate);
         # use `with` so the predictions file is actually closed
         print('SCORE: ', end='')
         with open(output_file) as predictions:
             print(cer.cer(zip(self.new_test, list(predictions))))
예제 #3
0
 def predict(self, old_line, print_lines=True):
     """Predict the modern spelling of a single old-spelling line.

     old_line: the input line in the old spelling.
     print_lines: if True, echo the prediction and its log probability.

     Returns the predicted line as a string.
     """
     word_model = fst_wrapper.get_fst_mw(old_line)
     composed = fst.compose(self.fst_m, word_model)
     result = viterbi.viterbi_path(composed)
     # rebuild the output string from the best path, keeping only
     # symbols that belong to the composed model's input alphabet
     symbols = [step[0][0][0] for step in result[0]
                if step[0][0][0] in composed.input_alphabet]
     predicted_line = ''.join(symbols)
     # optionally print the modern line together with its log probability
     if print_lines:
         print(predicted_line, end='')
         print(result[1])
     return predicted_line
예제 #4
0
        q = ptr[q].q
    path.reverse()
    return vit[m.accept], path


if __name__ == '__main__':
    print('========== CONSTRUCT MODELS ==========')
    # bigram Kneser-Ney language model and IBM Model 1 translation model
    mlm = lm.make_kneserney(process('data/train.en'), 2)
    mtm = make_tm(ibm_model1.make(), 'data/test.zh')
    print('========== TESTING ===================')
    with open('data/test.out', 'w') as wf:
        # translate every line of the test data
        for i, fs in enumerate(process('data/test.zh')):
            mf = make_fm(fs)
            # compose input, translation, and language models
            m = fst.compose(fst.compose(mf, mtm), mlm)
            try:
                # best path under the tie-breaking key
                wt, path = viterbi(m, key=lambda q: (q[0][0], -1 * len(q[1])))
                # keep the output labels, dropping epsilons and the
                # final (accepting) transition
                words = [t.a[1] for t in path[:-1] if t.a[1] != fst.EPSILON]
                out = " ".join(words)
                # echo the first 10 translations to stdout
                if i < 10:
                    print('%s' % out)
                # Uncomment to track progress on later lines:
                # else:
                #     print('\rLine #: %d' % i, end='')
                wf.write('%s\n' % out)
            except ValueError:
                # no accepting path found; keep the first 10 echoes aligned
                if i < 10:
                    print('')
예제 #5
0
# We can encode a string into a sequence of digits

# To create an FST that encodes a string, we first get a
# list with each symbol.  Use a name other than `str` so the
# builtin str type is not shadowed.
text = "mary saw the dog in the park with the telescope"
letters = list(text)

# Then use linearchain().
# The first argument is the list of symbols,
# the second argument is an optional FST to take the symbol
# table from, and the last argument is an optional list of
# list items to ignore.
infst = fst.linearchain(letters, let2dig, [' '])

# Now encode the input FST (letters) into digits
encfst = fst.compose(infst, let2dig)

print("Input string: ", encfst.get_in_string())
print()
print("Output string:", encfst.get_out_string())
print()
print("Now reconstructing input string from the output string...\n")

# Then go from digits back to letters
decfst = fst.compose(encfst, dig2let)

# The mapping is now one to many, so we can choose out
# strings based on unigram score

# Compose the decoded string FST with the unigram model FST
decscored = fst.compose(decfst, let2word)
예제 #6
0
                prev = i.q[1][0]            

    return ' '.join(reconstructed)


if __name__ == "__main__":
    # build the language model and translation model once
    M_LM = make_knes()
    M_TM = make_TM()

    test_file = '../data/final_data/test.tr'

    to_write = []

    # translate each test line; `with` guarantees the input file is closed
    with open(test_file) as tf:
        for i, line in enumerate(tf):
            line = line.strip()
            M_f = make_f(line)

            # compose input, translation, and language models
            composed = fst.compose(fst.compose(M_f, M_TM), M_LM)

            # best path through the composed model
            out = viterbi(composed)
            print(i, out)

            to_write.append(out)

    # write all translations; `with` flushes and closes the output file
    with open('test.translations', 'w') as write_file:
        for line in to_write:
            write_file.write(line + '\n')
        
예제 #7
0
 def train(self):
     """Build and cache the composed model: bigram LM over the modern
     training text composed with the typo model over the parallel text.
     """
     language_model = fst.make_ngram(self.new_train, 2)
     typo_model = fst_wrapper.get_fst_mtm(self.old_train, self.new_train)
     # cache the composition for use at prediction time
     self.fst_m = fst.compose(language_model, typo_model)