import math
import operator
from collections import defaultdict

import fst  # course-provided FST module


def make_tm(t, testfile):
    tm = fst.FST()
    tm.set_start('q0')
    tm.set_accept('q1')
    tm.add_transition(fst.Transition('q0', (fst.STOP, fst.STOP), 'q1'))
    known_words = set()
    # Store the transitions in a new format
    top_trans = defaultdict(dict)
    for trans, prob in t.items():
        if trans[1] == '∅':
            top_trans[trans[0]][fst.EPSILON] = prob
        else:
            top_trans[trans[0]][trans[1]] = prob
        known_words.add(trans[0])
    # Find and insert the top 10 translations
    for fw, trans in top_trans.items():
        for i, (ew, prob) in enumerate(
                sorted(trans.items(), key=operator.itemgetter(1), reverse=True)):
            if i >= 10:  # keep exactly the 10 most probable translations
                break
            tm.add_transition(fst.Transition('q0', (fw, ew), 'q0'), prob)
    # Add unknown words from the test data
    with open(testfile) as f:
        prob = math.pow(10, -100)
        for line in f:
            for w in line.rstrip().split():
                if w not in known_words:
                    tm.add_transition(
                        fst.Transition('q0', (w, fst.EPSILON), 'q0'), prob)
    return tm
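# A minimal usage sketch for make_tm, with hypothetical data. The
# translation table `t` is assumed to map (foreign, english) pairs to
# probabilities, with '∅' marking a null translation, as in the function
# above; 'toy_test.f' is a throwaway test file written here for the demo.
def _demo_make_tm():
    t = {('maison', 'house'): 0.8,
         ('maison', 'home'): 0.15,
         ('ne', '∅'): 0.6,
         ('ne', 'not'): 0.4}
    with open('toy_test.f', 'w') as f:
        f.write('la maison bleue\n')  # 'la' and 'bleue' are unknown words
    # Unknown test words get a near-zero-weight deletion arc, so composing
    # the TM with a test-sentence FST never fails outright.
    return make_tm(t, 'toy_test.f')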
def make_fm(f):
    fm = fst.FST()
    fm.set_start(0)
    for i, fw in enumerate(f):
        fm.add_transition(fst.Transition(i, (fw, fw), i + 1))
    fm.add_transition(
        fst.Transition(len(f), (fst.STOP, fst.STOP), len(f) + 1))
    fm.set_accept(len(f) + 1)
    return fm
def get_fst_mw(word):
    m = fst.FST()
    m.set_start("q0")
    n = 1
    for w in word:
        m.add_transition(fst.Transition("q" + str(n - 1), (w, w), "q" + str(n)))
        n += 1
    m.add_transition(fst.Transition("q" + str(n - 1), (fst.STOP, fst.STOP), "q" + str(n)))
    m.set_accept("q" + str(n))
    return m
def make_f(f):
    # Adapted from Homework 2 Solutions
    f = f.split()
    m = fst.FST()
    m.set_start(0)
    for (i, a) in enumerate(f):
        m.add_transition(fst.Transition(i, (a, a), i + 1))
    m.add_transition(fst.Transition(len(f), (fst.STOP, fst.STOP), len(f) + 1))
    m.set_accept(len(f) + 1)
    return m
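# A small sketch of the chain FST that make_f builds; make_fm and
# get_fst_mw above build the same chain (over a pre-tokenized list and
# over the characters of a word, respectively). There is one state per
# prefix of the input, a w:w arc per token, and a final STOP:STOP arc
# into the accept state, so composing this chain with a translation model
# restricts the TM to the words of this one sentence.
def _demo_make_f():
    m = make_f("the cat")
    # Expected arcs: (0, the:the, 1), (1, cat:cat, 2), (2, STOP:STOP, 3);
    # state 3 is the accept state.
    return m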
def make_kneserney(data, n):
    """Create a Kneser-Ney smoothed language model of order `n`, trained on
    `data`, as a `FST`.

    Note that the returned FST has epsilon transitions. To iterate over
    states in topological order, sort them using `lambda q: -len(q)` as
    the key.
    """
    # Estimate KN-smoothed models for orders 1, ..., n
    kn = {}
    for i in range(1, n + 1):
        kn[i] = KneserNey(data, i)
    # Create the FST. It has a state for every possible k-gram for k = 0, ..., n-1.
    m = fst.FST()
    m.set_start(("<s>",) * (n - 1))
    m.set_accept(("</s>",))
    for i in range(1, n + 1):
        for u in kn[i]._prob:
            if i > 1:
                # Add an epsilon transition that backs off from the i-gram
                # model to the (i-1)-gram model
                m.add_transition(
                    fst.Transition(u, (fst.EPSILON, fst.EPSILON), u[1:]),
                    kn[i]._bow[u])
            else:
                # Smooth 1-gram model with uniform distribution
                types = len(kn[i]._prob[u]) + 1
                for w in kn[i]._prob[u]:
                    m.add_transition(fst.Transition(u, (w, w), (w,)), 1 / types)
                m.add_transition(fst.Transition(u, ("<unk>", "<unk>"), ()), 1 / types)
            # Create transitions for word probabilities
            for w in kn[i]._prob[u]:
                # If we are in state u and read w, then v is the new state.
                # This should be the longest suffix of uw that is observed
                # in the training data.
                if w == "</s>":
                    v = ("</s>",)
                else:
                    v = u + (w,)
                    while len(v) > 0 and (len(v) >= n or v not in kn[len(v) + 1]._prob):
                        v = v[1:]
                m.add_transition(fst.Transition(u, (w, w), v), kn[i]._prob[u][w])
    return m
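# A hedged usage sketch for make_kneserney. It relies on a `KneserNey`
# estimator defined elsewhere in this codebase; the training data format
# below (an iterable of token sequences) is an assumption.
def _demo_make_kneserney():
    data = [["the", "cat", "sat"], ["the", "dog", "sat"]]  # toy corpus
    lm = make_kneserney(data, 2)  # bigram model with unigram backoff
    # Per the docstring, visit states longest-context-first so that
    # epsilon (backoff) transitions are processed in topological order:
    states = sorted(lm.states, key=lambda q: -len(q))
    return lm, states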
def make_tm(t, testfile):
    tm = fst.FST()
    tm.set_start('q0')
    tm.set_accept('q1')
    tm.add_transition(fst.Transition('q0', (fst.STOP, fst.STOP), 'q1'))
    known_words = set()
    for trans, prob in t.items():
        known_words.add(trans[0])
        if trans[1] == '∅':
            trans = (trans[0], fst.EPSILON)
        tm.add_transition(fst.Transition('q0', trans, 'q0'), prob)
    # Add unknown words from the test data
    with open(testfile) as f:
        prob = math.pow(10, -100)
        for line in f:
            for w in line.rstrip().split():
                if w not in known_words:
                    tm.add_transition(
                        fst.Transition('q0', (w, fst.EPSILON), 'q0'), prob)
    return tm
def get_fst_mtm(old_data, new_data, initialize=True):
    m = fst.FST()
    m.set_start("q0")
    # Get the old and modern alphabets from the training data
    input_alphabet = set()
    output_alphabet = set()
    for old_line in old_data:
        for w in old_line:
            output_alphabet.add(w)
    for new_line in new_data:
        for w in new_line:
            input_alphabet.add(w)
    # Generate the typo model
    for output_w in output_alphabet:
        m.add_transition(fst.Transition("q0", (fst.EPSILON, output_w), "q0"))  # insert
    for input_w in input_alphabet:
        m.add_transition(fst.Transition("q0", (input_w, fst.EPSILON), "q1"))  # delete
        for output_w in output_alphabet:
            # substitute
            m.add_transition(fst.Transition("q1", (input_w, output_w), "q0"))
            m.add_transition(fst.Transition("q0", (input_w, output_w), "q0"))
    # Add terminal transitions
    m.add_transition(fst.Transition("q0", (fst.STOP, fst.STOP), "q2"))
    m.add_transition(fst.Transition("q1", (fst.STOP, fst.STOP), "q2"))
    m.set_accept("q2")
    # Initialize the weights
    if initialize:
        for state in m.states:
            for transition in m.transitions_from[state].keys():
                # Higher probability if keeping the same character
                if transition.a[0] == transition.a[1]:
                    m.reweight_transition(transition, 100)
                else:
                    m.reweight_transition(transition, 1)
        m.normalize_cond()
    return m
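# A minimal sketch of how get_fst_mtm might be invoked, with toy data.
# Each line is iterated character by character, so plain strings work.
# Weights start biased 100:1 toward copying a character unchanged, and
# normalize_cond turns them into conditional probabilities (presumably
# to be re-estimated later, e.g. with EM).
def _demo_get_fst_mtm():
    old_data = ["ye olde", "shoppe"]   # output-side (old-spelling) lines
    new_data = ["the old", "shop"]     # input-side (modern) lines
    return get_fst_mtm(old_data, new_data, initialize=True)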
def make_TM():
    # Code adapted from Homework 2 Solution
    translations = read_translations()
    tm = fst.FST()
    tm.set_start(0)
    tm.set_accept(1)
    tm.add_transition(fst.Transition(0, ("</s>", "</s>"), 1), wt=1)
    for t in translations:
        for entry in translations[t]:
            # Each entry is a (target word, probability) pair
            tm.add_transition(fst.Transition(0, (t, entry[0]), 0), wt=float(entry[1]))
    test = '../data/final_data/test.tr'
    test_set = set()
    for test_line in open(test):
        for char in test_line.strip().split():
            test_set.add(char)
    for char in test_set:
        # Literal 'ε' is used as the epsilon symbol in this solution
        tm.add_transition(fst.Transition(0, (char, 'ε'), 0), wt=1e-100)
    return tm
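# make_TM assumes a read_translations() helper defined elsewhere; from
# its usage above, it returns a dict mapping each source word to a list
# of (target_word, probability_string) pairs. A hypothetical stub with
# that shape, useful for exercising make_TM in isolation:
def _stub_read_translations():
    return {
        'maison': [('house', '0.8'), ('home', '0.15')],
        'ne': [('not', '0.4')],
    }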