def extract(self, words, ignore_out):
    """Build the language-model automaton over ``words`` and return it.

    A fresh ``openfst.StdVectorFst`` is stored on ``self.fsa``.  Besides the
    start state there is one state per ``("<s>", word)`` key and one per
    ``(word1, word2)`` key.  Arc weights come from
    ``self.score(prev2, prev1, next)``; every word-pair state is made final
    with weight ``self.score(word1, word2, "</s>")``.

    Args:
        words: sequence of ``(word, output_symbol)`` pairs.  It is iterated
            several times and passed to ``len``, so it must be a real
            sequence rather than a one-shot iterator.
        ignore_out: only its length is used, in the progress message.

    Returns:
        ``self.fsa``, with ``self.s_table`` installed as both the input
        and the output symbol table.
    """
    print >>sys.stderr, "LM Extractor %s %s" % (len(words), len(ignore_out))

    self.fsa = openfst.StdVectorFst()
    machine = self.fsa
    start = machine.AddState()
    machine.SetStart(start)

    def add_rho_loop(state):
        # Zero-weight RHO self-loop; per the original comment, RHO means
        # "anything allowed".
        machine.AddArc(state, fsa.simple_arc(fsa.RHO, 0.0, state))

    add_rho_loop(start)

    # Maps a (previous, current) token pair to its state id.
    history_state = {}

    # Phase 1: create every state, plus the arcs that leave the start state
    # and the ("<s>", word) states.
    for first_word, first_sym in words:
        opening_weight = self.score("<s>", "<s>", first_word)
        opening = ("<s>", first_word)
        history_state[opening] = machine.AddState()
        machine.AddArc(
            start,
            fsa.simple_arc(first_sym, opening_weight, history_state[opening]))
        add_rho_loop(history_state[opening])

        for second_word, second_sym in words:
            weight = self.score("<s>", first_word, second_word)
            following = (first_word, second_word)
            history_state[following] = machine.AddState()
            # The source state is looked up again on every pass on purpose:
            # the assignment just above may have replaced that entry when
            # "<s>" itself occurs in `words`.
            # NOTE(review): this arc is built with StdArc directly while the
            # others go through fsa.simple_arc -- presumably equivalent;
            # confirm.
            machine.AddArc(
                history_state[opening],
                StdArc(second_sym, second_sym, weight,
                       history_state[following]))

    # Phase 2: wire every (older, newer) state to every (newer, next) state.
    print >>sys.stderr, "starting inner"
    for older, _ in words:
        for newer, _ in words:
            source = history_state[older, newer]
            for next_word, next_sym in words:
                target = history_state[newer, next_word]
                weight = self.score(older, newer, next_word)
                machine.AddArc(source,
                               fsa.simple_arc(next_sym, weight, target))

            add_rho_loop(source)
            # A path may end in any word-pair state, at the "</s>" score.
            machine.SetFinal(source, self.score(older, newer, "</s>"))
    print >>sys.stderr, "done inner"

    print >>sys.stderr, "Done"
    machine.SetInputSymbols(self.s_table)
    machine.SetOutputSymbols(self.s_table)
    return self.fsa
def extract(self, original_length, words, ignore_out, step_weight, weights, length_penalty):
    """Build the counting automaton: a left-to-right chain of states.

    A fresh ``openfst.StdVectorFst`` is stored on ``self.counter_fst``.  The
    chain has ``2 * (original_length - 1) + 2`` states.  Every state except
    the last one gets three arcs, added in this order:

    * a zero-weight ``fsa.RHO`` self-loop,
    * a ``SRC_NODE`` arc with weight ``step_weight`` to the next state,
    * a ``PRE_WORD`` self-loop with weight ``length_penalty``.

    The last state has no outgoing arcs; the second-to-last state is the
    only final state (weight 0.0).

    Args:
        original_length: determines the chain length as described above.
        words: accepted but not used by this method.
        ignore_out: accepted but not used by this method.
        step_weight: weight of each ``SRC_NODE`` arc.
        weights: accepted but not used by this method.
        length_penalty: weight of each ``PRE_WORD`` self-loop.

    Returns:
        ``self.counter_fst``, with ``self.s_table`` installed as both the
        input and the output symbol table.
    """
    self.counter_fst = openfst.StdVectorFst()
    counter = self.counter_fst
    start = counter.AddState()
    counter.SetStart(start)

    # One is subtracted from the length to "ignore TOP UP and TOP DOWN";
    # the chain then has two steps per remaining unit ("up and down for
    # each NT", per the original comments) plus one extra trailing state.
    extra_states = 2 * (original_length - 1) + 1
    chain = [start]
    chain.extend(counter.AddState() for _ in range(extra_states))

    for here, ahead in zip(chain[:-1], chain[1:]):
        # "Don't count words": RHO keeps the automaton in the same state.
        counter.AddArc(here, fsa.simple_arc(fsa.RHO, 0.0, here))
        # SRC_NODE is the only symbol that advances along the chain.
        counter.AddArc(here, fsa.simple_arc(SRC_NODE, step_weight, ahead))
        # PRE_WORD stays in place but is charged length_penalty.
        counter.AddArc(here, fsa.simple_arc(PRE_WORD, length_penalty, here))

    # Last node is a sink, so second to last is final.
    counter.SetFinal(chain[-2], 0.0)

    counter.SetInputSymbols(self.s_table)
    counter.SetOutputSymbols(self.s_table)
    return self.counter_fst