예제 #1
0
  def extract(self, words, ignore_out):
    """Build the trigram language-model automaton over ``words``.

    The automaton is stored on ``self.fsa`` (an ``openfst.StdVectorFst``)
    and also returned.  Note that ``self.fsa`` is the automaton, while the
    global name ``fsa`` supplies ``simple_arc`` and ``RHO``.

    Layout, as constructed below:

    * a start state;
    * one state keyed ``("<s>", w)`` per word ``w``, entered from the start
      state with weight ``self.score("<s>", "<s>", w)``;
    * one state keyed ``(w1, w2)`` per ordered word pair, entered from
      ``("<s>", w1)`` with weight ``self.score("<s>", w1, w2)`` and from
      ``(w0, w1)`` with weight ``self.score(w0, w1, w2)``;
    * every state carries a zero-weight RHO self-loop (the original author's
      note: "RHO means anything allowed");
    * ``SetFinal`` is only called on the ``(w1, w2)`` states, with weight
      ``self.score(w1, w2, "</s>")``.

    Args:
      words: sized, re-iterable collection of ``(word, output)`` pairs.
        ``word`` is passed to ``self.score``; ``output`` is used as the arc
        label (presumably a symbol id from ``self.s_table`` -- confirm).
      ignore_out: only its length is used, in the diagnostic line written
        to stderr.

    Returns:
      ``self.fsa``, with input and output symbol tables set to
      ``self.s_table``.
    """
    # sys.stderr.write runs unchanged on Python 2 and Python 3; the former
    # ``print >>sys.stderr, ...`` statement raises TypeError on Python 3.
    log = sys.stderr.write
    log("LM Extractor %s %s\n" % (len(words), len(ignore_out)))

    self.fsa = openfst.StdVectorFst()
    lm = self.fsa

    initial = lm.AddState()
    lm.SetStart(initial)

    # Maps a (previous word, current word) context to its state id.
    states = {}

    lm.AddArc(initial, fsa.simple_arc(fsa.RHO, 0.0, initial))

    # Sentence-initial contexts: "<s> w1" and then "w1 w2".
    for (first_word, first_output) in words:
      first_score = self.score("<s>", "<s>", first_word)
      start_key = ("<s>", first_word)

      start_state = lm.AddState()
      states[start_key] = start_state
      lm.AddArc(initial, fsa.simple_arc(first_output, first_score, start_state))
      lm.AddArc(start_state, fsa.simple_arc(fsa.RHO, 0.0, start_state))

      for (second_word, second_output) in words:
        second_score = self.score("<s>", first_word, second_word)
        pair_state = lm.AddState()
        states[(first_word, second_word)] = pair_state
        # NOTE(review): this is the only arc built with StdArc directly
        # instead of fsa.simple_arc; presumably equivalent -- confirm.
        lm.AddArc(start_state,
                  StdArc(second_output, second_output, second_score, pair_state))

    log("starting inner\n")
    for (prev_word, _) in words:
      for (cur_word, _) in words:
        pair_state = states[(prev_word, cur_word)]

        # One trigram transition per possible next word.
        for (next_word, next_output) in words:
          next_state = states[(cur_word, next_word)]
          trigram_score = self.score(prev_word, cur_word, next_word)
          lm.AddArc(pair_state,
                    fsa.simple_arc(next_output, trigram_score, next_state))

        lm.AddArc(pair_state, fsa.simple_arc(fsa.RHO, 0.0, pair_state))

        # The sentence may end here, at the cost of the "</s>" trigram.
        end_score = self.score(prev_word, cur_word, "</s>")
        lm.SetFinal(pair_state, end_score)
    log("done inner\n")

    log("Done\n")
    lm.SetInputSymbols(self.s_table)
    lm.SetOutputSymbols(self.s_table)

    return self.fsa
예제 #2
0
  def extract(self, original_length, words, ignore_out, step_weight, weights, length_penalty):
    """Build the counting automaton and return it.

    The automaton is stored on ``self.counter_fst`` (an
    ``openfst.StdVectorFst``).  With ``l = original_length - 1`` (the
    original author's note: TOP UP and TOP DOWN are ignored), it is a chain
    of ``2*l + 2`` states.  Every state except the last one gets, in this
    order:

    * a zero-weight RHO self-loop (original note: "don't count words");
    * a ``SRC_NODE`` arc with weight ``step_weight`` to the next state;
    * a ``PRE_WORD`` self-loop with weight ``length_penalty``.

    The last state has no outgoing arcs and is never made final (a sink);
    the second-to-last state is final with weight 0.0, so it is reached
    after ``2*l`` ``SRC_NODE`` arcs.

    Args:
      original_length: used only to size the chain.  For values below 1 the
        chain has a single state and ``states[-2]`` raises ``IndexError``.
      words: unused here.
      ignore_out: unused here.
      step_weight: weight on each ``SRC_NODE`` arc.
      weights: unused here.
      length_penalty: weight on each ``PRE_WORD`` self-loop.

    Returns:
      ``self.counter_fst``, with input and output symbol tables set to
      ``self.s_table``.
    """
    l = original_length - 1

    self.counter_fst = openfst.StdVectorFst()
    counter = self.counter_fst

    start = counter.AddState()
    counter.SetStart(start)

    # Start state followed by 2*l + 1 further states.
    states = [start] + [counter.AddState() for _ in range(2 * l + 1)]

    # Walk consecutive (state, successor) pairs; the last state is skipped
    # as a source and therefore keeps no outgoing arcs.
    for (here, following) in zip(states, states[1:]):
      counter.AddArc(here, fsa.simple_arc(fsa.RHO, 0.0, here))
      counter.AddArc(here, fsa.simple_arc(SRC_NODE, step_weight, following))
      counter.AddArc(here, fsa.simple_arc(PRE_WORD, length_penalty, here))

    # Last node is a sink, so the second to last is the final one.
    counter.SetFinal(states[-2], 0.0)

    counter.SetInputSymbols(self.s_table)
    counter.SetOutputSymbols(self.s_table)

    return self.counter_fst