示例#1
0
 def __init__(self):
   """
   Initialize the model from the setting file: load the n-gram order,
   the language model, and the scoring weights.
   """
   setting.load(["max_gram","file_lm","weight_word_penalty","weight_lm","weight_translate_costs"])
   sys.stderr.write("[NovelDepStrModel] loading language model... \n")
   self.maxGram = int(setting.max_gram)
   self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
   # weights applied when combining rule / hypothesis costs
   self.weight_lm = setting.weight_lm
   self.weight_word_penalty = setting.weight_word_penalty
   self.weight_translate_costs = setting.weight_translate_costs
示例#2
0
 def __init__(self):
   """
   Initialize the model from the setting file: load the n-gram order,
   the language model, and the weight vector (all but the last weight
   are used for rule scoring, per weights[:-1] below).
   """
   setting.load(["max_gram","file_lm","weights"])
   sys.stderr.write("[GentileModel] Loading language model... \n")
   self.maxGram = int(setting.max_gram)
   self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
   weights = setting.weights
   self.weights = weights
   self.lenWeights = len(weights)
   self.weightsForRules = weights[:-1]
   self.cacheLMProbs = {}
示例#3
0
class GentileModel:
  """
  Scoring model of the Gentile system.

  Holds the language model and the weight vector loaded from the
  setting file, and scores rules / hypotheses as weighted cost sums.
  """
  # { rule->cost object }
  # costs : static costs : dynamic costs : lm
  # score : costs * weights
  maxGram = None          # n-gram order of the language model
  lm = None               # LanguageModel instance
  weights = None          # full weight vector
  lenWeights = None       # len(weights)
  weightsForRules = None  # weights[:-1]; applied to rule cost vectors
  cacheMode = False       # legacy flag for the deprecated LM cache
  cacheLMProbs = None     # deprecated LM probability cache
  smode = False           # when True, wrap token lists with <s> ... </s>

  def __init__(self):
    """
    Initiate the language model.

    Loads the n-gram order, LM file path and weight vector named in the
    setting file.
    """
    setting.load(["max_gram","file_lm","weights"])
    self.maxGram = int(setting.max_gram)
    sys.stderr.write("[GentileModel] Loading language model... \n")
    self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
    self.weights = setting.weights
    self.lenWeights = len(self.weights)
    self.weightsForRules = self.weights[:-1]
    self.cacheLMProbs = {}

  def calculateRuleScore(self, rule):
    """
    Calculate the weighted cost of a rule.

    @type rule: sequence whose third item (rule[2]) is a list of floats
    @rtype: float
    """
    costs = rule[2]
    # sum() replaces reduce(+): identical result, and an empty cost list
    # now yields 0.0 instead of raising.
    return sum(c * w for c, w in zip(costs, self.weightsForRules))

  def sortRules(self, rules, limit=None):
    """
    Score each rule and return the best [limit] rules, best first.

    @type rules: list
    @type limit: number
    @rtype: list
    """
    if not limit:
      limit = setting.size_beam

    scores = [self.calculateRuleScore(rule) for rule in rules]
    # sorted() over an index range works on both Python 2 and 3;
    # the previous range(...).sort() only existed on Python 2.
    order = sorted(range(len(rules)), key=lambda i: scores[i], reverse=True)
    return [rules[i] for i in order[:limit]]

  def sortHypothesises(self, hyps, limit=None):
    """
    Sort hypothesises by score (best first, in place) and keep [limit].

    @type hyps: list of GentileHypothesises
    """
    if not limit:
      limit = setting.size_cube_pruning

    hyps.sort(key=lambda x: x.score, reverse=True)
    return hyps[:limit]

  def calculateHypothesisScore(self, hyp):
    """
    Get score of a hypothesis: dot product of hyp.costs with weights.

    NOTE: iterates lenWeights entries, so hyp.costs must contain at
    least that many values (same assumption as the original reduce()).

    @type hyp: GentileHypothesis
    """
    return sum(hyp.costs[i] * self.weights[i] for i in range(self.lenWeights))

  def calculateScore(self, costs):
    """
    Calculate score of a pure cost list (dot product with weights).
    """
    return sum(c * w for c, w in zip(costs, self.weights))

  def getSentenseCost(self, tokens):
    """
    Language-model cost of a token sequence.

    NOTE(review): when smode is set this mutates the caller's list by
    inserting the <s> / </s> markers — preserved original behaviour.

    @type tokens: list of string
    @rtype: float
    """
    if not tokens:
      return -0.15

    if self.smode:
      tokens.insert(0, "<s>")
      tokens.append("</s>")

    if len(tokens) == 1 and tokens[0] == "":
      return -0.15

    return self.lm.tokensProbability(tokens)

  def clearCache(self):
    """
    Clear the LM cache.
    !!! deprecated — the cache is no longer used, so this is a no-op.
    """
    pass
示例#4
0
class NovelDepStrModel(Model):
  """
  Model for the novel dependency-to-string translation system.
  """
  # { hash(rule) -> cost object }
  # cost object : { word_penalty:,lm_cost:,       table_cost:,future_cost:,rule_cost: }
  #                 |-----no weight------|        |-----------weighted--------------|
  mapRuleCostObj = {}
  maxGram = None
  lm = None
  weight_word_penalty = None
  weight_lm = None
  weight_translate_costs = None

  def __init__(self):
    """
    Initiate the language model;
    initiate the model with weights in the setting file.
    """
    setting.load(["max_gram","file_lm","weight_word_penalty","weight_lm","weight_translate_costs"])
    self.maxGram = int(setting.max_gram)
    sys.stderr.write("[NovelDepStrModel] loading language model... \n")
    self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
    self.weight_word_penalty = setting.weight_word_penalty
    self.weight_lm = setting.weight_lm
    self.weight_translate_costs = setting.weight_translate_costs
    # Per-instance cache: the class-level dict would otherwise be shared
    # by every instance (mutable-class-attribute pitfall).
    self.mapRuleCostObj = {}

  def calcLMforRule(self, tgts):
    """
    A B C X D E X
    in 3 gram:
    lcost = sum(cost of full 3-grams)
    fcost = sum(cost of the shorter 1-grams and 2-grams)
    An "X" token resets the n-gram window (non-terminal slot).

    @type tgts: list of string
    @rtype: (lcost, fcost)
    """
    ngram_start_pos = 0
    fcost, lcost = 0.0, 0.0
    for i, word in enumerate(tgts):
      if word == "X":
        # reset start position when meeting an X
        ngram_start_pos = i + 1
      elif i + 1 - ngram_start_pos < self.maxGram:
        # window shorter than max gram: counts as future cost
        fcost += self.lm.readNGram(tgts[ngram_start_pos:i+1])
      else:
        # full n-gram (the former elif condition was the exact
        # complement, so plain else is equivalent): language cost
        lcost += self.lm.readNGram(tgts[i-self.maxGram+1:i+1])
    return lcost, fcost

  def calcRuleCost(self, rule):
    """
    Calculate the cost object of a rule.

    @type rule: string with " ||| "-separated fields
               (head, src, tgt, align, pfreq, probs)
    @rtype: dict
    """
    objcost = {}
    head, src, tgt, align, pfreq, probs = rule.split(" ||| ")
    tgts = tgt.split(",")

    # one penalty point per real word; "X" non-terminals excluded
    objcost['word_penalty'] = -(len(tgts) - tgts.count("X"))

    lcost, fcost = self.calcLMforRule(tgts)
    objcost['future_cost'] = fcost*self.weight_lm
    objcost['lm_cost'] = lcost

    translate_costs = [float(p) for p in probs.split(" ")]

    objcost['table_cost'] = 0.0
    for i, cost in enumerate(translate_costs):
      objcost['table_cost'] += self.weight_translate_costs[i]*cost
      if setting.runningMode == "mert":
        # keep each weighted feature separately for MERT tuning
        objcost['tcost_'+str(i)] = self.weight_translate_costs[i]*cost

    objcost['rule_cost'] = self.weight_word_penalty*objcost['word_penalty'] + \
                           self.weight_lm*objcost['lm_cost'] + \
                           objcost['table_cost']
    return objcost

  def calcHypCost(self, hyp):
    """
    Calculate the cost of a hypothesis.
    NOTE(review): placeholder — always returns 1.0.

    @type hyp: NovelDepStrHypothesis
    @rtype: float
    """
    return 1.0

  def sortRules(self, rules, limit=1000):
    """
    Score each rule (using the cost cache) and return the best [limit]
    rules, best first. Sorts [rules] in place.

    @type rules: list of string
    @type limit: number
    @rtype: list of string
    """
    mapCurrentRuleCost = {}
    for rule in rules:
      # getRuleCostObj already implements the hash-keyed cache, so the
      # duplicated hit/miss branches that used to live here are gone.
      objcost = self.getRuleCostObj(rule)
      mapCurrentRuleCost[rule] = objcost['rule_cost'] + objcost['future_cost']

    rules.sort(key=lambda x: mapCurrentRuleCost[x], reverse=True)
    return rules[:limit]

  def getRuleCostObj(self, rule):
    """
    Fetch the cost object for a rule, computing and caching it on miss.

    @type rule: string
    @rtype: dict
    """
    hash_key = hash(rule)
    if hash_key not in self.mapRuleCostObj:
      self.mapRuleCostObj[hash_key] = self.calcRuleCost(rule)
    return self.mapRuleCostObj[hash_key]

  def getSentenseCost(self, tokens):
    """
    Average LM cost over every max-gram window of [tokens].

    @type tokens: list of string
    @rtype: float
    """
    if len(tokens) < self.maxGram:
      iters = 1
    else:
      iters = len(tokens) - self.maxGram + 1
    prob = 0.0
    for ibegin in range(iters):
      prob += self.lm.readNGram(tokens[ibegin:ibegin+self.maxGram])
    return prob/iters