def __init__(self):
    """
    Set up the model from the setting file.

    Loads the required settings, reports progress on stderr, builds the
    language model and copies the three feature weights onto the instance.
    """
    required_keys = [
        "max_gram",
        "file_lm",
        "weight_word_penalty",
        "weight_lm",
        "weight_translate_costs",
    ]
    setting.load(required_keys)

    self.maxGram = int(setting.max_gram)

    sys.stderr.write("[NovelDepStrModel] loading language model... \n")
    lm_path = setting.file_lm
    lm_order = setting.max_gram
    self.lm = LanguageModel(lm_file=lm_path, n=lm_order)

    # Feature weights are taken over from the settings unchanged.
    for weight_name in ("weight_word_penalty", "weight_lm", "weight_translate_costs"):
        setattr(self, weight_name, getattr(setting, weight_name))
def __init__(self):
    """
    Set up the model from the setting file.

    Loads the required settings, reports progress on stderr, builds the
    language model, stores the weight list (plus its length and the list
    without its last entry) and starts with an empty LM probability cache.
    """
    required_keys = ["max_gram", "file_lm", "weights"]
    setting.load(required_keys)

    self.maxGram = int(setting.max_gram)

    sys.stderr.write("[GentileModel] Loading language model... \n")
    lm_path = setting.file_lm
    lm_order = setting.max_gram
    self.lm = LanguageModel(lm_file=lm_path, n=lm_order)

    all_weights = setting.weights
    self.weights = all_weights
    self.lenWeights = len(all_weights)
    # Rules are scored with every weight except the last one.
    self.weightsForRules = all_weights[:-1]
    self.cacheLMProbs = {}
class GentileModel:
    """
    class for presenting the model in Gentile Model

    Scores rules and hypothesises as the weighted sum of their costs and
    scores token sequences with the language model.
    """
    # { rule->cost object }
    # costs : static costs : dynamic costs : lm
    # score : costs * weights
    maxGram = None
    lm = None
    weights = None
    lenWeights = None
    # weights without their last entry; rules are scored with these only
    # (presumably the dropped entry is the lm weight, per the layout above
    # -- TODO confirm)
    weightsForRules = None
    cacheMode = False
    cacheLMProbs = None
    # when True, getSentenseCost wraps the tokens in <s> ... </s>
    smode = False
    # fixed cost returned by getSentenseCost for an empty sentence
    _EMPTY_SENTENCE_COST = -0.15

    def __init__(self):
        """
        Initiate the language model
        initiate the model with weights in setting file
        """
        setting.load(["max_gram", "file_lm", "weights"])
        self.maxGram = int(setting.max_gram)
        sys.stderr.write("[GentileModel] Loading language model... \n")
        self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
        self.weights = setting.weights
        self.lenWeights = len(self.weights)
        self.weightsForRules = self.weights[:-1]
        self.cacheLMProbs = {}

    def calculateRuleScore(self, rule):
        """
        Calculate the score of a rule: the sum of its costs multiplied by
        the matching entries of weightsForRules. A rule without costs
        scores 0.

        @type rule: sequence whose element 2 is the list of costs
        @rtype: number
        """
        costs = rule[2]
        # Index the weights explicitly so that a rule carrying more costs
        # than there are rule weights still raises IndexError.
        return sum(cost * self.weightsForRules[i] for i, cost in enumerate(costs))

    def sortRules(self, rules, limit=None):
        """
        Score every rule and return the rules ordered from highest to
        lowest score, keeping only the first [limit] of them. The input
        list is left untouched; rules with equal scores keep their
        original relative order.

        @type rules: list of rules (see calculateRuleScore)
        @type limit: number, falls back to setting.size_beam when not given
        @rtype: list of rules
        """
        if not limit:
            limit = setting.size_beam
        # sorted() works on any sequence under both Python 2 and 3, and
        # calls the key function exactly once per rule.
        ranked = sorted(rules, key=self.calculateRuleScore, reverse=True)
        return ranked[:limit]

    def sortHypothesises(self, hyps, limit=None):
        """
        Sort hypothesises by their score attribute, highest first, and
        return the first [limit] of them. The list passed in is sorted in
        place.

        @type hyps: list of GentileHypothesises
        @type limit: number, falls back to setting.size_cube_pruning when not given
        @rtype: list of GentileHypothesises
        """
        if not limit:
            limit = setting.size_cube_pruning
        hyps.sort(key=lambda hyp: hyp.score, reverse=True)
        return hyps[:limit]

    def calculateHypothesisScore(self, hyp):
        """
        Get the score of a hypothesis: its first lenWeights costs, each
        multiplied by the matching weight, summed up.

        @type hyp: GentileHypothesis
        @rtype: number
        """
        return sum(hyp.costs[i] * self.weights[i] for i in range(self.lenWeights))

    def calculateScore(self, costs):
        """
        Calculate score of a pure cost list: each cost multiplied by the
        weight at the same position, summed up.

        @type costs: list of number
        @rtype: number
        """
        return sum(cost * self.weights[i] for i, cost in enumerate(costs))

    def getSentenseCost(self, tokens):
        """
        Return the language model probability of a token sequence, or a
        fixed cost of -0.15 for an empty one. In smode the sequence is
        scored between <s> and </s>; the list passed in is never modified.

        @type tokens: list of string
        @rtype: float
        """
        if not tokens:
            return self._EMPTY_SENTENCE_COST
        if self.smode:
            # Wrap a copy: inserting into the caller's list would leave the
            # sentence markers behind and double-wrap on the next call.
            tokens = ["<s>"] + list(tokens) + ["</s>"]
        # NOTE(review): in smode this check comes after wrapping, so a lone
        # "" token is still sent to the language model, as before -- confirm
        # whether that is intended.
        if len(tokens) == 1 and tokens[0] == "":
            return self._EMPTY_SENTENCE_COST
        return self.lm.tokensProbability(tokens)

    def clearCache(self):
        """
        clear lm cache
        !!! deprecated
        """
        pass
class NovelDepStrModel(Model):
    """
    class for presenting the model in Novel Dep-to-string Model

    Computes and caches a cost object per rule, ranks rules by cost and
    scores token sequences with the language model.
    """
    # { rule->cost object }
    # cost object : { word_penalty:,lm_cost:, table_cost:,future_cost:,rule_cost: }
    #               |-----no weight------|  |-----------weighted--------------|
    # Class-level default kept for backward compatibility; __init__ gives
    # every instance its own cache.
    mapRuleCostObj = {}
    maxGram = None
    lm = None
    weight_word_penalty = None
    weight_lm = None
    weight_translate_costs = None

    def __init__(self):
        """
        Initiate the language model
        initiate the model with weights in setting file
        """
        setting.load(["max_gram", "file_lm", "weight_word_penalty", "weight_lm", "weight_translate_costs"])
        self.maxGram = int(setting.max_gram)
        sys.stderr.write("[NovelDepStrModel] loading language model... \n")
        self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
        self.weight_word_penalty = setting.weight_word_penalty
        self.weight_lm = setting.weight_lm
        self.weight_translate_costs = setting.weight_translate_costs
        # Cost objects are computed with this instance's weights, so the
        # cache must not be shared with other instances.
        self.mapRuleCostObj = {}

    def calcLMforRule(self, tgts):
        """
        Calculate the language model cost and future cost of a rule target.

        A B C X D E X
        in 3 gram
        lcost = sum(cost of 3gram)
        fcost = sum(cost of 1gram and 2gram)

        "X" marks a gap; the n-gram context restarts after each one.

        @type tgts: list of string
        @rtype: (lcost,fcost)
        """
        ngram_start_pos = 0
        fcost, lcost = 0.0, 0.0
        for i, word in enumerate(tgts):
            if word == "X":
                # reset start position when meet x
                ngram_start_pos = i + 1
            elif i + 1 - ngram_start_pos < self.maxGram:
                # shorter than max gram : counts towards the future cost
                fcost += self.lm.readNGram(tgts[ngram_start_pos:i + 1])
            else:
                # a full max gram window ending at this word : language cost
                lcost += self.lm.readNGram(tgts[i - self.maxGram + 1:i + 1])
        return lcost, fcost

    def calcRuleCost(self, rule):
        """
        Calculate the cost object of a rule.

        The rule is a string of six fields joined by " ||| "; the third
        field is the comma separated target side and the last one holds the
        space separated translation costs. In "mert" running mode the
        weighted translation costs are also stored one by one under
        tcost_<index>.

        @type rule: string
        @rtype: object
        """
        objcost = {}
        head, src, tgt, align, pfreq, probs = rule.split(" ||| ")
        tgts = tgt.split(",")
        # one negative unit per target word, gaps ("X") not counted
        objcost['word_penalty'] = -(len(tgts) - tgts.count("X"))
        lcost, fcost = self.calcLMforRule(tgts)
        objcost['future_cost'] = fcost * self.weight_lm
        objcost['lm_cost'] = lcost
        translate_costs = [float(p) for p in probs.split(" ")]
        keep_single_costs = setting.runningMode == "mert"
        objcost['table_cost'] = 0.0
        for i, cost in enumerate(translate_costs):
            weighted_cost = self.weight_translate_costs[i] * cost
            objcost['table_cost'] += weighted_cost
            if keep_single_costs:
                objcost['tcost_' + str(i)] = weighted_cost
        objcost['rule_cost'] = self.weight_word_penalty * objcost['word_penalty'] + \
                               self.weight_lm * objcost['lm_cost'] + \
                               objcost['table_cost']
        return objcost

    def calcHypCost(self, hyp):
        """
        calculate the cost of hyp
        (placeholder: always returns 1.0)

        @type hyp: NovelDepStrHypothesis
        @rtype: float
        """
        return 1.0

    def sortRules(self, rules, limit=1000):
        """
        For each [rules] , calculate the cost , and return sorted rules
        but only keep [limit] rules

        Rules are ordered by rule_cost + future_cost, highest first. The
        list passed in is sorted in place.

        @type rules: list of string
        @type limit: number
        @rtype: list of string
        """
        mapCurrentRuleCost = {}
        for rule in rules:
            objcost = self.getRuleCostObj(rule)
            mapCurrentRuleCost[rule] = objcost['rule_cost'] + objcost['future_cost']
        rules.sort(key=lambda x: mapCurrentRuleCost[x], reverse=True)
        return rules[:limit]

    def getRuleCostObj(self, rule):
        """
        Return the cost object of a rule, computing and caching it on
        first use.

        @type rule: string
        @rtype: object
        """
        # Key on the rule itself rather than hash(rule): two different
        # rules may share a hash value and must not share a cost object.
        try:
            return self.mapRuleCostObj[rule]
        except KeyError:
            objcost = self.calcRuleCost(rule)
            self.mapRuleCostObj[rule] = objcost
            return objcost

    def getSentenseCost(self, tokens):
        """
        Return the language model cost of the tokens averaged over all
        max gram windows; a sequence shorter than max gram is read as a
        single window.

        @type tokens: list of string
        @rtype: float
        """
        iters = max(1, len(tokens) - self.maxGram + 1)
        prob = 0.0
        for ibegin in range(iters):
            prob += self.lm.readNGram(tokens[ibegin:ibegin + self.maxGram])
        return prob / iters