def subproc(pid, tasks, results, exits, lockTask):
  setting.runningMode = "mert"
  setting.load(["file_translation_input_tree", "file_translation_input_dep",
                "file_translation_output"])
  decoder = GentileDecoder()
  while True:
    # Fetch a task under the lock; an empty queue signals shutdown.
    lockTask.acquire()
    if not tasks.empty():
      task = tasks.get()
    else:
      task = None
    lockTask.release()
    if not task:
      exits.put(pid)
      return
    tid, lineTree, lineDep = task
    hyps = decoder.translateNBest(lineTree, lineDep)
    output = ""
    for hyp in hyps:
      # One n-best line: sentence id ||| translation ||| feature lambdas.
      line_output = " ||| ".join([str(tid), hyp.getTranslation(),
                                  " ".join([str(n) for n in hyp.getLambdas()])])
      output += line_output + "\n"
    if hyps:
      msg = "[%d] Got %d | %s" % (tid, len(hyps), hyps[0].getTranslation())
    else:
      # Guard against an empty n-best list so the worker does not crash.
      msg = "[%d] Translation Failed!!!" % tid
    results.put((tid, output, msg))
import StringIO

def subproc(pid, tasks, results, exits, lockTask):
  setting.runningMode = "normal"
  setting.load(["file_translation_input_tree", "file_translation_input_dep",
                "file_translation_output", "size_cube_pruning"])
  decoder = GentileDecoder()
  while True:
    # Fetch a task under the lock; an empty queue signals shutdown.
    lockTask.acquire()
    if not tasks.empty():
      task = tasks.get()
    else:
      task = None
    lockTask.release()
    if not task:
      exits.put(pid)
      return
    tid, lineTree, lineDep = task
    hyps = decoder.translateNBest(lineTree, lineDep)
    if not hyps:
      # Guard against an empty n-best list so the worker does not crash.
      results.put((tid, "\n", "[%d] Translation Failed!!!" % tid))
      continue
    result = hyps[0].getTranslation()
    output = result + "\n"
    # Capture the derivation trace of the best hypothesis for logging.
    msgStream = StringIO.StringIO()
    hyps[0].trace(stream=msgStream)
    print >> msgStream, "[%d]" % tid, result
    msg = msgStream.getvalue()
    results.put((tid, output, msg))
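The two subproc variants above are plain worker functions; the snippets do not show how they are launched. Below is a minimal sketch of a driver, assuming Python's standard multiprocessing primitives; the NUM_WORKERS constant and the run_pool helper are hypothetical names introduced here, not part of the original code.

import multiprocessing

NUM_WORKERS = 4  # Hypothetical; the real worker count is configured elsewhere.

def run_pool(taskList):
  """Run subproc workers over taskList; each task is (tid, lineTree, lineDep)."""
  tasks = multiprocessing.Queue()
  results = multiprocessing.Queue()
  exits = multiprocessing.Queue()
  lockTask = multiprocessing.Lock()
  # Fill the task queue before the workers start; an empty queue is the
  # shutdown signal subproc checks for.
  for task in taskList:
    tasks.put(task)
  workers = [multiprocessing.Process(target=subproc,
                                     args=(pid, tasks, results, exits, lockTask))
             for pid in range(NUM_WORKERS)]
  for w in workers:
    w.start()
  # Every task produces exactly one (tid, output, msg) tuple.
  collected = []
  for _ in range(len(taskList)):
    tid, output, msg = results.get()
    print msg
    collected.append((tid, output))
  for w in workers:
    w.join()
  return collected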
def __init__(self):
  """
  Load rule table files.
  """
  setting.load(["rule_table_path", "dispersion_tables"])
  print "[GentileRuleTable] Loading rule table handles ..."
  self.ntables = setting.dispersion_tables
  # Keep one open handle per dispersion table for later seeking.
  self.ruletables = {}
  for itable in range(setting.dispersion_tables):
    self.ruletables[itable] = open("%s/rules.final.%d" % (setting.rule_table_path, itable))
  print "[GentileRuleTable] Loading index tables ..."
  self.indextables = {}
  for itable in range(setting.dispersion_tables):
    map_index = {}
    path_table = "%s/index.final.%d" % (setting.rule_table_path, itable)
    file_index_table = open(path_table, "r")
    line = file_index_table.readline()
    while line:
      pair = line.strip().split(" ")
      if len(pair) != 2:
        line = file_index_table.readline()
        continue
      hash_src, pos = pair
      # Source-side hashes are stored in hex; positions are byte offsets.
      map_index[int(hash_src, 16)] = int(pos)
      line = file_index_table.readline()
    self.indextables[itable] = map_index
    file_index_table.close()
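The constructor keeps an open file handle per dispersion table plus an in-memory map from the hex hash of a rule's source side to a byte offset in the matching rules file. A minimal sketch of how a lookup could use these structures follows; the findBySourceHash name and the hash-modulo dispersion scheme are assumptions, not confirmed by the snippet.

def findBySourceHash(self, hashSrc):
  """Return the raw rule line for a source-side hash, or None (a sketch)."""
  # Assumption: rules are dispersed across tables by hash modulo.
  itable = hashSrc % self.ntables
  pos = self.indextables[itable].get(hashSrc)
  if pos is None:
    return None
  ftable = self.ruletables[itable]
  ftable.seek(pos)  # Jump to the recorded byte offset.
  return ftable.readline().strip()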
def build_lex_stack(self, tree, ruletable, model):
  """
  Build a stack saving the n-best lexical translation hypotheses for each
  node; this n is usually the same as the cube pruning size.
  @type tree: DepTreeStruct
  @type ruletable: NovelDepStrRuleTable
  @type model: NovelDepStrModel
  @rtype: dict
  """
  stack_lex = {}
  # For each node in the tree.
  for id in tree.nodes:
    if id == 0:
      continue  # Skip the root node.
    word = tree.nodes[id][0]
    # Find lexical rules for the word of this node, and sort them.
    rules = ruletable.findByHeadAndSrcLen(word + ":0", 1)
    if len(rules) == 0:
      # No lexical rules found; fall back to the lexical translation table
      # and construct a pseudo rule.
      setting.load(["file_lexical_translation_table"])
      if word in self.lexicalTable:
        # Lexical rules of this word are cached.
        rules = self.lexicalTable[word]
      else:
        # Not found in the cache, so build pseudo rules from the table.
        print "[NovelDepStrDecoder] build pseudo lex rule for '%s'" % word
        lines = open(setting.file_lexical_translation_table).xreadlines()
        section_entered = False
        for line in lines:
          if line.startswith(word + " "):
            section_entered = True
            word_src, word_ref, prob = line.strip().split(" ")
            pseudo_rule = "%s:0 ||| %s ||| %s ||| ||| %s ||| %s %s %s %s 2.7182" \
                          % (word, word, word_ref, prob, prob, prob, prob, prob)
            rules.append(pseudo_rule)
          elif section_entered:
            # The table is grouped by source word, so once past this word's
            # section no more of its rules can appear; stop scanning.
            break
        if len(rules) == 0:
          # Still not found (maybe a number); translate the word as itself.
          pseudo_rule = "%s:0 ||| %s ||| %s ||| ||| 0.001 ||| 0.001 0.001 0.001 0.001 2.7182" \
                        % (word, word, word)
          rules.append(pseudo_rule)
        # Pseudo rule building finished; cache the result.
        self.lexicalTable[word] = rules
    rules = model.sortRules(rules, setting.size_cube_pruning)
    # Build hypotheses and append them to the lexical stack.
    stack_lex[id] = []
    for rule in rules:
      stack_lex[id].append(NovelDepStrHypothesis(model, (id, tree), rule))
  return stack_lex
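The pseudo rules built above use the same " ||| "-separated layout as regular rules. A minimal sketch of splitting such a line into its fields, with field names inferred from the pseudo-rule format string in this function rather than from documented code:

def parse_rule_line(line):
  """Split a pseudo-rule line into its ||| separated fields (a sketch)."""
  fields = line.split(" ||| ")
  head = fields[0]       # e.g. "word:0"
  source = fields[1]
  target = fields[2]
  alignment = fields[3]  # Empty for pseudo rules.
  prob = fields[4]
  scores = fields[5].split(" ")  # Four probabilities plus a constant.
  return head, source, target, alignment, prob, scores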
def __init__(self):
  """
  Initialize the model with the language model and weights
  given in the setting file.
  """
  setting.load(["max_gram", "file_lm", "weight_word_penalty", "weight_lm",
                "weight_translate_costs"])
  self.maxGram = int(setting.max_gram)
  sys.stderr.write("[NovelDepStrModel] loading language model...\n")
  self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
  self.weight_word_penalty = setting.weight_word_penalty
  self.weight_lm = setting.weight_lm
  self.weight_translate_costs = setting.weight_translate_costs
def __init__(self):
  """
  Initialize the model with the language model and weights
  given in the setting file.
  """
  setting.load(["max_gram", "file_lm", "weights"])
  self.maxGram = int(setting.max_gram)
  sys.stderr.write("[GentileModel] Loading language model...\n")
  self.lm = LanguageModel(lm_file=setting.file_lm, n=setting.max_gram)
  self.weights = setting.weights
  self.lenWeights = len(self.weights)
  # Weights for rule feature scores exclude the final weight.
  self.weightsForRules = self.weights[:-1]
  self.cacheLMProbs = {}
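Given that weightsForRules drops the final weight, a plausible reading is that rule feature costs are combined linearly under these weights. A minimal sketch of such a scoring helper; the scoreRuleCosts name is hypothetical, and the snippet itself does not show how the weights are applied.

def scoreRuleCosts(self, costs):
  """Linearly combine rule feature costs with the rule weights (a sketch)."""
  assert len(costs) == len(self.weightsForRules)
  return sum(w * c for w, c in zip(self.weightsForRules, costs))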
def translateNBest(self, data_tree, data_dep):
  """
  Translate in forest.
  @type data_tree: string
  @type data_dep: string
  @return:
  """
  # Get the parse tree.
  # self.model.cacheMode = False
  setting.load(["nbest", "hypothesis_cluster_limit", "head_phrases_limit"])
  tree = SenseTree(data_tree, data_dep)
  tree.rebuildTopNode()
  tree.appendXToTree()
  tree.upMergeAllConjNodes()
  tree.rebuildCommaNodes()
  tree.convertTags()
  tree.separateContiniousNonTerminals()
  tree.buildLevelMap()
  # Prepare for the chiropractic process.
  treeForest = [tree]
  resultStack = []
  for tree in treeForest:
    resultStack.append(self.chiropracticTranslation(tree))
""" Gentile Sense-to-string Model. Estimator for dispersed temperate rule tables. This will create final rule tables with translation probabilities. - Raphael 2012.9 """ import sys, os, math from abraham.setting import setting setting.load(["rule_table_path", "dispersion_tables"]) class Estimator: """ A Estimator to calulate rule probabilities then produce final rules. """ nSplittingRuleTable = None pathTables = None mapTargetCount = None stackContextRules = None def __init__(self): self.mapTargetCount = {} self.nSplittingRuleTable = setting.dispersion_tables self.pathTables = setting.rule_table_path def processTargetCountTable(self,path_file):
def translateNBestOLD(self, data_tree, data_dep):
  """
  Translate and return an n-best list.
  @type data_tree: string
  @type data_dep: string
  @rtype: list of GentileHypothesis
  """
  # First, we need to get the tree of the input.
  self.model.cacheMode = False
  setting.load(["nbest", "head_phrases_limit"])
  tree = SenseTree(data_tree, data_dep)
  tree.rebuildTopNode()
  tree.appendXToTree()
  tree.upMergeAllConjNodes()
  tree.rebuildCommaNodes()
  tree.convertTags()
  tree.separateContiniousNonTerminals()
  # tree.mergeContinuousNTs()
  fetcher = self.prepareRulesForTranslation(tree)
  # Build lexical hypothesis stack: { id -> [lexical hyp, ...] }.
  # stack_lex = self.buildLexicalStack(fetcher)
  hypStacks = {}
  # For each fragment (head node is not a leaf), bottom-up, use the
  # corresponding rules and basic hypotheses (lexical or normal) to build
  # normal hypotheses for the fragment.
  tree.buildLevelMap()
  cur_level = tree.getMaxLevel()
  # A dirty trick: save the current sense tree to a cross-module global.
  __builtin__.currentSenseTree = tree
  # Start pruning.
  self.model.cacheMode = True
  while cur_level > 0:
    nodes_cur_level = tree.getNodesByLevel(cur_level)  # [head id, ...]
    self.model.smode = (cur_level == 1)
    for node in nodes_cur_level:
      if node not in fetcher.joints:
        # Only prune for joint nodes.
        continue
      # Get rules; they may arrive in arbitrary order, no need to sort yet.
      rules, sitesInvolved = fetcher.mapJointRules[node]
      if not rules:
        # No rules found; fall back to CYK reconstruction.
        rc = Reconstructor(self.ruletable, self.model, tree, hypStacks, node)
        hyps = rc.parse()
      else:
        # Rules found, so sort them and run cube pruning to get the
        # normal hypotheses for the current node.
        rules = self.model.sortRules(rules)
        hyps = separately_prune(self.model, node, rules, sitesInvolved, hypStacks)
      hypStacks[node] = hyps
      self.model.clearCache()
    # End of current level.
    cur_level -= 1
  rootNode = tree.getRootNode()
  if rootNode not in hypStacks or len(hypStacks[rootNode]) == 0:
    # Translation failed.
    print "[GentileDecoder]", "Translation Failed!!!"
    return []
  # End of building the normal hypothesis stack.
  # hypStacks[rootNode][0].trace()
  return hypStacks[rootNode][:setting.nbest]
import os
import sys

from abraham.treestruct import DepTreeStruct
from abraham.setting import setting
from gentile.tagconvertor import convert_tags_for_tokens

setting.load(["x_as_tag"])

PATTERN_SEPARATE_NTS = "NP NP,NP NP NP,NP VP,NP VBZ,NP NP VBZ,S VBZ,DT X".split(",")

class GeneralTree:
  """
  A class representing a general tree structure.
  """
  nodes = None
  mapParent = None
  mapChildren = None
  root = None

  def __init__(self):
    """
    Initialize members.
    """
    self.nodes, self.mapParent, self.mapChildren = {}, {}, {}

  def node(self, id):
    """
    Get the node with the given node id.
    """
    return self.nodes[id] if id in self.nodes else None
def translateStanfordNBest(self, data_tag, data_dep):
  """
  Translate and return an n-best list.
  @type data_tag: string
  @type data_dep: string
  @rtype: list of NovelDepStrHypothesis
  """
  # First, we need to get the tree of the input.
  setting.load(["nbest"])
  tree = DepTreeStruct(data_tag, data_dep)
  ruletable = self.ruletable
  model = self.model
  # Build the lexical hypothesis stack: { id -> [lexical hyp, ...] }.
  stack_lex = self.build_lex_stack(tree, ruletable, model)
  # Normal hypothesis stacks: { id -> [normal hyp, ...] }.
  stack_normal = {}
  # For each fragment (head node is not a leaf), bottom-up, use the
  # corresponding rules and basic hypotheses (lexical or normal) to build
  # normal hypotheses for the fragment.
  cur_level = tree.getMaxLevel() - 1
  while cur_level > 0:
    nodes_cur_level = tree.getNodesByLevel(cur_level)  # [head id, ...]
    for headid in nodes_cur_level:
      # Only build normal hypotheses for internal nodes.
      if tree.isLeafNode(headid):
        continue
      # Build the rule list.
      cur_fragment = (headid, tree)
      rules = ruletable.findByFragment(cur_fragment)
      if len(rules) == 0:
        # No rules found; fall back to a pseudo rule.
        rules = [ruletable.buildPsuedoRule(cur_fragment)]
      # [(rule, cost), ...]
      stack_rules = model.sortRules(rules, setting.size_cube_pruning)
      # Build hypothesis stacks for cube pruning:
      #   head node -> lexical hypotheses
      #   internal nodes -> normal hypotheses
      #   leaf nodes -> lexical hypotheses
      stacks_pruning = []
      # Append the lexical hypothesis stack of the head node.
      stacks_pruning.append(stack_lex[headid])
      # Add the stacks of the other nodes.
      for nodeid in tree.getChildNodes(headid):
        if tree.isLeafNode(nodeid):
          stacks_pruning.append(stack_lex[nodeid])
        else:
          # Internal node: its stack was built at a deeper level.
          assert nodeid in stack_normal
          stacks_pruning.append(stack_normal[nodeid])
      # Run cube pruning to get the normal hypotheses for the current node.
      pruner = AllInOneCubePruner(model, cur_fragment, stack_rules,
                                  stacks_pruning, setting.size_cube_pruning)
      list_hyps = pruner.prune()
      stack_normal[headid] = list_hyps
    # End of current level.
    cur_level -= 1
  if tree.headNodeId not in stack_normal or len(stack_normal[tree.headNodeId]) == 0:
    # Translation failed.
    print "[NovelDepStrDecoder]", "Translation Failed!!!"
    return []
  # End of building the normal hypothesis stack.
  stack_normal[tree.headNodeId][0].trace()
  return stack_normal[tree.headNodeId][:setting.nbest]
sys.path += ["%s/abraham" % os.path.dirname(os.path.abspath(__file__))] from abraham.setting import setting from gentile.decoder import GentileDecoder if __name__ == "__main__": #sys.argv.append ("config.yaml") #sys.argv.append ("mert") arg_length = len(sys.argv) if arg_length == 1: # abraham.py print "usage : python abraham.py config.yaml" elif arg_length == 2: # abraham.py config.yaml setting.runningMode = "normal" setting.load(["file_translation_input_tree","file_translation_input_dep","file_translation_output","size_cube_pruning"]) decoder = GentileDecoder() print "[Gentile]", "Interactive Mode" while True: sentence = raw_input("[INPUT]") sentence = sentence.strip() _, pathText = tempfile.mkstemp() _, pathCFG = tempfile.mkstemp() _, pathDep = tempfile.mkstemp()
the hot X1 went ||| SONO ATSUI X1 NAKU NATTA ||| NN ||| 0-0 1-1 ||| 0.125

- Raphael 2012.8
"""
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import math
import itertools
from heapq import merge

from abraham.setting import setting
from sense import SenseTree, GeneralTree

setting.load(["max_merge_nodes", "max_tokens"])

class Extractor:
  """
  Rule extractor for Gentile.
  """
  contentNodeSet = None
  sense = None
  """ @type: SenseTree """
  tree = None
  """ @type: GeneralTree """
  targets = None
  """ @type: list of string """
  mapAlignment = None
  """ @type: dict """
""" Gentile, sense tree-to-string model. Rule extractor. - Raphael 2012.8 """ import sys, os from abraham.setting import setting from gentile.extractor import Extractor from gentile.sense import SenseTree #sys.argv.append("config.yaml") setting.load(["file_source_tree","file_source_dep", "file_target","file_alignment","rule_table_path"]) if len(sys.argv) == 7: _,FILE_SOURCE_TREE,FILE_SOURCE_DEP,FILE_TARGET,FILE_ALIGNMENT,PATH_RULETABLES,_ = sys.argv else: FILE_SOURCE_TREE = setting.file_source_tree FILE_SOURCE_DEP = setting.file_source_dep FILE_TARGET = setting.file_target FILE_ALIGNMENT = setting.file_alignment PATH_RULETABLES = setting.rule_table_path linesTree = open(FILE_SOURCE_TREE).readlines() linesDep = open(FILE_SOURCE_DEP).read().split("\n\n") linesTarget = open(FILE_TARGET).readlines() linesAlignment = open(FILE_ALIGNMENT).readlines() print "[GENTILE] Extracting ..."
import sys

from abraham.setting import setting
from gentile.decoder import GentileDecoder
from chiropractor.decoder import ChiropracticDecoder

if __name__ == "__main__":
  # sys.argv.append("config.yaml")
  # sys.argv.append("mert")
  arg_length = len(sys.argv)
  if arg_length == 1:
    # abraham.py
    print "usage : python abraham.py config.yaml"
  elif arg_length == 2:
    # abraham.py config.yaml
    setting.runningMode = "normal"
    setting.load(["enable_chiropractic", "file_translation_input_tree",
                  "file_translation_input_dep", "file_translation_output",
                  "size_cube_pruning"])
    linesDep = open(setting.file_translation_input_dep).read().split("\n\n")
    linesTree = open(setting.file_translation_input_tree).readlines()
    # Pick the decoder depending on whether chiropractic mode is enabled.
    decoder = setting.enable_chiropractic and ChiropracticDecoder() or GentileDecoder()
    foutput = open(setting.file_translation_output, "w")
    print "[Abraham]", "translate %d sentences..." % len(linesTree)
    for i in range(len(linesTree)):
      lineTree = linesTree[i].strip()
      lineDep = linesDep[i].strip()
      hyps = decoder.translateNBest(lineTree, lineDep)
      # hyps[0].trace()
      # if len(hyps) == 0:
      #   print "[%d]" % i, "Translation Failed!!!"
import sys, os

from abraham.setting import setting

setting.load(["rule_table_path"])

if len(sys.argv) != 2:
  print "python auto-do-all.py [config.yaml]"
  sys.exit()

_, cfg = sys.argv
if not os.path.exists(cfg):
  print "config does not exist"
  sys.exit()

def run(cmd):
  # Echo each command before running it through the shell.
  print cmd
  os.system(cmd)

desc = cfg.replace("config.", "").replace(".yaml", "")

run("python extractor.gentile.py %s" % cfg)
# Filter extracted rules.
ruletable = "%s/rules.extracted" % setting.rule_table_path
run("mv %s %s.withdtcomma" % (ruletable, ruletable))
run("python research/remove-dt-to-comma-rules.py %s.withdtcomma > %s" % (ruletable, ruletable))
run("python estimator.gentile.py --disperse %s" % cfg)
run("python estimator.gentile.py --estimate %s" % cfg)
run("python index.gentile.py %s" % cfg)
# run("python gentile.m.py %s > research/%s.log" % (cfg, desc))
run("date")