class Decoder:
    """Perform the actual translation job.

    Implements the translation model described in the Gentile model:
    the Stanford-parser output is turned into a SenseTree, rules are
    fetched for its joint nodes, and hypotheses are built bottom-up
    (level by level) with cube pruning.
    """

    # Rule table (GentileRuleTable), loaded once in __init__.
    ruletable = None
    # Translation/language model (GentileModel), loaded once in __init__.
    model = None
    # Reserved for a lexical table; never assigned in this class.
    lexicalTable = None

    def __init__(self):
        """Load the rule table and language model once (expensive!)."""
        self.ruletable = GentileRuleTable()
        self.model = GentileModel()

    def translate(self, data_tree, data_dep):
        """Translate data in the format of the Stanford parser
        (tag, basicDependency).

        @type data_tree: string
        @type data_dep: string
        @rtype: string -- best translation, or "" when decoding fails
        """
        hyps = self.translateNBest(data_tree, data_dep)
        if not hyps:
            # translateNBest returns [] on failure; without this guard
            # the indexing below raised an opaque IndexError.
            return ""
        return hyps[0].getTranslation()

    def prepareRulesForTranslation(self, tree):
        """Decide joint nodes for pruning, and fetch rules for each joint node.

        @type tree: SenseTree
        @rtype: GentileRuleFetcher
        """
        fetcher = GentileRuleFetcher(tree, self.ruletable, self.model)
        fetcher.fetch()
        return fetcher

    def buildLexicalStack(self, fetcher):
        """For each joint node, create lexical hypotheses iff lexical
        rules exist for that node.

        @type fetcher: GentileRuleFetcher
        @rtype: dict -- node id -> [GentileHypothesis, ...] sorted by score
        """
        stack_lex = {}
        for node in fetcher.joints:
            lexrules = fetcher.mapJointRules[node][1]
            if lexrules:
                # Create hypotheses using these lexical rules.
                hyps = [GentileHypothesis(self.model, lexrule, {}) for lexrule in lexrules]
                hyps = self.model.sortHypothesises(hyps)
                stack_lex[node] = hyps
        return stack_lex

    def translateNBest(self, data_tree, data_dep):
        """Translate and return an N-best list.

        @type data_tree: string
        @type data_dep: string
        @rtype: list of GentileHypothesis ([] when translation fails)
        """
        # First, we need to build the sense tree of the input.
        self.model.cacheMode = False
        setting.load(["nbest"])
        tree = SenseTree(data_tree, data_dep)
        tree.rebuildTopNode()
        tree.appendXToTree()
        tree.upMergeAllConjNodes()
        tree.rebuildCommaNodes()
        tree.convertTags()
        tree.separateContiniousNonTerminals()
        # tree.mergeContinuousNTs()
        fetcher = self.prepareRulesForTranslation(tree)
        # Hypothesis stacks: { node id -> [hypothesis, ...] }.
        hypStacks = {}
        # For each fragment (head node is not leaf), in bottom-up style,
        # use the corresponding rules and basic hypotheses (lexical or
        # normal) to build normal hypotheses for this fragment.
        tree.buildLevelMap()
        cur_level = tree.getMaxLevel()
        # A dirty trick: save current sense tree to a cross-module
        # global variable.
        __builtin__.currentSenseTree = tree
        # Start pruning.
        self.model.cacheMode = True
        while cur_level > 0:
            # [head id, ...] at this depth.
            nodes_cur_level = tree.getNodesByLevel(cur_level)
            # Sentence mode only at the root level (level 1).
            self.model.smode = (cur_level == 1)
            for node in nodes_cur_level:
                if node not in fetcher.joints:
                    # Only prune for joint nodes.
                    continue
                # Get rules; they may arrive in random order, no sort needed here.
                rules, sitesInvolved = fetcher.mapJointRules[node]
                if not rules:
                    # No rules found: force CYK-style reconstruction.
                    rc = Reconstructor(self.ruletable, self.model, tree,
                                       hypStacks, node)
                    hyps = rc.parse()
                else:
                    # Rules found: sort them, then run cube pruning to get
                    # normal hypotheses for the current node.
                    rules = self.model.sortRules(rules)
                    hyps = separately_prune(self.model, node, rules,
                                            sitesInvolved, hypStacks)
                hypStacks[node] = hyps
                self.model.clearCache()
                # End of current node.
            cur_level -= 1
        rootNode = tree.getRootNode()
        if rootNode not in hypStacks or not hypStacks[rootNode]:
            # Failed: no hypothesis covers the root node.
            # Single-string print works identically under Py2 and Py3.
            print("[GentileDecoder] Translation Failed!!!")
            return []
        # End of building the normal hypothesis stack.
        # hypStacks[rootNode][0].trace()
        return hypStacks[rootNode][:setting.nbest]
class ChiropracticDecoder:
    """Perform the actual translation job ("chiropractic" forest variant).

    Presents the translation model described in the Gentile model, but
    decodes by splitting the sense tree into phrase areas and running a
    phrase-level CYK over them (see chiropracticTranslation).
    """

    # Rule fetcher for source-string lookups.
    # @type: RuleFetcher
    rulefetcher = None
    # Rule table, loaded once in __init__.
    ruletable = None
    # Translation/language model, loaded once in __init__.
    # @type: GentileModel
    model = None
    # Reserved for a lexical table; never assigned in this class.
    lexicalTable = None

    def __init__(self):
        """Load rule table and language model once !!!"""
        self.ruletable = GentileRuleTable()
        self.model = GentileModel()
        self.rulefetcher = RuleFetcher(self.ruletable, self.model)

    def translate(self, data_tree, data_dep):
        """Translate data in the format of the Stanford parser
        (tag, basicDependency).

        @type data_tree: string
        @type data_dep: string
        @rtype: string
        """
        # NOTE(review): if translateNBest fails (its assert fires or it
        # returns an empty list) this indexing raises; no guard here.
        return self.translateNBest(data_tree, data_dep)[0].getTranslation()

    def buildLexicalStack(self, fetcher):
        """For each joint node, create lexical hypotheses iff lexical
        rules exist for that node.

        @type fetcher: GentileRuleFetcher
        @rtype: dict -- node id -> sorted [GentileHypothesis, ...]
        """
        stack_lex = {}
        for node in fetcher.joints:
            lexrules = fetcher.mapJointRules[node][1]
            if lexrules:
                # Create hypotheses using these lexical rules.
                hyps = [GentileHypothesis(self.model, lexrule, {}) for lexrule in lexrules]
                hyps = self.model.sortHypothesises(hyps)
                stack_lex[node] = hyps
        return stack_lex

    def translateNBestOLD(self, data_tree, data_dep):
        """Translate and return an N-best list (legacy tree decoder).

        NOTE(review): this calls self.prepareRulesForTranslation, which is
        defined on Decoder but NOT on this class -- invoking this method
        would raise AttributeError. Appears to be dead legacy code.

        @type data_tree: string
        @type data_dep: string
        @rtype: list of GentileHypothesis
        """
        # First, we need to get the tree of the input.
        self.model.cacheMode = False
        setting.load(["nbest", "head_phrases_limit"])
        tree = SenseTree(data_tree, data_dep)
        tree.rebuildTopNode()
        tree.appendXToTree()
        tree.upMergeAllConjNodes()
        tree.rebuildCommaNodes()
        tree.convertTags()
        tree.separateContiniousNonTerminals()
        # tree.mergeContinuousNTs()
        fetcher = self.prepareRulesForTranslation(tree)
        # Build lexical hypothesis stack { id -> [lexical hyp, ...] }.
        # stack_lex = self.buildLexicalStack(fetcher)
        # { id -> [lexical hyp, ...] }
        hypStacks = {}
        # For each fragment (head node is not leaf) in bottom-up style,
        # use corresponding rules and basic hypotheses (lex or normal)
        # to build normal hypotheses for this fragment.
        tree.buildLevelMap()
        cur_level = tree.getMaxLevel()
        # A dirty trick: save current sense tree to a cross-module global.
        __builtin__.currentSenseTree = tree
        # Start pruning.
        self.model.cacheMode = True
        while cur_level > 0:
            # [head id, ...]
            nodes_cur_level = tree.getNodesByLevel(cur_level)
            if cur_level == 1:
                self.model.smode = True
            else:
                self.model.smode = False
            for node in nodes_cur_level:
                if node not in fetcher.joints:
                    # Only prune for joint nodes.
                    continue
                # Get rules; order is arbitrary, no sort needed here.
                rules, sitesInvolved = fetcher.mapJointRules[node]
                if not rules:
                    # No rules found, force to use CYK.
                    rc = Reconstructor(self.ruletable, self.model, tree, hypStacks, node)
                    hyps = rc.parse()
                else:
                    # Rules found, then cube pruning: sort rules first, then
                    # run the cube pruning to get normal hypotheses for this node.
                    rules = self.model.sortRules(rules)
                    hyps = separately_prune(self.model, node, rules, sitesInvolved, hypStacks)
                hypStacks[node] = hyps
                self.model.clearCache()
                # End of current node.
            cur_level -= 1
        rootNode = tree.getRootNode()
        if rootNode not in hypStacks or len(hypStacks[rootNode]) == 0:
            # Failed.
            print "[GentileDecoder]", "Translation Failed!!!"
            return []
        # End of building the normal hypothesis stack.
        # hypStacks[rootNode][0].trace()
        return hypStacks[rootNode][:setting.nbest]

    def translateNBest(self, data_tree, data_dep):
        """Translate in forest (chiropractic) mode.

        @param data_tree: Stanford parse tree string
        @param data_dep: Stanford dependency string
        @return: N-best hypothesis list (see chiropracticTranslation)
        """
        # Get parse tree.
        # self.model.cacheMode = False
        setting.load(["nbest", "hypothesis_cluster_limit", "head_phrases_limit"])
        tree = SenseTree(data_tree, data_dep)
        tree.rebuildTopNode()
        tree.appendXToTree()
        tree.upMergeAllConjNodes()
        tree.rebuildCommaNodes()
        tree.convertTags()
        tree.separateContiniousNonTerminals()
        tree.buildLevelMap()
        # Prepare for chiropractic process.
        # treeForest = [tree]
        # resultStack = []
        # for tree in treeForest:
        #     resultStack.append(self.chiropracticTranslation(tree))
        return self.chiropracticTranslation(tree)

    def buildPhraseBorders(self, sense):
        """Build a map saving borders for each phrase; each phrase is
        identified by its node ID.

        @type sense: SenseTree
        @return: map node id -> (leftBorder, rightBorder)
        """
        borders = {}
        for node in sense.tree.nodes:
            nodeTokens = sense.tree.nodes[node]
            # Positive entries are terminal token ids; negatives reference
            # child nodes (non-terminals).
            terminals = [t for t in nodeTokens if t > 0]
            borders[node] = (min(terminals), max(terminals))
        return borders

    def buildPhraseClosedTokens(self, sense, borders):
        """Generate closed tokens for each phrase: the token slice between
        the phrase's first and last own terminal (inclusive).

        @param sense: SenseTree
        @param borders: map node id -> (leftBorder, rightBorder)
        @return: map node id -> [token, ...]
        """
        closedTokens = {}
        for node in borders:
            firstTerminal, lastTerminal = borders[node]
            nodeTokens = sense.tree.nodes[node]
            firstTerminalPos, lastTerminalPos = nodeTokens.index(firstTerminal), nodeTokens.index(lastTerminal)
            closedTokens[node] = nodeTokens[firstTerminalPos: lastTerminalPos + 1]
        return closedTokens

    def buildPhraseCoverages(self, sense):
        """Generate a map saving word coverage for each phrase, i.e. the
        span of terminals covered by the phrase and all its descendants.

        @type sense: SenseTree
        @return: map node id -> (min terminal, max terminal)
        """
        phraseCoverages = {}
        currentLevel = sense.getMaxLevel()
        # Bottom-up: children are processed before parents, so the child
        # coverage lists referenced below already exist.
        while currentLevel > 0:
            nodes = sense.getNodesByLevel(currentLevel)
            for node in nodes:
                terminals = [t for t in sense.tree.nodes[node] if t > 0]
                nonTerminals = [t for t in sense.tree.nodes[node] if t < 0]
                for nonTerminal in nonTerminals:
                    terminals.extend(phraseCoverages[-nonTerminal])
                phraseCoverages[node] = terminals
            currentLevel -= 1
        # Collapse each terminal list to its (min, max) span.
        for phrase in phraseCoverages:
            phraseCoverages[phrase] = (min(phraseCoverages[phrase]), max(phraseCoverages[phrase]))
        return phraseCoverages

    def buildPhraseDependencies(self, sense, borders):
        """Generate phrase dependencies for a given parse tree. These
        special phrase dependencies are yielded by an internal X
        (non-terminal) inside a phrase's closed token span.

        @param sense: parse tree
        @type sense: SenseTree
        @param borders: map node id -> (leftBorder, rightBorder)
        @return: map node id -> [ area, ... ] where area = (left, right)
        """
        phraseDependencies = {}
        for node, nodeTokens in sense.tree.nodes.items():
            firstTerminal, lastTerminal = borders[node]
            firstTerminalPos, lastTerminalPos = nodeTokens.index(firstTerminal), nodeTokens.index(lastTerminal)
            tokensFilledWithTerminals = nodeTokens[firstTerminalPos: lastTerminalPos + 1]
            internalNonTerminals = [t for t in tokensFilledWithTerminals if t < 0]
            if internalNonTerminals:
                phraseDependencies[node] = []
                for nonTerminal in internalNonTerminals:
                    pos = nodeTokens.index(nonTerminal)
                    # The area spanned by the internal non-terminal is the
                    # gap between its neighbouring terminals.
                    previousTerminal = nodeTokens[pos - 1]
                    nextTerminal = nodeTokens[pos + 1]
                    area = (previousTerminal + 1, nextTerminal - 1)
                    phraseDependencies[node].append(area)
        return phraseDependencies

    def buildPhraseDerivations(self, borders, dependencies):
        """Build a map indicating phrase derivation relationships. For
        each phrase it generates all derivations (possibly derivations
        of derivations): any phrase whose border lies inside a dependent
        area of another phrase.

        @param borders: map node id -> (leftBorder, rightBorder)
        @param dependencies: map node id -> [area, ...]
        @return: [ (derived phrase, deriving phrase), ... ]
        """
        phraseDerivations = []
        for node, dependentAreas in dependencies.items():
            for area in dependentAreas:
                leftAreaBorder, rightAreaBorder = area
                for childNode, childNodeBorder in borders.items():
                    leftBorder, rightBorder = childNodeBorder
                    if leftBorder >= leftAreaBorder and rightBorder <= rightAreaBorder:
                        phraseDerivations.append((childNode, node))
        return phraseDerivations

    def buildPhraseHeadWords(self, sense):
        """For each phrase, determine the head word for it.

        NOTE(review): not implemented -- the body is only this docstring,
        so calling it returns None.
        @type sense: SenseTree
        """

    def buildIntrinsicCoverageSourceMap(self, sense, phraseCoverages):
        """Build a map saving the intrinsic source (token/coverage mix)
        for each phrase's coverage area.

        @param sense: parsed tree
        @type sense: SenseTree
        @param phraseCoverages: map node id -> (left, right)
        @return: map area -> [token id or child coverage tuple, ...]
        """
        intrinsicCoverageSourceMap = {}
        for node in sense.tree.nodes:
            if node not in phraseCoverages:
                continue
            area = phraseCoverages[node]
            # Build source.
            source = []
            for token in sense.tree.nodes[node]:
                if token > 0:
                    # Terminal token.
                    source.append(token)
                else:
                    # Non-terminal token: substitute the child's coverage.
                    source.append(phraseCoverages[-token])
            intrinsicCoverageSourceMap[area] = source
        return intrinsicCoverageSourceMap

    def chiropracticTranslation(self, sense):
        """Do chiropractic translation for a given phrase combination:
        decode each dependent area from short to long, then the full
        sentence area.

        @type sense: SenseTree
        @return: N-best hypothesis list for the full sentence area
        """
        phraseBorders = self.buildPhraseBorders(sense)
        phraseDependencies = self.buildPhraseDependencies(sense, phraseBorders)
        phraseDerivations = self.buildPhraseDerivations(phraseBorders, phraseDependencies)
        phraseCoverages = self.buildPhraseCoverages(sense)
        phraseClosedTokens = self.buildPhraseClosedTokens(sense, phraseBorders)
        intrinsicCoverageSourceMap = self.buildIntrinsicCoverageSourceMap(sense, phraseCoverages)
        intrinsicCoverageMap = self.buildIntrinsicCoverageMap(sense, phraseCoverages)
        # phraseHeadWords = sense.mapNodeToMainToken
        # For each area from short to long do forest decoding.
        areas = set()
        # NOTE(review): relies on Python 2's eager map(); under Python 3
        # this lazy map would never run and areas would stay empty.
        map(areas.update, phraseDependencies.values())
        fullArea = (1, len(sense.tokens))
        areas.add(fullArea)
        # Sort areas by length; an area's border begins with 1.
        areas = list(areas)
        areas.sort(key=lambda x: x[1] - x[0])
        hypStacks = {}
        areaTags = {}
        maxTokenId = len(sense.tokens)
        for area in areas:
            self.disableInpossibleCells(hypStacks, area, maxTokenId)
            self.buildHypothesisesForArea(sense, hypStacks, area, areaTags,
                                          phraseBorders, phraseDerivations, intrinsicCoverageMap,
                                          phraseClosedTokens, intrinsicCoverageSourceMap)
        assert fullArea in hypStacks
        print hypStacks[fullArea][0].translation
        return hypStacks[fullArea][:setting.nbest]

    def disableInpossibleCells(self, hypStacks, area, maxTokenId):
        """As the area is going to be decoded separately, any upper cell
        that includes only part of this area should be disabled
        (assigned an empty hypothesis stack).

        -----------        -----------
        |         |        |         |
        | | | | | |        | |x|x| | |
        -----------        -----------
         |o|o|o| |          |o|o|o| |
         ---------    =>    ---------
          |o|o| |            |o|o|x|
          -------            -------
           |o| |              |o|x|
           -----              -----
            | |                | |
            ---                ---

        NOTE(review): method name typo -- "Inpossible" for "Impossible";
        kept because it is part of the class's interface.

        @param hypStacks: map area -> [hypothesis, ...]
        @param area: (leftBorder, rightBorder)
        @param maxTokenId: highest token id in the sentence
        @return: None (mutates hypStacks)
        """
        leftBorder, rightBorder = area
        # If length is 1, do nothing.
        if leftBorder == rightBorder:
            return
        # Cells that start before the area but end inside it.
        for right in range(leftBorder, rightBorder):
            for left in range(1, leftBorder):
                hypStacks[(left, right)] = []
        # Cells that start inside the area but end after it.
        for left in range(leftBorder + 1, rightBorder + 1):
            for right in range(rightBorder + 1, maxTokenId + 1):
                hypStacks[(left, right)] = []

    def enumerateBasePhrasesForArea(self, area, phraseBorders, phraseDerivations):
        """Enumerate all base phrases for a given area; a base phrase
        should not depend on (be derived from) another base phrase.

        @param area: (leftBorder, rightBorder)
        @param phraseBorders: map node id -> (left, right)
        @param phraseDerivations: [(derived, deriving), ...]
        @return: base phrase ids sorted by word order
        """
        leftAreaBorder, rightAreaBorder = area
        candidates = [phrase for phrase in phraseBorders
                      if phraseBorders[phrase][0] >= leftAreaBorder and phraseBorders[phrase][1] <= rightAreaBorder]
        # Drop phrases that are derived from another candidate.
        for dependentPhrase, phrase in phraseDerivations:
            if dependentPhrase in candidates and phrase in candidates:
                candidates.remove(dependentPhrase)
        # Sort in the order of words.
        candidates.sort(key=lambda p: phraseBorders[p][0])
        return candidates

    def listHeadPhrases(self, phraseGroup, phraseBorders, phraseCoverages):
        """List possible head phrases from a phrase group; if more
        phrases than the limit are found, prune with coverage similarity
        (phrases whose coverage length is closest to the group's span win).

        @param phraseGroup: [phrase id, ...] in word order
        @param phraseBorders: map node id -> (left, right)
        @param phraseCoverages: map node id -> (left, right)
        @return: at most setting.head_phrases_limit phrase ids
        """
        candidates = list(phraseGroup)
        if len(candidates) <= setting.head_phrases_limit:
            return candidates
        coverageLength = phraseBorders[phraseGroup[-1]][1] - phraseBorders[phraseGroup[0]][0]
        candidates.sort(key=lambda p: abs(phraseCoverages[p][1] - phraseCoverages[p][0] - coverageLength))
        return candidates[:setting.head_phrases_limit]

    def buildSource(self, phraseGroup, phraseBorders, phraseClosedTokens, headPhrase):
        """Generate a source for the given head phrase in the situation
        of the given phrase group: left-sibling area + head phrase tokens
        + right-sibling area.

        @param phraseGroup: [phrase id, ...] in word order
        @param phraseBorders: map node id -> (left, right)
        @param phraseClosedTokens: map node id -> [token, ...]
        @param headPhrase: phrase id chosen as head
        @return: source (mixed list of token ids and area tuples)
        """
        source = []
        headPhrasePosition = phraseGroup.index(headPhrase)
        # Add left part.
        if headPhrasePosition > 0:
            source.append((phraseBorders[phraseGroup[0]][0],
                           phraseBorders[phraseGroup[headPhrasePosition - 1]][1],))
        # Add part of head phrase.
        for pos, token in enumerate(phraseClosedTokens[headPhrase]):
            if token > 0:
                # Terminals.
                source.append(token)
            else:
                # Non-terminals, add dependent coverage.
                # NOTE(review): this indexes the phraseClosedTokens DICT by
                # position; it almost certainly means
                # phraseClosedTokens[headPhrase][pos-1] / [pos+1] (compare
                # buildPhraseDependencies). As written it raises TypeError
                # (list + int) whenever a non-terminal is hit -- confirm
                # whether this path is ever exercised.
                source.append((phraseClosedTokens[pos - 1] + 1, phraseClosedTokens[pos + 1] - 1))
        # Add right part.
        if headPhrasePosition < len(phraseGroup) - 1:
            source.append((phraseBorders[phraseGroup[headPhrasePosition + 1]][0],
                           phraseBorders[phraseGroup[len(phraseGroup) - 1]][1],))
        return source

    def selectTopHypClusterCombinations(hypStacks, dependentCells, limit):
        """Assume all dependent cells have hypotheses. Select hypothesis
        cluster combinations from the given dependent cells with a limit.
        Actually this algorithm is a simple cube pruning. Each hyp stack
        should have a list under key "TAGS" saving possible tags sorted by
        highest hyp score.

        NOTE(review): missing `self` parameter -- calling this through an
        instance would bind hypStacks to the instance. Looks unused in
        this file; confirm before relying on it.

        @param hypStacks: map cell -> {'TAGS': [...], ...}
        @param dependentCells: [cell, ...]
        @param limit: max number of combinations to return
        @return: [combination, ...] where each combination is a tag list
        """
        combinations = []
        # Seed with the top tag of every dependent cell.
        combinations.append([hypStacks[c]['TAGS'][0] for c in dependentCells])
        # Initiate indicator.
        length = len(dependentCells)
        indicator = [0] * length
        hypStackLengths = [len(hypStacks[c]['TAGS']) for c in dependentCells]
        fullfilled = False
        while True:
            # Positions whose indicator can still advance.
            possiblePositions = [p for p in range(length) if indicator[p] < hypStackLengths[p] - 1]
            for combLength in range(1, len(possiblePositions) + 1):
                for posPlusComb in itertools.combinations(possiblePositions, combLength):
                    combination = []
                    for pos in range(length):
                        if pos in posPlusComb:
                            combination.append(hypStacks[dependentCells[pos]]['TAGS'][indicator[pos] + 1])
                        else:
                            combination.append(hypStacks[dependentCells[pos]]['TAGS'][indicator[pos]])
                    # NOTE(review): debug prints left in a hot loop.
                    print "[C]", combination
                    combinations.append(combination)
                    if len(combinations) >= limit:
                        fullfilled = True
                        break
                if fullfilled:
                    break
            if fullfilled:
                break
            # Accumulate all sub indicators.
            didAccumulation = False
            for i in range(length):
                if indicator[i] < hypStackLengths[i] - 1:
                    indicator[i] += 1
                    didAccumulation = True
            print indicator, hypStackLengths
            if not didAccumulation:
                # No more combinations.
                break
        return combinations

    def generateSources(self, phraseGroup, phraseClosedTokens, phraseBorders):
        """Generate candidate sources for a phrase group by enumerating
        bit patterns: a 1-bit keeps the phrase's closed tokens (terminal
        phrase), a 0-bit collapses it into a non-terminal area; adjacent
        non-terminal areas are merged.

        @param phraseGroup: [phrase id, ...] in word order
        @param phraseClosedTokens: map node id -> [token, ...]
        @param phraseBorders: map node id -> (left, right)
        @return: [source, ...] (mixed token ids and area tuples)
        """
        combinedPhraseLimit = 3
        pruningSourceLimit = 1000
        # Generate bit patterns.
        bitPatterns = []
        for combinedPhraseLength in range(1, combinedPhraseLimit + 1):
            if len(bitPatterns) >= pruningSourceLimit:
                break
            # A run of combinedPhraseLength consecutive 1-bits.
            bitPattern = sum([2 ** n for n in range(combinedPhraseLength)])
            for leftmoveLength in range(len(phraseGroup) - combinedPhraseLength + 1):
                if len(bitPatterns) >= pruningSourceLimit:
                    break
                bitPatterns.append(bitPattern << leftmoveLength)
        # Generate sources.
        sourcesWithPhrase = []
        rangePhrasePositions = range(len(phraseGroup))
        for bitPattern in bitPatterns:
            rawSource = []
            # phrases = []
            for iPhrase in rangePhrasePositions:
                if (bitPattern >> iPhrase) % 2:
                    # In this bit, the pattern holds 1, means terminal phrase.
                    rawSource.extend(phraseClosedTokens[phraseGroup[iPhrase]])
                    # phrases.append(phraseGroup[iPhrase])
                else:
                    # In this bit, the pattern holds 0, means one of non-terminal.
                    rawSource.append(-phraseGroup[iPhrase])
            # Convert phrase id to area, merging adjacent non-terminals.
            currentNT = None
            source = []
            for pos in range(len(rawSource)):
                if rawSource[pos] < 0:
                    # This position holds a non-terminal.
                    if not currentNT:
                        currentNT = phraseBorders[-rawSource[pos]]
                    else:
                        currentNT = (currentNT[0], phraseBorders[-rawSource[pos]][1])
                else:
                    # This position holds a terminal.
                    if currentNT:
                        source.append(currentNT)
                        currentNT = None
                    source.append(rawSource[pos])
            if currentNT:
                source.append(currentNT)
            sourcesWithPhrase.append(source)
        return sourcesWithPhrase

    def buildSourceString(self, sense, areaTags, source):
        """Convert a source to a string; returns None when any dependent
        area has no tag yet.

        @param sense: hierarchical phrase tree
        @type sense: SenseTree
        @param areaTags: map area -> tag string
        @param source: mixed list of token ids and area tuples
        @return: source string or None
        """
        stringParts = []
        for part in source:
            if isinstance(part, tuple):
                # An area for non-terminals.
                if part not in areaTags:
                    return None
                stringParts.append("[%s]" % areaTags[part])
            else:
                # A token id for terminals.
                stringParts.append(sense.tokens[part - 1][1])
        return " ".join(stringParts)

    def buildIntrinsicCoverageMap(self, sense, phraseCoverages):
        """Map each phrase's coverage to the coverages of its children.

        @param sense: SenseTree
        @param phraseCoverages: map node id -> (left, right)
        @return: map coverage -> [child coverage, ...]
        """
        intrinsicCoverageMap = {}
        for node in sense.tree.nodes:
            if node not in phraseCoverages:
                continue
            # NOTE(review): the guard tests `node` but the map is keyed by
            # coverage tuples, so it is always True; two nodes sharing a
            # coverage reset each other's child list. Probably should test
            # `phraseCoverages[node] not in intrinsicCoverageMap` -- confirm.
            if node not in intrinsicCoverageMap:
                intrinsicCoverageMap[phraseCoverages[node]] = []
            childNodes = sense.tree.children(node)
            for childNode in childNodes:
                if childNode not in phraseCoverages:
                    continue
                intrinsicCoverageMap[phraseCoverages[node]].append(phraseCoverages[childNode])
        return intrinsicCoverageMap

    def getSubTreeDistance(self, intrinsicCoverageMap, area, dependentAreas):
        """Build sub-tree distance for a given source. Currently this
        implementation uses coverage matching: how many dependent areas
        coincide with the parse tree's intrinsic child coverages.

        @param intrinsicCoverageMap: map coverage -> [child coverage, ...]
        @param area: (left, right)
        @param dependentAreas: [area, ...]
        @return: (branches, matchedBranches)
        """
        branches = len(dependentAreas)
        matchedBranches = 0
        if area in intrinsicCoverageMap:
            intrinsicDependentCoverages = intrinsicCoverageMap[area]
            for dependentArea in dependentAreas:
                if dependentArea in intrinsicDependentCoverages:
                    matchedBranches += 1
        return branches, matchedBranches

    def taggingFunction(self, sense, phraseGroup):
        """Generate a static tag for a given phrase group: the POS tag of
        the main token of the shallowest (closest-to-root) phrase.

        @param sense: SenseTree
        @type sense: SenseTree
        @param phraseGroup: [phrase id, ...]
        @return: tag string
        """
        headPhrase = phraseGroup[0]
        minLevel = 999
        # Scan right-to-left; <= keeps the leftmost phrase on ties.
        for iPhrase in range(len(phraseGroup) - 1, -1, -1):
            phrase = phraseGroup[iPhrase]
            if phrase not in sense.mapNodeLevel:
                continue
            level = sense.mapNodeLevel[phrase]
            if level <= minLevel:
                minLevel = level
                headPhrase = phrase
        mainToken = sense.mapNodeToMainToken[headPhrase]
        return sense.tokens[mainToken - 1][0]

    def buildHypothesisesForArea(self, sense, hypStacks, area, areaTags, phraseBorders,
                                 phraseDerivations, intrinsicCoverageMap, phraseClosedTokens,
                                 intrinsicCoverageSourceMap):
        """Build hypothesis stacks in a given area with a phrase-level CYK.

        @param sense: SenseTree
        @param hypStacks: hypothesis stacks, map area -> [hypothesis, ...]
        @param area: (low token id, high token id)
        @param areaTags: map area -> tag string (mutated here)
        @param phraseBorders: map node -> (left border, right border)
        @param phraseDerivations: list (child node, yielding node)
        @param intrinsicCoverageMap: map coverage -> [child coverage, ...]
        @param phraseClosedTokens: map node -> [token, ...]
        @param intrinsicCoverageSourceMap: map coverage -> intrinsic source
        @return: None (mutates hypStacks and areaTags)
        """
        basePhrases = self.enumerateBasePhrasesForArea(area, phraseBorders, phraseDerivations)
        # Assume all base phrases fulfill the area.
        # print [phraseBorders[p] for p in basePhrases]
        # Decode in phrase CYK.
        for phraseCount in range(1, len(basePhrases) + 1):
            for beginPosition in range(0, len(basePhrases) - phraseCount + 1):
                phraseGroup = basePhrases[beginPosition: beginPosition + phraseCount]
                # Rebind `area` to the span of this phrase group (shadows
                # the parameter from here on).
                area = (phraseBorders[phraseGroup[0]][0], phraseBorders[phraseGroup[-1]][1])
                generatedSources = self.generateSources(phraseGroup, phraseClosedTokens, phraseBorders)
                finalHyps = []
                intrinsicSource = None
                intrinsicSourceString = None
                triedIntrinsicSource = False
                if area in intrinsicCoverageSourceMap:
                    intrinsicSource = intrinsicCoverageSourceMap[area]
                    intrinsicSourceString = self.buildSourceString(sense, areaTags, intrinsicSource)
                    generatedSources.append(intrinsicSource)
                # Fetch rules and decode.
                # For all sources try exact matching.
                for source in generatedSources:
                    dependentAreas = [p for p in source if isinstance(p, tuple)]
                    # Check dependent hypotheses.
                    missingSupport = False
                    for dependentArea in dependentAreas:
                        if dependentArea not in hypStacks:
                            missingSupport = True
                            break
                    if missingSupport:
                        continue
                    # Fetch rule.
                    sourceString = self.buildSourceString(sense, areaTags, source)
                    if not sourceString:
                        continue
                    # Process the intrinsic source at most once.
                    if sourceString == intrinsicSourceString:
                        if triedIntrinsicSource:
                            continue
                        else:
                            triedIntrinsicSource = True
                    # Fetch exactly matched rules.
                    exactlyMatchedRules = self.rulefetcher.findRulesBySourceString(sourceString, dependentAreas)
                    hyps = None
                    # If this source is an intrinsic source then try to
                    # reconstruct or build depraved rules; otherwise skip
                    # sources without exact matches.
                    if not exactlyMatchedRules and sourceString != intrinsicSourceString:
                        continue
                    subTreeDistance = self.getSubTreeDistance(intrinsicCoverageMap, area, dependentAreas)
                    if not exactlyMatchedRules:
                        # In this case, here we got an intrinsic rule covering the
                        # same area in the parse tree. We should not allow this
                        # rule to be kicked off, so use reconstruction or a
                        # depraved glue rule.
                        depravedReconstruction = len(source) > 12
                        # Need reconstruction.
                        reconstructor = Reconstructor(self.ruletable, self.model, sense, area,
                                                      sourceString, subTreeDistance, hypStacks,
                                                      source, areaTags, dependentAreas,
                                                      depravedReconstruction)
                        hyps = reconstructor.parse()
                    else:
                        # Got some rules; use normal cube pruning to get hypotheses.
                        pruner = CubePruner(self.model, area, sourceString, subTreeDistance,
                                            exactlyMatchedRules, dependentAreas, hypStacks)
                        hyps = pruner.prune()
                    finalHyps.extend(hyps)
                if not finalHyps and area in intrinsicCoverageSourceMap:
                    # NOTE(review): debugger trap left in production code --
                    # decoding an uncovered intrinsic area halts the process.
                    import pdb; pdb.set_trace()
                if finalHyps:
                    hypStacks[area] = finalHyps[:setting.size_beam]
                    areaTags[area] = self.taggingFunction(sense, phraseGroup)