def build_branch(self, rule):
    """Build the one-level subtree that a single grammar rule expands to.

    A ``UnaryRule`` yields ``parent -> child``; a ``BinaryRule`` yields
    ``parent -> left_child right_child``. The children are bare ``Tree``
    nodes built from the rule's child labels.

    Returns:
        The new ``Tree``, or ``None`` when ``rule`` is neither a
        ``UnaryRule`` nor a ``BinaryRule``.
    """
    if isinstance(rule, UnaryRule):
        branches = [Tree(rule.child)]
    elif isinstance(rule, BinaryRule):
        branches = [Tree(rule.left_child), Tree(rule.right_child)]
    else:
        # Unknown rule kinds produce no tree at all.
        return None
    return Tree(rule.parent, branches)
def back_trace(self, back, label, i, j):
    """Rebuild the subtree for ``label`` over span ``(i, j)`` from back-pointers.

    Args:
        back: table indexed as ``back[i][j][label]`` holding back-pointers.
        label: label whose subtree should be reconstructed.
        i: start index of the span.
        j: end index of the span.

    Returns:
        A ``Tree`` rooted at ``label``. When ``label`` has no entry in
        ``back[i][j]`` it is treated as a leaf, and a leading ``"#"`` is
        stripped from its label.

    Raises:
        ValueError: if the stored pointer is neither a 3-tuple
            ``(split, left, right)`` nor a ``str`` (unary child label).
    """
    cell = back[i][j]
    if label not in cell:
        # No back-pointer recorded: this label is a leaf of the parse.
        if label.startswith("#"):
            return Tree(label[1:])
        return Tree(label)

    pointer = cell[label]
    if isinstance(pointer, tuple) and len(pointer) == 3:
        # Binary expansion: recurse into both halves of the span.
        split, left, right = pointer
        children = [
            self.back_trace(back, left, i, split),
            self.back_trace(back, right, split, j),
        ]
    elif isinstance(pointer, str):
        # Unary expansion: the child covers the same span.
        children = [self.back_trace(back, pointer, i, j)]
    else:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise ValueError("Invalid Pointer")
    return Tree(label, children)
def annotate_tree(cls, unannotated_tree):
    """Return a copy of the tree whose binary children carry sibling labels.

    For a node with exactly two children, each child is relabelled
    ``<own base label>^<sibling base label>`` (the base label is the part
    before the first ``^``) and then annotated recursively. A single child
    is kept as-is, without recursion.

    NOTE(review): a node with more than two children comes back with no
    children at all -- presumably the input is already binarized; confirm
    against the caller.
    """
    # TODO: change the annotation to a finite-order markov process (try at
    # least 1st and 2nd order); mark nodes with the label of their parent
    # nodes, giving a second order vertical markov process
    label = unannotated_tree.label
    children = unannotated_tree.children
    new_children = []
    if len(children) == 1:
        new_children.append(children[0])
    elif len(children) == 2:
        for position, child in enumerate(children):
            sibling = children[1 - position]
            own_base = child.label.split("^")[0]
            sibling_base = sibling.label.split("^")[0]
            new_label = own_base + '^' + sibling_base
            # The original referenced an undefined name ``child`` here; the
            # subtree being relabelled is the current child.
            new_children.append(
                TreeAnnotations.annotate_tree(
                    Tree(new_label, child.children)))
    return Tree(label, new_children)
def buildTree(self,left_pos,right_pos,label):
    """Reconstruct the subtree for ``label`` over ``[left_pos, right_pos)``.

    The entry ``self.back[left_pos][right_pos][label]`` decides the shape:

    * ``float``  -- ``label`` sits directly over the word
      ``self.sentence[left_pos]``.
    * ``tuple``  -- binary expansion, read as
      ``(split, left_label, right_label)``.
    * anything else -- unary expansion; the entry is the child label over
      the same span.
    """
    entry = self.back[left_pos][right_pos][label]

    if isinstance(entry, float):
        word_leaf = Tree(self.sentence[left_pos], [])
        return Tree(label, [word_leaf])

    if isinstance(entry, tuple):
        left_subtree = self.buildTree(left_pos, entry[0], entry[1])
        right_subtree = self.buildTree(entry[0], right_pos, entry[2])
        return Tree(label, [left_subtree, right_subtree])

    only_child = self.buildTree(left_pos, right_pos, entry)
    return Tree(label, [only_child])
def strip_leaves(self, tree):
    """Return a copy of ``tree`` with its leaf nodes removed.

    Preterminals become childless nodes, a bare leaf yields ``None``, and
    every other node is rebuilt from its stripped children.
    """
    if tree.is_leaf():
        return None
    if tree.is_preterminal():
        return Tree(tree.label)
    stripped = [self.strip_leaves(subtree) for subtree in tree.children]
    return Tree(tree.label, stripped)
def binarize_tree(cls, tree):
    """Return a binarized copy of ``tree``.

    Leaves are copied, a unary node keeps its single (binarized) child, and
    every other node is expanded through
    ``TreeAnnotations.binarize_tree_helper`` using intermediate labels that
    start with ``"@<label>->"``.
    """
    label = tree.label
    if tree.is_leaf():
        return Tree(label)

    kids = tree.children
    if len(kids) == 1:
        only_child = TreeAnnotations.binarize_tree(kids[0])
        return Tree(label, [only_child])

    # The helper returns a node labelled with the intermediate label; only
    # its children are kept, re-rooted under the real label.
    chain = TreeAnnotations.binarize_tree_helper(
        tree, 0, "@%s->" % label)
    return Tree(label, chain.children)
def get_best_parse(self, sentence): """ Should return a Tree. 'sentence' is a list of strings (words) that form a sentence. """ # TODO: implement this method nonterms = self.lexicon.get_all_tags() score = collections.defaultdict(lambda:collections.defaultdict(lambda:0)) back = collections.defaultdict(lambda:collections.defaultdict(lambda:[])) for i,w in enumerate(sentence): for A in nonterms: prob = self.lexicon.score_tagging(w,A) if prob > 0: score[(i,i+1)][A] = prob added = True while added: added = False for j in range(len(nonterms)-1): A = nonterms[j] B = nonterms[j+1] prob = self.lexicon.score_tagging(B,A) * score[(i,i+1)][B] if prob > score[(i,i+1)][A]: score[(i,i+1)][A] = prob back[(i,i+1)][A] = B added = True for span in range(2,len(sentence)): for begin in range(len(sentence)-span): end = begin + span for split in range(begin+1,end-1): for i in range(len(nonterms)-2): A = nonterms[i] B = nonterms[i+1] C = nonterms[i+2] prob = score[(begin,end)][A] * score[(split,end)][C] * self.lexicon.score_tagging(A,B) if prob > score[(begin,end)][A]: score[(begin,end)][A] = prob back[(begin,end)][A] = (split,B,C) added = True while added: added = False for j in range(len(nonterms)-1): A = nonterms[j] B = nonterms[j+1] prob = self.lexicon.score_tagging(A,B) * score[(begin,end)][B] if prob > score[(begin,end)][A]: score[(begin,end)][A] = prob back[(begin,end)][A] = B added = True return Tree("ROOT",[Tree("Test",[])])
def binarize_tree(cls, tree):
    """Return a binarized copy of ``tree``.

    Leaves are copied; nodes with at most two children keep their arity
    and only have their children binarized. Wider nodes are expanded
    through ``TreeAnnotations.binarize_tree_helper`` with intermediate
    labels that start with ``'@<label>->'``.
    """
    label = tree.label
    if tree.is_leaf():
        return Tree(label)

    if len(tree.children) <= 2:
        binarized = []
        for subtree in tree.children:
            binarized.append(TreeAnnotations.binarize_tree(subtree))
        return Tree(label, binarized)

    # Re-root the helper's chain under the real label.
    chain = TreeAnnotations.binarize_tree_helper(
        tree, 0, '@%s->' % label)
    return Tree(label, chain.children)
def merge(self, left_tree, right_tree):
    """Join two trees under the highest-valued label for their combined span.

    The span is the summed length of both trees' yields. The label is the
    key of ``self.span_to_categories[span]`` with the largest value; on
    ties the first such key in iteration order wins.
    """
    span = len(left_tree.get_yield()) + len(right_tree.get_yield())
    counts = self.span_to_categories[span]
    # max() keeps the first maximal key, matching a first-match scan.
    most_freq_label = max(counts, key=lambda category: counts[category])
    return Tree(most_freq_label, [left_tree, right_tree])
def make_tree(begin, end, A, depth=0):
    """Rebuild the subtree for rule ``A`` over ``[begin, end)``.

    Reads ``back[begin][end][A]`` from the enclosing scope:

    * empty         -- ``A.parent`` directly over ``A.child``;
    * one element   -- unary: ``[B]`` over the same span;
    * three elements -- binary: ``[split, B, C]``.

    ``depth`` is only passed along to recursive calls. Returns ``None``
    for any other back-pointer length.
    """
    backptrs = back[begin][end][A]
    tag = A.parent

    if not backptrs:
        return Tree(tag, [Tree(A.child)])

    arity = len(backptrs)
    if arity == 1:
        (B,) = backptrs
        only_child = make_tree(begin, end, B, depth + 1)
        return Tree(tag, [only_child])
    if arity == 3:
        split, B, C = backptrs
        left_subtree = make_tree(begin, split, B, depth + 1)
        right_subtree = make_tree(split, end, C, depth + 1)
        return Tree(tag, [left_subtree, right_subtree])
    return None
def binarize_tree(cls, tree, suffix):
    """Return a binarized copy of ``tree`` with ``suffix`` added to its label.

    Each non-leaf node is labelled ``tree.label + suffix``, and its children
    are processed with the suffix ``"^<tree.label>"``. Leaves keep their
    plain label. Nodes with more than two children are expanded through
    ``TreeAnnotations.binarize_tree_helper``.
    """
    label = tree.label + suffix
    nextsuffix = "^%s" % tree.label

    if tree.is_leaf():
        return Tree(tree.label)

    if len(tree.children) in (1, 2):
        # Arity is already fine: just recurse into each child in order.
        kids = [
            TreeAnnotations.binarize_tree(subtree, nextsuffix)
            for subtree in tree.children
        ]
        return Tree(label, kids)

    chain = TreeAnnotations.binarize_tree_helper(
        tree, 0, "@%s->" % label)
    return Tree(label, chain.children)
def binarize_tree_helper(cls, tree, num_children_generated,
                         intermediate_label):
    """Build the right-branching chain for children from a given index on.

    The node returned is labelled ``intermediate_label``. Its first child
    is the binarized child at ``num_children_generated``; if more children
    remain, its second child is the chain for the rest, whose label is
    extended with ``"_" + <that child's label>``.
    """
    current = tree.children[num_children_generated]
    branches = [TreeBinarization.binarize_tree(current)]

    has_more = num_children_generated < len(tree.children) - 1
    if has_more:
        branches.append(
            TreeBinarization.binarize_tree_helper(
                tree, num_children_generated + 1,
                intermediate_label + "_" + current.label))
    return Tree(intermediate_label, branches)
def build(self, rule):
    """Recursively expand ``rule`` into a ``Tree`` by following rule origins.

    A rule whose ``origin`` is ``(None, None)`` is printed to stdout and
    expanded one level with ``self.build_branch``. Otherwise the child
    rules are looked up with ``self.search_rule(origin, tag)`` and built
    recursively:

    * ``UnaryRule``  -- ``origin`` is passed whole, with ``rule.child``.
    * ``BinaryRule`` -- ``origin[0]`` / ``origin[1]`` are paired with
      ``rule.left_child`` / ``rule.right_child``.

    Returns:
        The built ``Tree``, or ``None`` when a rule with an origin is
        neither a ``UnaryRule`` nor a ``BinaryRule``.
    """
    origin = rule.origin
    if origin == (None, None):
        # Call form of print behaves the same on Python 2 and Python 3.
        print(str(rule))
        return self.build_branch(rule)

    if isinstance(rule, UnaryRule):
        next_rule = self.search_rule(origin, rule.child)
        return Tree(rule.parent, [self.build(next_rule)])

    if isinstance(rule, BinaryRule):
        left_origin, right_origin = origin[0], origin[1]
        next_left_rule = self.search_rule(left_origin, rule.left_child)
        next_right_rule = self.search_rule(right_origin, rule.right_child)
        return Tree(rule.parent,
                    [self.build(next_left_rule),
                     self.build(next_right_rule)])

    return None
def binarize(self, tree):
    """Binarize ``tree`` in place so that no node keeps more than two children.

    A node with more than two children keeps its first child; the rest are
    moved under a new node labelled ``"@<label>_<first child label>"``.
    The (possibly rewritten) children are then processed recursively.
    Returns ``None``.
    """
    if tree.is_leaf():
        return

    kids = tree.children
    if len(kids) > 2:
        head = kids[0]
        remainder = Tree("@%s_%s" % (tree.label, head.label), kids[1:])
        tree.children = [head, remainder]

    for subtree in tree.children:
        self.binarize(subtree)
def binarize_tree_helper(cls, tree, num_children_generated,
                         intermediate_label):
    """Build the right-branching chain for children from a given index on.

    The node returned is labelled ``intermediate_label`` and always has two
    children. The first is the binarized child at
    ``num_children_generated``. The second is either the chain for the
    remaining children (when at least two more follow) or the binarized
    final child. Children are binarized with the suffix
    ``"^<tree.label>"``.
    """
    current = tree.children[num_children_generated]
    nextsuffix = "^%s" % tree.label
    branches = [TreeAnnotations.binarize_tree(current, nextsuffix)]

    if num_children_generated < len(tree.children) - 2:
        # More than one child still to place: keep chaining.
        tail = TreeAnnotations.binarize_tree_helper(
            tree, num_children_generated + 1,
            intermediate_label + "_" + current.label)
    else:
        # Exactly one child left: it becomes the right branch directly.
        final_child = tree.children[num_children_generated + 1]
        tail = TreeAnnotations.binarize_tree(final_child, nextsuffix)

    branches.append(tail)
    return Tree(intermediate_label, branches)
def build_tag_tree(self, words, tags, cur_position):
    """Return the tree ``tags[cur_position] -> words[cur_position]``."""
    word_leaf = Tree(words[cur_position])
    return Tree(tags[cur_position], [word_leaf])
def add_root(self, tree):
    """Wrap ``tree`` as the only child of a new node labelled "ROOT"."""
    rooted = Tree("ROOT", [tree])
    return rooted
def get_best_parse(self, sentence):
    """
    Should return a Tree.
    'sentence' is a list of strings (words) that form a sentence.

    Fills score[i][j][label] (best score, default 0.0) and
    back[i][j][label] (the Tree built for that label, default None) bottom
    up, then returns the un-annotated tree stored for 'ROOT' over the whole
    sentence.

    NOTE(review): if no 'ROOT' entry was built for the full span,
    back[0][wordN]['ROOT'] is the default None and that is what gets passed
    to TreeAnnotations.unannotate_tree.
    NOTE(review): the statement nesting was reconstructed from a
    whitespace-collapsed source -- confirm against the original layout.
    """
    # TODO: implement this method
    score = collections.defaultdict(lambda: \
            collections.defaultdict(lambda: \
            collections.defaultdict(lambda: 0.0)))
    back = collections.defaultdict(lambda: \
            collections.defaultdict(lambda: \
            collections.defaultdict(lambda: None)))
    tags = self.lexicon.get_all_tags()
    wordN = len(sentence)
    # Width-1 spans: tag -> word trees, then unary rules on top of them.
    i = 0
    for word in sentence:
        iplus = i + 1
        for tag in tags:
            prob = self.lexicon.score_tagging(word, tag)
            if prob > score[i][iplus][tag]:
                score[i][iplus][tag] = prob
                word_tree = Tree(word, [])
                tag_tree = Tree(tag, [word_tree])
                back[i][iplus][tag] = tag_tree
        # Keep applying unary rules until a full pass improves nothing.
        added = True
        while added:
            added = False
            for child in self.grammar.unary_rules_by_child:
                for rule in self.grammar.get_unary_rules_by_child(child):
                    prob = score[i][iplus][child] * rule.score
                    parent = rule.parent
                    if prob > score[i][iplus][parent]:
                        score[i][iplus][parent] = prob
                        child_tree = back[i][iplus][child]
                        back[i][iplus][parent] = Tree(parent, [child_tree])
                        added = True
        i = i + 1
    # Wider spans: try every split point and every binary rule.
    for span in range(2, wordN + 1):
        for begin in range(0, wordN + 1 - span):
            end = begin + span
            for split in range(begin + 1, end):
                for left_child in self.grammar.binary_rules_by_left_child:
                    for rule in self.grammar.get_binary_rules_by_left_child(
                            left_child):
                        right_child = rule.right_child
                        parent = rule.parent
                        prob = score[begin][split][left_child] *\
                                score[split][end][right_child] *\
                                rule.score
                        if prob > score[begin][end][parent]:
                            score[begin][end][parent] = prob
                            left_tree = back[begin][split][left_child]
                            right_tree = back[split][end][right_child]
                            back[begin][end][parent] = \
                                    Tree(parent, [left_tree,right_tree])
                            # NOTE(review): only unary rules whose child is
                            # `parent` are tried here; chains through
                            # `newparent` are not followed. The inner loop
                            # also rebinds the name `rule`.
                            added = True
                            while added:
                                added = False
                                if self.grammar.get_unary_rules_by_child(
                                        parent):
                                    rules = self.grammar.get_unary_rules_by_child(
                                        parent)
                                    for rule in rules:
                                        prob = score[begin][end][
                                            parent] * rule.score
                                        newparent = rule.parent
                                        if prob > score[begin][end][newparent]:
                                            score[begin][end][newparent] = prob
                                            tree = back[begin][end][parent]
                                            back[begin][end][newparent] = Tree(
                                                newparent, [tree])
                                            added = True
    return TreeAnnotations.unannotate_tree(back[0][wordN]['ROOT'])
def tree(self):
    """Return this node as a Tree: ``label`` over a single ``word`` leaf."""
    word_leaf = Tree(self.word)
    return Tree(self.label, [word_leaf])
def tree(self):
    """Return this node as a Tree over its left and right subtrees."""
    left_subtree = self.left.tree()
    right_subtree = self.right.tree()
    return Tree(self.label, [left_subtree, right_subtree])
def tree(self):
    """Return this node as a Tree whose only child is ``self.parent.tree()``."""
    only_child = self.parent.tree()
    return Tree(self.label, [only_child])
def add_root(self, tree):
    """Wrap ``tree`` as the only child of a new node labelled 'ROOT'."""
    rooted = Tree('ROOT', [tree])
    return rooted
def get_best_parse(self, sentence):
    """Return the best-scoring parse of ``sentence`` as a Tree.

    'sentence' is a list of strings (words) that form a sentence.

    Two (len+1) x (len+1) tables of per-span dictionaries are filled
    bottom-up: ``score[i][j][tag]`` holds the best score found for ``tag``
    over words i..j, and ``back[i][j][tag]`` holds how it was obtained --
    ``"#" + word`` for a tagged word, a child tag (``str``) for a unary
    rule, or ``(split, left_tag, right_tag)`` for a binary rule. The tree
    is then rebuilt with ``self.back_trace`` and passed through
    ``TreeAnnotations.unannotate_tree``.

    Returns:
        The un-annotated parse rooted at "ROOT". When no "ROOT" entry
        exists for the whole sentence, the fallback
        ``Tree("", [Tree("ROOT")])`` is returned instead.
    """
    length = len(sentence)
    score = [[collections.defaultdict(float) for _ in range(length + 1)]
             for _ in range(length + 1)]
    back = [[collections.defaultdict() for _ in range(length + 1)]
            for _ in range(length + 1)]

    def close_unaries(cell_score, cell_back):
        # Apply unary rules to one span cell until no score improves.
        added = True
        while added:
            added = False
            # Iterate over a snapshot: the score lookups below can insert
            # new keys into the defaultdict while we loop.
            for tag in list(cell_score):
                for unary_rule in self.grammar.get_unary_rules_by_child(tag):
                    prob = unary_rule.score * cell_score[tag]
                    if prob != 0 and prob > cell_score[unary_rule.parent]:
                        cell_score[unary_rule.parent] = prob
                        cell_back[unary_rule.parent] = tag
                        added = True

    # Terminal symbols: score every tag for every word.
    for i, word in enumerate(sentence):
        for tag in self.lexicon.get_all_tags():
            score[i][i + 1][tag] = self.lexicon.score_tagging(word, tag)
            back[i][i + 1][tag] = "#" + word
        close_unaries(score[i][i + 1], back[i][i + 1])

    # Nonterminal symbols: combine adjacent sub-spans with binary rules.
    for span in range(2, length + 1):
        for begin in range(length + 1 - span):
            end = begin + span
            cell_score = score[begin][end]
            cell_back = back[begin][end]
            for split in range(begin + 1, end):
                left = score[begin][split]
                right = score[split][end]
                for left_tag in left:
                    rules = self.grammar.get_binary_rules_by_left_child(
                        left_tag)
                    for binary_rule in rules:
                        right_tag = binary_rule.right_child
                        if right_tag not in right:
                            continue
                        prob = (left[left_tag] * right[right_tag]
                                * binary_rule.score)
                        if prob > cell_score[binary_rule.parent]:
                            cell_score[binary_rule.parent] = prob
                            cell_back[binary_rule.parent] = (
                                split, left_tag, right_tag)
            close_unaries(cell_score, cell_back)

    if "ROOT" not in back[0][length]:
        return Tree("", [Tree("ROOT")])
    result = self.back_trace(back, "ROOT", 0, length)
    return TreeAnnotations.unannotate_tree(result)