Exemplo n.º 1
0
 def build_branch(self, rule):
     """Build the one-level subtree described by a single grammar rule.

     A ``UnaryRule`` gives ``parent -> child`` and a ``BinaryRule`` gives
     ``parent -> left_child right_child``; each child node is constructed
     from its label alone.  Any other rule type yields ``None``.
     """
     if isinstance(rule, UnaryRule):
         child_labels = [rule.child]
     elif isinstance(rule, BinaryRule):
         child_labels = [rule.left_child, rule.right_child]
     else:
         return None
     return Tree(rule.parent, [Tree(name) for name in child_labels])
Exemplo n.º 2
0
    def back_trace(self, back, label, i, j):
        """Rebuild the subtree for ``label`` over span ``(i, j)`` from back pointers.

        ``back[i][j]`` maps a label to its pointer, which is either a
        ``(split, left, right)`` tuple (binary expansion: ``left`` covers
        ``(i, split)`` and ``right`` covers ``(split, j)``) or a string naming
        the single child of a unary expansion over the same span.  A label
        with no entry in ``back[i][j]`` is treated as a leaf; a leading
        ``"#"`` marker is dropped from a leaf label.

        Returns:
            A ``Tree`` labelled ``label``.

        Raises:
            ValueError: if a stored pointer has neither of the two shapes.
        """
        cell = back[i][j]
        if label not in cell:  # leaf
            # Slicing (rather than label[0]) also tolerates an empty label.
            if label[:1] == "#":
                return Tree(label[1:])
            return Tree(label)

        pointer = cell[label]
        if isinstance(pointer, tuple) and len(pointer) == 3:
            split, left, right = pointer
            children = [
                self.back_trace(back, left, i, split),
                self.back_trace(back, right, split, j),
            ]
        elif isinstance(pointer, str):
            children = [self.back_trace(back, pointer, i, j)]
        else:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError("Invalid Pointer")

        return Tree(label, children)
Exemplo n.º 3
0
    def annotate_tree(cls, unannotated_tree):
        """
        Return a new Tree with the same label whose children carry sibling
        annotations.

        - One child: the child is reused as is (it is not recursed into).
        - Two children: each child is relabelled ``<own>^<sibling>``, where
          ``<own>`` and ``<sibling>`` are the labels cut at their first
          ``^``, and the relabelled child is annotated recursively.
        - Any other number of children: the result has no children.
        """

        # TODO: change the annotation from a lossless binarization to a
        # finite-order markov process (try at least 1st and 2nd order)
        # mark nodes with the label of their parent nodes, giving a second
        # order vertical markov process

        label = unannotated_tree.label
        children = unannotated_tree.children
        new_children = []
        if len(children) == 1:
            new_children.append(children[0])
        elif len(children) == 2:
            # Drop any annotation already present after the first '^'.
            base_labels = [kid.label.split("^")[0] for kid in children]
            for n, kid in enumerate(children):
                new_label = base_labels[n] + '^' + base_labels[1 - n]
                # Recurse on this child's own children (the original code
                # referenced an undefined name here).
                new_children.append(
                    TreeAnnotations.annotate_tree(
                        Tree(new_label, kid.children)))
        return Tree(label, new_children)
Exemplo n.º 4
0
 def buildTree(self, left_pos, right_pos, label):
     """Rebuild the tree for ``label`` over ``(left_pos, right_pos)``.

     The entry in ``self.back`` for that cell and label decides the shape:
     a float marks a word-level node (child is the word at ``left_pos``),
     a tuple ``(split, left_label, right_label)`` marks a binary node, and
     anything else is taken as the label of a single child over the same
     span.
     """
     entry = self.back[left_pos][right_pos][label]
     if isinstance(entry, float):
         word_node = Tree(self.sentence[left_pos], [])
         return Tree(label, [word_node])
     if isinstance(entry, tuple):
         split = entry[0]
         left_subtree = self.buildTree(left_pos, split, entry[1])
         right_subtree = self.buildTree(split, right_pos, entry[2])
         return Tree(label, [left_subtree, right_subtree])
     only_subtree = self.buildTree(left_pos, right_pos, entry)
     return Tree(label, [only_subtree])
 def strip_leaves(self, tree):
     """Return a copy of ``tree`` without its leaves.

     A leaf maps to ``None``, a preterminal becomes a node built from its
     label alone, and every other node is rebuilt from the stripped
     versions of its children.
     """
     if tree.is_leaf():
         return None
     if tree.is_preterminal():
         return Tree(tree.label)
     stripped = [self.strip_leaves(kid) for kid in tree.children]
     return Tree(tree.label, stripped)
    def binarize_tree(cls, tree):
        """Return a binarized copy of ``tree``.

        Leaves are copied by label, single-child nodes recurse into that
        child, and every other node takes its children from the chain that
        ``TreeAnnotations.binarize_tree_helper`` builds under the
        intermediate label ``"@<label>->"``.
        """
        node_label = tree.label
        if tree.is_leaf():
            return Tree(node_label)

        kids = tree.children
        if len(kids) == 1:
            only_child = TreeAnnotations.binarize_tree(kids[0])
            return Tree(node_label, [only_child])

        chain = TreeAnnotations.binarize_tree_helper(
            tree, 0, "@%s->" % node_label)
        return Tree(node_label, chain.children)
Exemplo n.º 7
0
    def get_best_parse(self, sentence):
        """
        Should return a Tree.
        'sentence' is a list of strings (words) that form a sentence.

        Unfinished: the method fills the ``score`` and ``back`` charts but
        never reads them when producing its result; it always returns the
        placeholder tree ``ROOT -> Test``.
        """
        # TODO: implement this method
        nonterms = self.lexicon.get_all_tags()

        # Both charts are keyed by a (begin, end) span and then by a label.
        # Missing scores read as 0 and missing back pointers as [].
        score = collections.defaultdict(lambda:collections.defaultdict(lambda:0))
        back  = collections.defaultdict(lambda:collections.defaultdict(lambda:[]))
        for i,w in enumerate(sentence):
            # Seed the width-1 span for word i with every tag of nonzero score.
            for A in nonterms:
                prob = self.lexicon.score_tagging(w,A)
                if prob > 0:
                    score[(i,i+1)][A] = prob

            # Repeat until a pass makes no improvement.
            # NOTE(review): this pairs *adjacent* entries of `nonterms` and
            # passes a tag where L110-style calls pass a word to
            # score_tagging -- presumably a stand-in for a unary-rule
            # lookup; confirm against the grammar API.
            added = True
            while added:
                added = False
                for j in range(len(nonterms)-1):
                    A = nonterms[j]
                    B = nonterms[j+1]
                    prob = self.lexicon.score_tagging(B,A) * score[(i,i+1)][B]
                    if prob > score[(i,i+1)][A]:
                        score[(i,i+1)][A] = prob
                        back[(i,i+1)][A] = B
                        added = True
        # NOTE(review): range(2, len(sentence)) stops before the span that
        # covers the whole sentence, and range(begin+1, end-1) below never
        # tries split == end-1 -- verify whether that is intended.
        for span in range(2,len(sentence)):
            for begin in range(len(sentence)-span):
                end = begin + span
                for split in range(begin+1,end-1):
                    # Consecutive triples of `nonterms` play the role of
                    # (parent, left, right).
                    for i in range(len(nonterms)-2):
                        A = nonterms[i]
                        B = nonterms[i+1]
                        C = nonterms[i+2]


                        # NOTE(review): the first factor is the score of A
                        # over the *whole* (begin, end) span, i.e. the value
                        # being updated, not a left-child score over
                        # (begin, split) -- confirm.
                        prob = score[(begin,end)][A] * score[(split,end)][C] * self.lexicon.score_tagging(A,B)
                        if prob > score[(begin,end)][A]:
                            score[(begin,end)][A] = prob
                            back[(begin,end)][A] = (split,B,C)
                # Same improve-until-stable loop as for single words, here
                # applied to the (begin, end) cell.
                added = True
                while added:
                    added = False
                    for j in range(len(nonterms)-1):
                        A = nonterms[j]
                        B = nonterms[j+1]
                        prob = self.lexicon.score_tagging(A,B) * score[(begin,end)][B]
                        if prob > score[(begin,end)][A]:
                            score[(begin,end)][A] = prob
                            back[(begin,end)][A] = B
                            added = True
        # Placeholder result; the charts built above are not consulted.
        return Tree("ROOT",[Tree("Test",[])])
Exemplo n.º 8
0
    def binarize_tree(cls, tree):
        """Return a binarized copy of ``tree``.

        Leaves are copied by label.  Nodes with at most two children keep
        their arity and only have their children binarized.  Wider nodes
        take their children from the chain that
        ``TreeAnnotations.binarize_tree_helper`` builds under the
        intermediate label ``'@<label>->'``.
        """
        node_label = tree.label
        if tree.is_leaf():
            return Tree(node_label)

        if len(tree.children) > 2:
            chain = TreeAnnotations.binarize_tree_helper(
                tree, 0, '@%s->' % node_label)
            return Tree(node_label, chain.children)

        binarized = []
        for kid in tree.children:
            binarized.append(TreeAnnotations.binarize_tree(kid))
        return Tree(node_label, binarized)
 def merge(self, left_tree, right_tree):
     """Join two trees under the top-valued label for their combined width.

     The width is the sum of the two yields' lengths.  The label is the
     first key of ``self.span_to_categories[width]`` (in iteration order)
     whose value equals the largest value stored there.
     """
     width = len(left_tree.get_yield()) + len(right_tree.get_yield())
     counts = self.span_to_categories[width]
     top = max(counts.values())
     most_freq_label = next(name for name in counts if counts[name] == top)
     return Tree(most_freq_label, [left_tree, right_tree])
Exemplo n.º 10
0
 def make_tree(begin, end, A, depth=0):
     """Rebuild the tree for entry ``A`` over ``(begin, end)`` from ``back``.

     ``back[begin][end][A]`` holds the back pointers for ``A``: empty for a
     word-level entry (the node is ``A.parent -> A.child``), one element
     for a unary step, or ``(split, B, C)`` for a binary step.  Any other
     length yields ``None``.  ``depth`` is only passed along to the
     recursive calls.
     """
     pointers = back[begin][end][A]
     tag = A.parent
     if not pointers:
         return Tree(tag, [Tree(A.child)])

     arity = len(pointers)
     if arity == 1:
         (only,) = pointers
         return Tree(tag, [make_tree(begin, end, only, depth + 1)])
     if arity == 3:
         split, left, right = pointers
         left_subtree = make_tree(begin, split, left, depth + 1)
         right_subtree = make_tree(split, end, right, depth + 1)
         return Tree(tag, [left_subtree, right_subtree])
     return None
    def binarize_tree(cls, tree, suffix):
        """Return a binarized copy of ``tree`` with ``suffix`` on its label.

        The node is relabelled ``tree.label + suffix`` and its children
        receive the suffix ``"^<tree.label>"``.  Leaves keep their plain
        label.  Nodes with one or two children keep their arity; wider
        nodes take their children from the chain that
        ``TreeAnnotations.binarize_tree_helper`` builds under the
        intermediate label ``"@<new label>->"``.
        """
        annotated_label = tree.label + suffix
        child_suffix = "^%s" % tree.label
        if tree.is_leaf():
            return Tree(tree.label)

        kids = tree.children
        if len(kids) in (1, 2):
            return Tree(annotated_label, [
                TreeAnnotations.binarize_tree(kid, child_suffix)
                for kid in kids
            ])

        chain = TreeAnnotations.binarize_tree_helper(
            tree, 0, "@%s->" % annotated_label)
        return Tree(annotated_label, chain.children)
Exemplo n.º 12
0
 def binarize_tree_helper(cls, tree, num_children_generated,
                          intermediate_label):
     """Build the right-branching chain for the children of ``tree``.

     The node returned is labelled ``intermediate_label``.  Its first
     child is the binarized child at index ``num_children_generated``.
     If more children follow, its second child is the chain for them,
     labelled with ``intermediate_label`` extended by ``"_"`` and the
     current child's label.
     """
     index = num_children_generated
     head = tree.children[index]
     parts = [TreeBinarization.binarize_tree(head)]
     if index < len(tree.children) - 1:
         parts.append(TreeBinarization.binarize_tree_helper(
             tree, index + 1, intermediate_label + "_" + head.label))
     return Tree(intermediate_label, parts)
Exemplo n.º 13
0
 def build(self, rule):
     """Rebuild the tree rooted at ``rule`` by following rule origins.

     A rule whose ``origin`` is ``(None, None)`` is printed and turned
     into a one-level subtree by ``self.build_branch``.  Otherwise the
     rule(s) that produced its child label(s) are looked up with
     ``self.search_rule`` and built recursively: a ``UnaryRule`` uses
     ``origin`` for its single child, a ``BinaryRule`` uses ``origin[0]``
     and ``origin[1]`` for its left and right children.

     Returns:
         A ``Tree``, or ``None`` when a rule with an origin is neither a
         ``UnaryRule`` nor a ``BinaryRule``.
     """
     origin = rule.origin
     if origin == (None, None):
         # Call form of print: same output on Python 2, valid on Python 3.
         print(str(rule))
         return self.build_branch(rule)

     if isinstance(rule, UnaryRule):
         next_rule = self.search_rule(origin, rule.child)
         return Tree(rule.parent, [self.build(next_rule)])

     if isinstance(rule, BinaryRule):
         left_origin, right_origin = origin[0], origin[1]
         next_left_rule = self.search_rule(left_origin, rule.left_child)
         next_right_rule = self.search_rule(right_origin, rule.right_child)
         return Tree(rule.parent, [self.build(next_left_rule),
                                   self.build(next_right_rule)])

     return None
Exemplo n.º 14
0
    def binarize(self, tree):
        """Binarize ``tree`` in place; returns ``None``.

        A node with more than two children keeps its first child and moves
        the rest under a new node labelled
        ``"@<label>_<first child label>"``.  The (possibly rewritten)
        children are then processed recursively.  Leaves are left alone.
        """
        if tree.is_leaf():
            return

        kids = tree.children
        if len(kids) > 2:
            first = kids[0]
            remainder = Tree("@%s_%s" % (tree.label, first.label), kids[1:])
            tree.children = [first, remainder]

        for kid in tree.children:
            self.binarize(kid)
    def binarize_tree_helper(cls, tree, num_children_generated,
                             intermediate_label):
        """Build the right-branching chain for the children of ``tree``.

        The node returned is labelled ``intermediate_label``.  Its first
        child is the binarized child at index ``num_children_generated``.
        While more than two children remain from that index, its second
        child is the chain for the rest, labelled with
        ``intermediate_label`` extended by ``"_"`` and the current child's
        label.  Otherwise its second child is the binarized next child.
        Children are binarized with the suffix ``"^<tree.label>"``.
        """
        index = num_children_generated
        kids = tree.children
        head = kids[index]
        child_suffix = "^%s" % tree.label
        parts = [TreeAnnotations.binarize_tree(head, child_suffix)]
        if index < len(kids) - 2:
            tail = TreeAnnotations.binarize_tree_helper(
                tree, index + 1, intermediate_label + "_" + head.label)
        else:
            tail = TreeAnnotations.binarize_tree(kids[index + 1], child_suffix)
        parts.append(tail)
        return Tree(intermediate_label, parts)
 def build_tag_tree(self, words, tags, cur_position):
     """Return the node ``tag -> word`` for position ``cur_position``."""
     word = words[cur_position]
     tag = tags[cur_position]
     return Tree(tag, [Tree(word)])
 def add_root(self, tree):
     """Wrap ``tree`` as the only child of a new node labelled "ROOT"."""
     children = [tree]
     return Tree("ROOT", children)
Exemplo n.º 18
0
 def get_best_parse(self, sentence):
     """
     Should return a Tree.
     'sentence' is a list of strings (words) that form a sentence.

     Chart parser over spans of ``sentence``.  ``score[b][e][label]`` is
     the best probability found for ``label`` over words ``b..e-1``
     (missing entries read as 0.0) and ``back[b][e][label]`` is the Tree
     achieving it (missing entries read as None).

     Width-1 cells are seeded from ``self.lexicon.score_tagging``; wider
     cells combine two sub-spans through the binary rules of
     ``self.grammar``.  After a cell is filled, unary rules are applied
     until no score improves, so unary chains of any length are found
     (previously, wider cells only got a single unary step on top of the
     parent of each binary rule).

     The result is ``TreeAnnotations.unannotate_tree`` applied to the
     'ROOT' entry of the cell that covers the whole sentence.
     NOTE(review): when no 'ROOT' parse exists that entry is None and is
     passed on unchanged -- confirm how callers expect this to behave.
     """
     def new_chart(default_factory):
         # chart[begin][end][label]; cells are created on first access.
         return collections.defaultdict(
             lambda: collections.defaultdict(
                 lambda: collections.defaultdict(default_factory)))

     score = new_chart(float)  # float() == 0.0
     back = new_chart(lambda: None)
     grammar = self.grammar

     def close_unaries(cell_score, cell_back):
         # Apply unary rules within one cell until nothing improves.
         added = True
         while added:
             added = False
             for child in grammar.unary_rules_by_child:
                 for rule in grammar.get_unary_rules_by_child(child):
                     prob = cell_score[child] * rule.score
                     parent = rule.parent
                     if prob > cell_score[parent]:
                         cell_score[parent] = prob
                         cell_back[parent] = Tree(parent, [cell_back[child]])
                         added = True

     tags = self.lexicon.get_all_tags()
     wordN = len(sentence)

     # Width-1 spans: one tag -> word tree per tag with a positive score.
     for i, word in enumerate(sentence):
         cell_score = score[i][i + 1]
         cell_back = back[i][i + 1]
         for tag in tags:
             prob = self.lexicon.score_tagging(word, tag)
             if prob > cell_score[tag]:
                 cell_score[tag] = prob
                 cell_back[tag] = Tree(tag, [Tree(word, [])])
         close_unaries(cell_score, cell_back)

     # Wider spans, narrowest first, so both sub-spans are already filled.
     for span in range(2, wordN + 1):
         for begin in range(0, wordN + 1 - span):
             end = begin + span
             cell_score = score[begin][end]
             cell_back = back[begin][end]
             for split in range(begin + 1, end):
                 left_score = score[begin][split]
                 right_score = score[split][end]
                 left_back = back[begin][split]
                 right_back = back[split][end]
                 for left_child in grammar.binary_rules_by_left_child:
                     for rule in grammar.get_binary_rules_by_left_child(
                             left_child):
                         right_child = rule.right_child
                         parent = rule.parent
                         prob = (left_score[left_child] *
                                 right_score[right_child] *
                                 rule.score)
                         if prob > cell_score[parent]:
                             cell_score[parent] = prob
                             cell_back[parent] = Tree(
                                 parent,
                                 [left_back[left_child],
                                  right_back[right_child]])
             # Once per cell, after every split and binary rule was tried.
             close_unaries(cell_score, cell_back)

     return TreeAnnotations.unannotate_tree(back[0][wordN]['ROOT'])
 def tree(self):
     """Return this entry as the node ``label -> word``."""
     label = self.label
     word_node = Tree(self.word)
     return Tree(label, [word_node])
 def tree(self):
     """Return this entry as a node over the trees of ``left`` and ``right``."""
     label = self.label
     left_subtree = self.left.tree()
     right_subtree = self.right.tree()
     return Tree(label, [left_subtree, right_subtree])
 def tree(self):
     """Return this entry as a node whose only child is ``self.parent.tree()``."""
     label = self.label
     subtree = self.parent.tree()
     return Tree(label, [subtree])
Exemplo n.º 22
0
 def add_root(self, tree):
     """Wrap ``tree`` as the only child of a new node labelled 'ROOT'."""
     children = [tree]
     return Tree('ROOT', children)
Exemplo n.º 23
0
    def get_best_parse(self, sentence):
        """
        Should return a Tree.
        'sentence' is a list of strings (words) that form a sentence.

        Chart parser over spans of ``sentence``.  ``score[i][j][label]`` is
        the best probability found for ``label`` over words ``i..j-1``
        (missing entries read as 0.0).  ``back[i][j][label]`` is the
        matching back pointer: ``"#" + word`` for a lexical entry, a child
        label (string) for a unary step, or ``(split, left, right)`` for a
        binary step.  ``self.back_trace`` turns the pointers into a tree.

        Unary rules are applied once per cell, after all of the cell's
        lexical or binary entries are in place, and repeated until no
        score improves.

        Returns ``TreeAnnotations.unannotate_tree`` of the best "ROOT"
        parse, or the fallback ``Tree("", [Tree("ROOT")])`` when the full
        span has no "ROOT" entry.
        """
        length = len(sentence)
        size = length + 1
        score = [[collections.defaultdict(float) for _ in range(size)]
                 for _ in range(size)]
        back = [[{} for _ in range(size)] for _ in range(size)]
        grammar = self.grammar

        def apply_unaries(cell_score, cell_back):
            # Apply unary rules within one cell until nothing improves.
            added = True
            while added:
                added = False
                # Iterate over a snapshot: probing the defaultdict below
                # can insert new labels while we loop.
                for tag in list(cell_score):
                    for unary_rule in grammar.get_unary_rules_by_child(tag):
                        prob = unary_rule.score * cell_score[tag]
                        if prob != 0 and prob > cell_score[unary_rule.parent]:
                            cell_score[unary_rule.parent] = prob
                            cell_back[unary_rule.parent] = tag
                            added = True

        # process terminal symbols
        for i, word in enumerate(sentence):
            cell_score = score[i][i + 1]
            cell_back = back[i][i + 1]
            for tag in self.lexicon.get_all_tags():
                cell_score[tag] = self.lexicon.score_tagging(word, tag)
                cell_back[tag] = "#" + word
            apply_unaries(cell_score, cell_back)

        # process nonterminal symbols, narrowest spans first
        for span in range(2, length + 1):
            for begin in range(length + 1 - span):
                end = begin + span
                cell_score = score[begin][end]
                cell_back = back[begin][end]
                for split in range(begin + 1, end):
                    left = score[begin][split]
                    right = score[split][end]
                    for left_tag in left:
                        for binary_rule in grammar.get_binary_rules_by_left_child(left_tag):
                            right_tag = binary_rule.right_child
                            if right_tag not in right:
                                continue
                            prob = left[left_tag] * right[right_tag] * binary_rule.score
                            if prob > cell_score[binary_rule.parent]:
                                cell_score[binary_rule.parent] = prob
                                cell_back[binary_rule.parent] = (split, left_tag, right_tag)
                apply_unaries(cell_score, cell_back)

        if "ROOT" not in back[0][length]:
            return Tree("", [Tree("ROOT")])
        result = self.back_trace(back, "ROOT", 0, length)
        return TreeAnnotations.unannotate_tree(result)