Example #1
    def train(self, graphs):
        """
        Trains a ProbabilisticDependencyGrammar based on the list of input
        DependencyGraphs.  This model is an implementation of Eisner's (1996)
        Model C, which derives its statistics from head-word, head-tag,
        child-word, and child-tag relationships.

        :param graphs: A list of dependency graphs to train from.
        :type graphs: list(DependencyGraph)
        """
        # defaultdict, DependencyProduction and ProbabilisticDependencyGrammar are
        # imported at module level in nltk.parse.projectivedependencyparser
        # (from collections and nltk.grammar respectively).
        productions = []
        events = defaultdict(int)
        tags = {}
        for dg in graphs:
            # Node 0 is the artificial root, so iterate over the real tokens only.
            for node_index in range(1, len(dg.nodes)):
                children = dg.nodes[node_index]['deps']
                nr_left_children = dg.left_children(node_index)
                nr_right_children = dg.right_children(node_index)
                nr_children = nr_left_children + nr_right_children
                # Visit every modifier position plus one extra slot on each side;
                # the extra slots generate the STOP events that terminate the
                # left and right modifier sequences.
                for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2):
                    head_word = dg.nodes[node_index]['word']
                    head_tag = dg.nodes[node_index]['tag']
                    if head_word in tags:
                        tags[head_word].add(head_tag)
                    else:
                        tags[head_word] = set([head_tag])
                    # Defaults: a slot beyond the last real modifier is a STOP
                    # event, and the modifier nearest the head has START as its
                    # previous sibling.
                    child = 'STOP'
                    child_tag = 'STOP'
                    prev_word = 'START'
                    prev_tag = 'START'
                    if child_index < 0:
                        # Modifier positions to the left of the head.
                        array_index = child_index + nr_left_children
                        if array_index >= 0:
                            child = dg.nodes[children[array_index]]['word']
                            child_tag = dg.nodes[children[array_index]]['tag']
                        if child_index != -1:
                            prev_word = dg.nodes[children[array_index + 1]]['word']
                            prev_tag = dg.nodes[children[array_index + 1]]['tag']
                        if child != 'STOP':
                            productions.append(DependencyProduction(head_word, [child]))
                        # Record the head/modifier events; these strings are the
                        # keys looked up later when candidate parses are scored.
                        head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag)
                        mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag)
                        events[head_event] += 1
                        events[mod_event] += 1
                    elif child_index > 0:
                        # Modifier positions to the right of the head.
                        array_index = child_index + nr_left_children - 1
                        if array_index < nr_children:
                            child = dg.nodes[children[array_index]]['word']
                            child_tag = dg.nodes[children[array_index]]['tag']
                        if child_index != 1:
                            prev_word = dg.nodes[children[array_index - 1]]['word']
                            prev_tag = dg.nodes[children[array_index - 1]]['tag']
                        if child != 'STOP':
                            productions.append(DependencyProduction(head_word, [child]))
                        head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag)
                        mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag)
                        events[head_event] += 1
                        events[mod_event] += 1
        self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
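
This method is part of nltk.parse.projectivedependencyparser.ProbabilisticProjectiveDependencyParser, so it is usually invoked through that class rather than directly. Below is a minimal usage sketch that follows the pattern of the module's doctest, but uses a hypothetical three-token toy treebank in the 4-column word/tag/head/rel format that DependencyGraph accepts; a real application would train on far more graphs.

from nltk.parse.dependencygraph import DependencyGraph
from nltk.parse.projectivedependencyparser import ProbabilisticProjectiveDependencyParser

# Toy training data (hypothetical): one graph in tab-separated
# "word  tag  head  rel" form, where head 0 marks the root.
toy_treebank = [
    "the\tDT\t2\tdet\ndog\tNN\t3\tnsubj\nbarks\tVBZ\t0\tROOT\n",
]
graphs = [DependencyGraph(entry) for entry in toy_treebank]

parser = ProbabilisticProjectiveDependencyParser()
parser.train(graphs)  # builds the ProbabilisticDependencyGrammar shown above

# Parse a sentence whose head/modifier events were all seen during training.
for tree in parser.parse(["the", "dog", "barks"]):
    print(tree)
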
Example #2
    def train(self, graphs):
        """
        Trains a ProbabilisticDependencyGrammar based on the list of input
        DependencyGraphs.  This model is an implementation of Eisner's (1996)
        Model C, which derives its statistics from head-word, head-tag,
        child-word, and child-tag relationships.

        :param graphs: A list of dependency graphs to train from.
        :type graphs: list(DependencyGraph)
        """
        # defaultdict, chain, DependencyProduction and ProbabilisticDependencyGrammar
        # are imported at module level in nltk.parse.projectivedependencyparser
        # (from collections, itertools and nltk.grammar respectively).
        productions = []
        events = defaultdict(int)
        tags = {}
        for dg in graphs:
            # Node 0 is the artificial root, so iterate over the real tokens only.
            for node_index in range(1, len(dg.nodes)):
                # In current NLTK, a node's 'deps' is a dict mapping relation
                # labels to lists of dependent indices; flatten it so the
                # dependents can be addressed positionally below.
                children = list(
                    chain.from_iterable(dg.nodes[node_index]["deps"].values())
                )

                nr_left_children = dg.left_children(node_index)
                nr_right_children = dg.right_children(node_index)
                nr_children = nr_left_children + nr_right_children
                # Visit every modifier position plus one extra slot on each side;
                # the extra slots generate the STOP events that terminate the
                # left and right modifier sequences.
                for child_index in range(
                    0 - (nr_left_children + 1), nr_right_children + 2
                ):
                    head_word = dg.nodes[node_index]["word"]
                    head_tag = dg.nodes[node_index]["tag"]
                    if head_word in tags:
                        tags[head_word].add(head_tag)
                    else:
                        tags[head_word] = {head_tag}
                    child = "STOP"
                    child_tag = "STOP"
                    prev_word = "START"
                    prev_tag = "START"
                    if child_index < 0:
                        # Modifier positions to the left of the head.
                        array_index = child_index + nr_left_children
                        if array_index >= 0:
                            child = dg.nodes[children[array_index]]["word"]
                            child_tag = dg.nodes[children[array_index]]["tag"]
                        if child_index != -1:
                            prev_word = dg.nodes[children[array_index + 1]]["word"]
                            prev_tag = dg.nodes[children[array_index + 1]]["tag"]
                        if child != "STOP":
                            productions.append(DependencyProduction(head_word, [child]))
                        head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format(
                            child,
                            child_tag,
                            prev_tag,
                            head_word,
                            head_tag,
                        )
                        mod_event = "(mods ({}, {}, {}) left))".format(
                            prev_tag,
                            head_word,
                            head_tag,
                        )
                        events[head_event] += 1
                        events[mod_event] += 1
                    elif child_index > 0:
                        # Modifier positions to the right of the head.
                        array_index = child_index + nr_left_children - 1
                        if array_index < nr_children:
                            child = dg.nodes[children[array_index]]["word"]
                            child_tag = dg.nodes[children[array_index]]["tag"]
                        if child_index != 1:
                            prev_word = dg.nodes[children[array_index - 1]]["word"]
                            prev_tag = dg.nodes[children[array_index - 1]]["tag"]
                        if child != "STOP":
                            productions.append(DependencyProduction(head_word, [child]))
                        head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format(
                            child,
                            child_tag,
                            prev_tag,
                            head_word,
                            head_tag,
                        )
                        mod_event = "(mods ({}, {}, {}) right))".format(
                            prev_tag,
                            head_word,
                            head_tag,
                        )
                        events[head_event] += 1
                        events[mod_event] += 1
        self._grammar = ProbabilisticDependencyGrammar(productions, events, tags)
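
The only substantive difference from example #1 is how a node's dependents are collected: current NLTK releases store each node's 'deps' as a dict mapping relation labels to lists of dependent indices, so the indices must be flattened before they can be indexed positionally. A minimal sketch of that flattening follows; the deps dict below is made up for illustration.

from itertools import chain

# Hypothetical 'deps' entry of a DependencyGraph node:
# relation label -> list of dependent node indices.
deps = {"det": [1], "nsubj": [2], "dobj": [5]}

# Example #1 indexed 'deps' directly as a flat list (older NLTK behaviour);
# example #2 flattens the dict values first so positional indexing still works.
children = list(chain.from_iterable(deps.values()))
print(children)  # [1, 2, 5]
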