Exemplo n.º 1
0
    def fix_remnants_in_tree(self, root):
        """Convert remnant-style ellipsis in one tree to UDv2 ellipsis with orphans.

        The parent of a remnant is always its correlate (the node with the same role).
        The correlate's parent is usually the head of the whole ellipsis subtree,
        i.e. the first conjunct. Remnants may be attached deeper, though, e.g.
        'Over 300 Iraqis are reported dead and 500 wounded.' has the edges::

          nsubjpass(reported, Iraqis)
          nummod(Iraqis, 300)
          remnant(300, 500)

        All remnants found in one tree are assumed to belong to a single
        ellipsis structure.

        TODO: in theory, one tree may contain several ellipsis structures with remnants,
        but it is unclear how to tell them apart from the deeper-remnants cases.
        """
        remnant_nodes = []
        for node in root.descendants:
            if node.deprel == 'remnant':
                remnant_nodes.append(node)
        if not remnant_nodes:
            return

        # The common governor of all remnants and of the first remnant's
        # grandparent is taken as the first conjunct.
        first = remnant_nodes[0]
        first_conjunct, _ = find_minimal_common_treelet(first.parent.parent, *remnant_nodes)
        if first_conjunct == root:
            self.log(first, 'remnant', "remnants' (+their grandpas') common governor is root")
            return

        def promotion_rank(node):
            # Rank by the correlate's deprel; deprels missing from HEAD_PROMOTION rank as 0.
            return self.HEAD_PROMOTION.get(node.parent.deprel, 0)

        # Only remnants whose parent is not itself a remnant are handled here;
        # the remaining ("chained") remnants are left to the recursive helper.
        top_level = sorted(
            (node for node in remnant_nodes if node.parent.deprel != 'remnant'),
            key=promotion_rank)
        correlate_deprels = [node.parent.deprel for node in top_level]
        self._recursive_fix_remnants(top_level, correlate_deprels, first_conjunct)
Exemplo n.º 2
0
    def fix_remnants_in_tree(self, root):
        """Rewrite ellipsis annotated with remnant deprels as UDv2 ellipsis with orphans.

        A remnant always hangs on its correlate, i.e. the node with the same role.
        In the usual case the correlate's parent heads the whole ellipsis subtree
        (it is the first conjunct). Sometimes, however, remnants sit deeper, as in
        'Over 300 Iraqis are reported dead and 500 wounded.' with edges::

          nsubjpass(reported, Iraqis)
          nummod(Iraqis, 300)
          remnant(300, 500)

        The method expects all remnants of one tree to be part of the same
        ellipsis structure.

        TODO: theoretically, there may be more ellipsis structures with remnants in one tree,
        but there is no known way to distinguish them from the deeper-remnants cases.
        """
        found = list(filter(lambda node: node.deprel == 'remnant', root.descendants))
        if not found:
            return

        treelet = find_minimal_common_treelet(found[0].parent.parent, *found)
        (treelet_head, _) = treelet
        if treelet_head == root:
            message = "remnants' (+their grandpas') common governor is root"
            self.log(found[0], 'remnant', message)
            return

        # Keep just the remnants attached to a non-remnant parent; remnants
        # chained below another remnant are resolved by the recursive helper.
        chain_tops = []
        for node in found:
            if node.parent.deprel != 'remnant':
                chain_tops.append(node)
        # Order by the HEAD_PROMOTION value of the parent's deprel (0 when unlisted).
        chain_tops.sort(key=lambda node: self.HEAD_PROMOTION.get(node.parent.deprel, 0))
        self._recursive_fix_remnants(
            chain_tops, [node.parent.deprel for node in chain_tops], treelet_head)
Exemplo n.º 3
0
    def test_topology(self):
        """Exercise descendants, children, prev_node, next_node, ord and node ordering."""
        document = Document()
        here = os.path.dirname(__file__)
        conllu_path = os.path.join(here, 'data', 'enh_deps.conllu')
        document.load_conllu(conllu_path)
        self.assertEqual(len(document.bundles), 1)
        tree_root = document.bundles[0].get_tree()

        # `descendants` is used both as an attribute and as a call;
        # the two forms should return the same sequence of nodes.
        listed = tree_root.descendants
        called = tree_root.descendants()
        self.assertEqual(listed, called)
        self.assertEqual(len(listed), 6)

        first, second, third = listed[0], listed[1], listed[2]
        self.assertEqual(second.parent, tree_root)
        self.assertEqual(third.root, tree_root)
        self.assertEqual(len(second.descendants), 5)
        self.assertEqual(len(second.children), 3)
        self.assertEqual(len(second.children(add_self=True)), 4)
        self.assertEqual(len(second.children(add_self=1, following_only=1)), 3)

        # Neighbours in the node sequence; both ends are expected to give None.
        self.assertEqual(first.next_node, second)
        self.assertEqual(third.prev_node, second)
        self.assertEqual(listed[5].next_node, None)
        self.assertEqual(tree_root.prev_node, None)

        # Minimal common treelet: the common ancestor plus the nodes that had to be added.
        ancestor, extras = find_minimal_common_treelet(first, second)
        self.assertEqual(ancestor, second)
        self.assertEqual(list(extras), [])
        ancestor, extras = find_minimal_common_treelet(third, listed[4], listed[5])
        self.assertEqual(ancestor, second)
        self.assertEqual(list(extras), [second, listed[3]])

        # ords and reorderings
        expected_ords = [1, 2, 3, 4, 5, 6]
        self.assertEqual([n.ord for n in listed], expected_ords)
        self.assertTrue(first.precedes(second))
        self.assertTrue(first < second)
        self.assertFalse(first > second)
        self.assertTrue(first <= first)
        first.shift_after_node(second)
        # The list fetched earlier keeps its old order, so its ords are now swapped,
        # while a fresh descendants() call is expected to be ordered by ord again.
        self.assertEqual([n.ord for n in listed], [2, 1, 3, 4, 5, 6])
        self.assertEqual([n.ord for n in tree_root.descendants()], expected_ords)
Exemplo n.º 4
0
    def find_head(self, mention):
        """Choose the head word of a mention and report how it was determined.

        Candidates are examined in increasingly permissive steps: basic
        dependencies first, then enhanced dependencies, then the minimal common
        treelet of the remaining candidates (an empty node being represented by
        the first non-empty parent listed in its ``deps``).

        Args:
            mention: object providing ``words`` (the nodes of the mention) and
                ``head`` (its current head node).

        Returns:
            A ``(head, kind)`` tuple where ``kind`` is one of ``'treelet'``,
            ``'cycle'``, ``'gappy'`` or ``'nontreelet'``.
        """
        mwords = set(mention.words)

        # First, check the simplest case: no empty words and a treelet in basic dependencies.
        basic_heads = [w for w in mention.words if not w.parent or w.parent not in mwords]
        assert basic_heads
        if len(basic_heads) == 1:
            return basic_heads[0], 'treelet'

        # Second, check also enhanced dependencies (but only within basic_heads for simplicity).
        enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))]
        if not enh_heads:
            # Relax the condition: accept words with at least one enhanced parent outside the mention.
            enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))]
            if not enh_heads:
                return mention.head, 'cycle'
        if len(enh_heads) == 1:
            return enh_heads[0], 'treelet'

        # Third, find non-empty parents (ancestors in future) of empty nodes.
        empty_nodes, non_empty = [], []
        for w in enh_heads:
            if w.is_empty():
                empty_nodes.append(w)
            else:
                non_empty.append(w)
        if empty_nodes:
            for empty_node in empty_nodes:
                parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()]
                if parents:
                    if parents[0] not in non_empty:
                        non_empty.append(parents[0])
                else:
                    # TODO we should climb up, but preventing cycles
                    # We could also introduce empty_node.nonempty_ancestor
                    if 'warn' in self.bugs:
                        logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
                    if 'mark' in self.bugs:
                        # Mark the offending empty node itself (this used to reference
                        # an undefined name and raised NameError).
                        empty_node.misc['Bug'] = 'no-parent-of-empty'
            # Parents may have been appended out of order, so restore the node ordering.
            non_empty.sort()

        # Fourth, check if there is a node within the enh_heads governing all the mention nodes
        # and forming thus a "gappy treelet", where the head is clearly the "highest" node.
        (highest, _added) = find_minimal_common_treelet(*non_empty)
        if highest in enh_heads:
            return highest, 'gappy'
        if highest in mwords:
            if 'warn' in self.bugs:
                logging.warning(f"Strange mention {mention.head} with highest node {highest}")
            if 'mark' in self.bugs:
                highest.misc['Bug'] = 'highest-in-mwords'
                mention.head.misc['Bug'] = 'highest-head'

        # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
        if mention.head in enh_heads:
            return mention.head, 'nontreelet'

        # Finally, return the word-order-wise first head candidate as the head.
        return enh_heads[0], 'nontreelet'
Exemplo n.º 5
0
    def test_topology(self):
        """Verify the tree topology accessors: descendants, children, prev_node, next_node, ord."""

        def ords(sequence):
            # Collect the `ord` attribute of every node in the given sequence.
            return [item.ord for item in sequence]

        doc = Document()
        doc.load_conllu(os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu'))
        self.assertEqual(len(doc.bundles), 1)
        root = doc.bundles[0].get_tree()

        # Attribute access and a call of `descendants` should yield the same node sequence.
        as_attribute = root.descendants
        as_call = root.descendants()
        self.assertEqual(as_attribute, as_call)
        self.assertEqual(len(as_attribute), 6)

        hub = as_attribute[1]
        self.assertEqual(hub.parent, root)
        self.assertEqual(as_attribute[2].root, root)
        self.assertEqual(len(hub.descendants), 5)
        self.assertEqual(len(hub.children), 3)
        self.assertEqual(len(hub.children(add_self=True)), 4)
        self.assertEqual(len(hub.children(add_self=1, following_only=1)), 3)

        # Sequence neighbours; None is expected past either end.
        self.assertEqual(as_attribute[0].next_node, hub)
        self.assertEqual(as_attribute[2].prev_node, hub)
        self.assertEqual(as_attribute[5].next_node, None)
        self.assertEqual(root.prev_node, None)

        # Minimal common treelet of two nodes, then of three nodes.
        top, filler = find_minimal_common_treelet(as_attribute[0], hub)
        self.assertEqual(top, hub)
        self.assertEqual(list(filler), [])
        picked = [as_attribute[i] for i in (2, 4, 5)]
        top, filler = find_minimal_common_treelet(*picked)
        self.assertEqual(top, hub)
        self.assertEqual(list(filler), [hub, as_attribute[3]])

        # ords and reorderings
        self.assertEqual(ords(as_attribute), [1, 2, 3, 4, 5, 6])
        as_attribute[0].shift_after_node(hub)
        # The list fetched before the shift keeps its order (ords swapped),
        # whereas a new descendants() call is expected to be ordered by ord.
        self.assertEqual(ords(as_attribute), [2, 1, 3, 4, 5, 6])
        self.assertEqual(ords(root.descendants()), [1, 2, 3, 4, 5, 6])