def fix_remnants_in_tree(self, root):
    """Rewrite 'remnant'-style ellipsis below `root` as UDv2 ellipsis with orphans.

    A remnant always hangs on its correlate, i.e. the node with the same role.
    The correlate's parent is usually the head of the whole ellipsis subtree
    (the first conjunct), but remnants can also sit deeper, e.g. in
    'Over 300 Iraqis are reported dead and 500 wounded.' with edges:
        nsubjpass(reported, Iraqis)
        nummod(Iraqis, 300)
        remnant(300, 500)
    All remnants found in one tree are treated as one ellipsis structure.
    TODO: in theory one tree may hold several ellipsis structures with
    remnants, but there is no known way to tell them apart from
    the deeper-remnants cases.
    """
    remnant_nodes = []
    for node in root.descendants:
        if node.deprel == 'remnant':
            remnant_nodes.append(node)
    if not remnant_nodes:
        return

    # The grandparent of the first remnant is included in the treelet search.
    leader = remnant_nodes[0]
    first_conjunct, _ = find_minimal_common_treelet(leader.parent.parent, *remnant_nodes)
    if first_conjunct == root:
        self.log(leader, 'remnant', "remnants' (+their grandpas') common governor is root")
        return

    def promotion_rank(remnant):
        # Correlate deprels missing from HEAD_PROMOTION rank as 0.
        return self.HEAD_PROMOTION.get(remnant.parent.deprel, 0)

    # Only remnants whose parent is not itself a remnant are handled here;
    # the remaining ("chained") remnants are left to the recursive helper.
    top_level = sorted(
        (node for node in remnant_nodes if node.parent.deprel != 'remnant'),
        key=promotion_rank)
    correlate_deprels = [node.parent.deprel for node in top_level]
    self._recursive_fix_remnants(top_level, correlate_deprels, first_conjunct)
def fix_remnants_in_tree(self, root):
    """Convert ellipsis marked by remnant deprels into UDv2 ellipsis with orphans.

    The parent of a remnant is always its correlate (the same-role node).
    In the usual case the correlate's parent heads the whole ellipsis
    subtree, i.e. it is the first conjunct. Remnants may be nested deeper
    though, e.g. 'Over 300 Iraqis are reported dead and 500 wounded.'
    with edges::

        nsubjpass(reported, Iraqis)
        nummod(Iraqis, 300)
        remnant(300, 500)

    Every remnant in the tree is assumed to belong to a single ellipsis
    structure.
    TODO: theoretically, one tree could contain more ellipsis structures
    with remnants, but it is unclear how to distinguish them from
    the deeper-remnants cases.
    """
    found = [candidate for candidate in root.descendants
             if candidate.deprel == 'remnant']
    if not found:
        return

    head_remnant = found[0]
    # The first remnant's grandparent takes part in the common-treelet search.
    search_nodes = (head_remnant.parent.parent, *found)
    first_conjunct, _unused = find_minimal_common_treelet(*search_nodes)
    if first_conjunct == root:
        self.log(head_remnant, 'remnant',
                 "remnants' (+their grandpas') common governor is root")
        return

    # Keep just the remnants attached to a non-remnant parent; the other
    # (so-called "chained") remnants are resolved recursively.
    top_remnants = []
    for remnant in found:
        if remnant.parent.deprel != 'remnant':
            top_remnants.append(remnant)

    def by_head_promotion(remnant):
        return self.HEAD_PROMOTION.get(remnant.parent.deprel, 0)

    top_remnants.sort(key=by_head_promotion)
    deprels = [remnant.parent.deprel for remnant in top_remnants]
    self._recursive_fix_remnants(top_remnants, deprels, first_conjunct)
def test_topology(self):
    """Check descendants, children, prev_node, next_node, ord and node ordering."""
    here = os.path.dirname(__file__)
    conllu_path = os.path.join(here, 'data', 'enh_deps.conllu')
    document = Document()
    document.load_conllu(conllu_path)
    self.assertEqual(len(document.bundles), 1)

    tree_root = document.bundles[0].get_tree()
    all_nodes = tree_root.descendants
    called_nodes = tree_root.descendants()
    # Attribute access and the call form must give the same node sequence.
    self.assertEqual(all_nodes, called_nodes)
    self.assertEqual(len(all_nodes), 6)
    self.assertEqual(all_nodes[1].parent, tree_root)
    self.assertEqual(all_nodes[2].root, tree_root)
    self.assertEqual(len(all_nodes[1].descendants), 5)
    self.assertEqual(len(all_nodes[1].children), 3)
    self.assertEqual(len(all_nodes[1].children(add_self=True)), 4)
    self.assertEqual(len(all_nodes[1].children(add_self=1, following_only=1)), 3)

    # Neighbours in word order.
    self.assertEqual(all_nodes[0].next_node, all_nodes[1])
    self.assertEqual(all_nodes[2].prev_node, all_nodes[1])
    self.assertEqual(all_nodes[5].next_node, None)
    self.assertEqual(tree_root.prev_node, None)

    # Minimal common treelet of two nodes.
    common_ancestor, added_nodes = find_minimal_common_treelet(all_nodes[0], all_nodes[1])
    self.assertEqual(common_ancestor, all_nodes[1])
    self.assertEqual(list(added_nodes), [])
    # Minimal common treelet of three nodes.
    treelet_input = [all_nodes[2], all_nodes[4], all_nodes[5]]
    common_ancestor, added_nodes = find_minimal_common_treelet(*treelet_input)
    self.assertEqual(common_ancestor, all_nodes[1])
    self.assertEqual(list(added_nodes), [all_nodes[1], all_nodes[3]])

    # Ords, comparisons and reordering.
    self.assertEqual([n.ord for n in all_nodes], [1, 2, 3, 4, 5, 6])
    self.assertTrue(all_nodes[0].precedes(all_nodes[1]))
    self.assertTrue(all_nodes[0] < all_nodes[1])
    self.assertFalse(all_nodes[0] > all_nodes[1])
    self.assertTrue(all_nodes[0] <= all_nodes[0])
    all_nodes[0].shift_after_node(all_nodes[1])
    # The previously fetched sequence keeps its order but shows the new ords;
    # a fresh descendants() call is expected to be in ord order again.
    self.assertEqual([n.ord for n in all_nodes], [2, 1, 3, 4, 5, 6])
    self.assertEqual([n.ord for n in tree_root.descendants()], [1, 2, 3, 4, 5, 6])
def find_head(self, mention):
    """Pick the head word of `mention` and name the strategy that selected it.

    The strategies are tried in this order:

    1. exactly one word has no basic parent inside the mention ('treelet');
    2. the same check using enhanced parents from `self._eparents`
       ('treelet', or 'cycle' when no candidate remains);
    3. empty candidates are replaced by their first non-empty enhanced
       parent, and if the top of the minimal common treelet is one of
       the candidates, it is the head ('gappy');
    4. the current `mention.head` if it is a candidate, otherwise the first
       candidate ('nontreelet').

    Args:
        mention: object providing `words` (the mention's nodes) and `head`.

    Returns:
        A `(head_node, label)` tuple, where `label` is one of 'treelet',
        'cycle', 'gappy' or 'nontreelet'.
    """
    mwords = set(mention.words)

    # First, check the simplest case: no empty words and a treelet in basic dependencies.
    basic_heads = [w for w in mention.words if not w.parent or w.parent not in mwords]
    assert basic_heads
    if len(basic_heads) == 1:
        return basic_heads[0], 'treelet'

    # Second, check also enhanced dependencies (but only within basic_heads for simplicity).
    enh_heads = [w for w in basic_heads if not any(p in mwords for p in self._eparents(w))]
    if not enh_heads:
        # Relax the test: keep words with at least one enhanced parent outside the mention.
        enh_heads = [w for w in basic_heads if not all(p in mwords for p in self._eparents(w))]
    if not enh_heads:
        return mention.head, 'cycle'
    if len(enh_heads) == 1:
        return enh_heads[0], 'treelet'

    # Third, find non-empty parents (ancestors in future) of empty nodes.
    empty_nodes, non_empty = [], []
    for w in enh_heads:
        (empty_nodes if w.is_empty() else non_empty).append(w)
    if empty_nodes:
        for empty_node in empty_nodes:
            parents = [d['parent'] for d in empty_node.deps if not d['parent'].is_empty()]
            if parents:
                if parents[0] not in non_empty:
                    non_empty.append(parents[0])
            else:
                # TODO we should climb up, but preventing cycles
                # We could also introduce empty_node.nonempty_ancestor
                if 'warn' in self.bugs:
                    logging.warning(f"could not find non-empty parent of {empty_node} for mention {mention.head}")
                if 'mark' in self.bugs:
                    # Mark the empty node itself; this line used to reference
                    # an undefined name `node` and raised NameError.
                    empty_node.misc['Bug'] = 'no-parent-of-empty'
        non_empty.sort()

    # Fourth, check if there is a node within the enh_heads governing all the mention nodes
    # and forming thus a "gappy treelet", where the head is clearly the "highest" node.
    # NOTE(review): non_empty is empty here when every candidate is an empty node without
    # a non-empty parent; confirm find_minimal_common_treelet accepts zero arguments.
    highest, _ = find_minimal_common_treelet(*non_empty)
    if highest in enh_heads:
        return highest, 'gappy'
    if highest in mwords:
        if 'warn' in self.bugs:
            logging.warning(f"Strange mention {mention.head} with highest node {highest}")
        if 'mark' in self.bugs:
            highest.misc['Bug'] = 'highest-in-mwords'
            mention.head.misc['Bug'] = 'highest-head'

    # Fifth, try to conservatively preserve the original head, if it is one of the possible heads.
    if mention.head in enh_heads:
        return mention.head, 'nontreelet'

    # Finally, return the word-order-wise first head candidate as the head.
    return enh_heads[0], 'nontreelet'
def test_topology(self):
    """Exercise descendants, children, prev_node, next_node and ord on a small tree."""
    fixture = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    loaded = Document()
    loaded.load_conllu(fixture)
    self.assertEqual(len(loaded.bundles), 1)

    top = loaded.bundles[0].get_tree()
    seq = top.descendants
    seq_called = top.descendants()
    # `descendants` and `descendants()` must yield the same sequence of nodes.
    self.assertEqual(seq, seq_called)
    self.assertEqual(len(seq), 6)
    self.assertEqual(seq[1].parent, top)
    self.assertEqual(seq[2].root, top)
    self.assertEqual(len(seq[1].descendants), 5)
    self.assertEqual(len(seq[1].children), 3)
    self.assertEqual(len(seq[1].children(add_self=True)), 4)
    self.assertEqual(len(seq[1].children(add_self=1, following_only=1)), 3)

    # Linear neighbours.
    self.assertEqual(seq[0].next_node, seq[1])
    self.assertEqual(seq[2].prev_node, seq[1])
    self.assertEqual(seq[5].next_node, None)
    self.assertEqual(top.prev_node, None)

    # Minimal common treelet, first for two nodes and then for three.
    common_ancestor, added_nodes = find_minimal_common_treelet(seq[0], seq[1])
    self.assertEqual(common_ancestor, seq[1])
    self.assertEqual(list(added_nodes), [])
    picked = [seq[2], seq[4], seq[5]]
    common_ancestor, added_nodes = find_minimal_common_treelet(*picked)
    self.assertEqual(common_ancestor, seq[1])
    self.assertEqual(list(added_nodes), [seq[1], seq[3]])

    # Ords before and after moving the first node behind the second one.
    self.assertEqual([item.ord for item in seq], [1, 2, 3, 4, 5, 6])
    seq[0].shift_after_node(seq[1])
    self.assertEqual([item.ord for item in seq], [2, 1, 3, 4, 5, 6])
    self.assertEqual([item.ord for item in top.descendants()], [1, 2, 3, 4, 5, 6])