def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

    Three drawings are checked: the default rendering of the tree loaded from
    ``data/enh_deps.conllu``, the same tree with ``form,feats,misc`` attributes
    and no sentence header, and a small hand-built non-projective tree.
    ``sys.stdout`` is replaced by an in-memory buffer while the trees are
    printed, and the captured text is compared with the expected strings.
    """
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()

    # The comparisons below are exact, so every character of these literals
    # (including spaces and the trailing empty line) is significant.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")

    expected2 = ("─┮\n"
                 " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
                 " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
                 " ┡─╼ : _ _\n"
                 " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
                 " ┡─╼ i _ LId=i-1\n"
                 " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
                 "\n")

    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")

    # Remember the stream that is installed right now. It may already be a
    # replacement set up by the test runner, so restoring sys.__stdout__
    # afterwards would silently discard it (and sys.__stdout__ can be None).
    original_stdout = sys.stdout
    capture = io.StringIO()
    try:
        sys.stdout = capture
        root.print_subtree(color=False)
        self.assertEqual(capture.getvalue(), expected1)

        # Empty the buffer so the next comparison sees only the next drawing.
        capture.seek(0)
        capture.truncate()
        root.print_subtree(color=False, attributes='form,feats,misc',
                           print_sent_id=False, print_text=False)
        self.assertEqual(capture.getvalue(), expected2)

        capture.seek(0)
        capture.truncate()
        root3.print_subtree(color=False, attributes='form', print_sent_id=0, print_text=0)
        self.assertEqual(capture.getvalue(), expected3)
    finally:
        sys.stdout = original_stdout  # pylint: disable=redefined-variable-type
def load():
    """Benchmark six udapi operations and print their timing statistics.

    For 30 rounds the function times, separately: loading
    ``cs-ud-train-l.conllu``, reading ``form`` and ``lemma`` of every node,
    filtering nodes by a deprel chain, overwriting every ``deprel``, calling
    ``compute_text()`` on every tree, and storing the document to
    ``hello.conllu``. Afterwards one line per operation is printed with the
    mean and standard deviation of the measured durations, rounded to two
    decimals.
    """
    from udapi.core.document import Document

    clock = timeit.default_timer
    load_times, read_times, write_times = [], [], []
    text_times, relchain_times, save_times = [], [], []

    for _ in range(30):
        # Phase 1: parse the CoNLL-U file into a fresh document.
        began = clock()
        document = Document()
        document.load_conllu('cs-ud-train-l.conllu')
        load_times.append(clock() - began)

        # Phase 2: read two attributes of every node.
        began = clock()
        for bundle in document:
            for tree in bundle:
                for node in tree.descendants:
                    joined = node.form + node.lemma
        read_times.append(clock() - began)

        # Phase 3: select nodes by their own and their parent's deprel.
        began = clock()
        for bundle in document:
            for tree in bundle:
                matches = [candidate for candidate in tree.descendants
                           if candidate.deprel == "case"
                           and candidate.parent.deprel == "nmod"]
        relchain_times.append(clock() - began)

        # Phase 4: write one attribute of every node.
        began = clock()
        for bundle in document:
            for tree in bundle:
                for node in tree.descendants:
                    node.deprel = 'dep'
        write_times.append(clock() - began)

        # Phase 5: recompute the sentence text of every tree.
        began = clock()
        for bundle in document:
            for tree in bundle:
                tree.compute_text()
        text_times.append(clock() - began)

        # Phase 6: serialize the document back to CoNLL-U.
        began = clock()
        document.store_conllu('hello.conllu')
        save_times.append(clock() - began)

    report = (
        ('load', load_times),
        ('read', read_times),
        ('write', write_times),
        ('text', text_times),
        ('relchain', relchain_times),
        ('save', save_times),
    )
    for label, samples in report:
        mean = round(np.mean(samples), 2)
        spread = round(np.std(samples), 2)
        print("{}\t{} +/- {}".format(label, mean, spread))
def test_topology(self):
    """Test methods/properties descendants, children, prev_node, next_node, ord.

    Also covers find_minimal_common_treelet(), node comparison (precedes and
    the ordering operators) and reordering with shift_after_node(), all on
    the single tree loaded from ``data/enh_deps.conllu``.
    """
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    self.assertEqual(len(doc.bundles), 1)
    root = doc.bundles[0].get_tree()
    nodes = root.descendants
    nodes2 = root.descendants()
    # descendants() and descendants should return the same sequence of nodes
    self.assertEqual(nodes, nodes2)
    self.assertEqual(len(nodes), 6)
    self.assertEqual(nodes[1].parent, root)
    self.assertEqual(nodes[2].root, root)
    self.assertEqual(len(nodes[1].descendants), 5)
    self.assertEqual(len(nodes[1].children), 3)
    self.assertEqual(len(nodes[1].children(add_self=True)), 4)
    self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3)

    # neighbours in word order; both ends of the sequence must yield None
    self.assertEqual(nodes[0].next_node, nodes[1])
    self.assertEqual(nodes[2].prev_node, nodes[1])
    self.assertIsNone(nodes[5].next_node)
    self.assertIsNone(root.prev_node)

    # minimal common treelet of two nodes, then of three nodes
    (common_ancestor, added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [])
    input_nodes = [nodes[2], nodes[4], nodes[5]]
    (common_ancestor, added_nodes) = find_minimal_common_treelet(*input_nodes)
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])

    # ords and reorderings
    self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
    self.assertTrue(nodes[0].precedes(nodes[1]))
    # The operators are spelled out on purpose: each line exercises one of
    # the node ordering operators directly.
    self.assertTrue(nodes[0] < nodes[1])
    self.assertFalse(nodes[0] > nodes[1])
    self.assertTrue(nodes[0] <= nodes[0])
    nodes[0].shift_after_node(nodes[1])
    # `nodes` was collected before the shift, so it keeps the old order and
    # shows the swapped ords; a fresh descendants() call is sorted again.
    self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
    self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6])
def extract_senseid_children_collocates(conllu_filename):
    """Group bundles by sense-annotated word and by the children of that word.

    The CoNLL-U file ``conllu_filename`` is loaded and every node of every
    bundle's tree is visited via ``next_node``. For each node whose
    ``misc['senseid']`` is truthy, the current bundle is recorded once per
    child under ``node form -> senseid -> child deprel -> child form``.

    Returns:
        A plain ``dict`` nested as ``{form: {senseid: {deprel: OrderedDict}}}``.
        Each ``OrderedDict`` maps a child form to the list of bundles it was
        seen in, ordered from the longest list to the shortest.
    """
    document = Document()
    document.load_conllu(conllu_filename)

    # NOTE(review): Vividict is presumably an autovivifying dict (a lookup of
    # a missing key yields an empty mapping); the `== {}` test below relies on
    # that -- confirm against its definition.
    collected = Vividict()
    for bundle in document.bundles:
        setattr_words(bundle=bundle)
        current = bundle.get_tree()
        while current:
            head_form = current.form
            sense = current.misc['senseid']
            if sense:
                # For a verb like 想, list all children of the sense node.
                for child in current.children:
                    by_child_form = collected[head_form][sense][child.deprel]
                    if by_child_form[child.form] == {}:
                        # First occurrence: start the list of bundles.
                        by_child_form[child.form] = [bundle]
                    else:
                        by_child_form[child.form].append(bundle)
            current = current.next_node

    # Convert back to ordinary dictionaries, ranking the child forms of each
    # deprel by how many bundles they occur in (most frequent first).
    result = {}
    for head_form, by_sense in collected.items():
        plain_senses = {}
        for sense, by_deprel in by_sense.items():
            plain_deprels = {}
            for deprel, by_child_form in by_deprel.items():
                ranked = sorted(by_child_form.items(),
                                key=lambda entry: len(entry[1]),
                                reverse=True)
                plain_deprels[deprel] = OrderedDict(ranked)
            plain_senses[sense] = plain_deprels
        result[head_form] = plain_senses
    return result
def test_topology(self):
    """Test methods/properties descendants, children, prev_node, next_node, ord.

    Also covers find_minimal_common_treelet() and reordering with
    shift_after_node(), all on the single tree loaded from
    ``data/enh_deps.conllu``.
    """
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    self.assertEqual(len(doc.bundles), 1)
    root = doc.bundles[0].get_tree()
    nodes = root.descendants
    nodes2 = root.descendants()
    # descendants() and descendants should return the same sequence of nodes
    self.assertEqual(nodes, nodes2)
    self.assertEqual(len(nodes), 6)
    self.assertEqual(nodes[1].parent, root)
    self.assertEqual(nodes[2].root, root)
    self.assertEqual(len(nodes[1].descendants), 5)
    self.assertEqual(len(nodes[1].children), 3)
    self.assertEqual(len(nodes[1].children(add_self=True)), 4)
    self.assertEqual(len(nodes[1].children(add_self=1, following_only=1)), 3)

    # neighbours in word order; both ends of the sequence must yield None
    self.assertEqual(nodes[0].next_node, nodes[1])
    self.assertEqual(nodes[2].prev_node, nodes[1])
    self.assertIsNone(nodes[5].next_node)
    self.assertIsNone(root.prev_node)

    # minimal common treelet of two nodes, then of three nodes
    (common_ancestor, added_nodes) = find_minimal_common_treelet(nodes[0], nodes[1])
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [])
    input_nodes = [nodes[2], nodes[4], nodes[5]]
    (common_ancestor, added_nodes) = find_minimal_common_treelet(*input_nodes)
    self.assertEqual(common_ancestor, nodes[1])
    self.assertEqual(list(added_nodes), [nodes[1], nodes[3]])

    # ords and reorderings
    self.assertEqual([node.ord for node in nodes], [1, 2, 3, 4, 5, 6])
    nodes[0].shift_after_node(nodes[1])
    # `nodes` was collected before the shift, so it keeps the old order and
    # shows the swapped ords; a fresh descendants() call is sorted again.
    self.assertEqual([node.ord for node in nodes], [2, 1, 3, 4, 5, 6])
    self.assertEqual([node.ord for node in root.descendants()], [1, 2, 3, 4, 5, 6])
def load():
    """Run one pass of a typical udapi workload on ``cs-ud-train-l.conllu``.

    The document is loaded, every node's ``form`` and ``lemma`` are read,
    nodes are filtered by the deprels of their parent and grandparent, every
    ``deprel`` is overwritten with ``'dep'``, ``compute_text()`` is called on
    every tree, and the result is stored to ``hello.conllu``. Nothing is
    returned; the intermediate results are discarded.
    """
    from udapi.core.document import Document

    doc = Document()
    doc.load_conllu('cs-ud-train-l.conllu')

    def all_trees():
        # Yield every tree of every bundle, in document order.
        for bundle in doc:
            for tree in bundle:
                yield tree

    # Read access to two attributes of every node.
    for tree in all_trees():
        for node in tree.descendants:
            joined = node.form + node.lemma

    # Select nodes whose parent is a "det" attached to an "obj".
    for tree in all_trees():
        matches = []
        for node in tree.descendants:
            head = node.parent
            if head.deprel == "det" and head.parent.deprel == "obj":
                matches.append(node)

    # Write access to one attribute of every node.
    for tree in all_trees():
        for node in tree.descendants:
            node.deprel = 'dep'

    # Recompute the sentence text of every tree.
    for tree in all_trees():
        tree.compute_text()

    doc.store_conllu('hello.conllu')
from udapi.core.document import Document

# Load the sense-annotated corpus, then print each bundle's id together with
# the forms of all nodes in its tree.
D = Document()
D.load_conllu(
    'SemEval-2007/Chinese_train_pos.xml.utf8.sentences.conllu.senseid'
)

for bundle in D.bundles:
    # The forms are stored on the bundle itself, as a new `words` attribute.
    bundle.words = []

    # Start at the node returned by get_tree() and follow next_node links
    # until a falsy value ends the walk.
    node = bundle.get_tree()
    while node:
        bundle.words.append(node.form)
        node = node.next_node

    print(bundle.bundle_id, bundle.words)
def test_print_subtree(self):
    """Test print_subtree() method, which uses udapi.block.write.textmodetrees.

    Each case prints one tree with ``color=False`` while ``sys.stdout`` is
    replaced by a fresh in-memory buffer, then compares the captured text
    with the expected drawing. The cases are: the default rendering of the
    tree from ``data/enh_deps.conllu``, the same tree with ``form,feats,misc``
    attributes and no sentence header, and a hand-built non-projective tree.
    """
    doc = Document()
    data_filename = os.path.join(os.path.dirname(__file__), 'data', 'enh_deps.conllu')
    doc.load_conllu(data_filename)
    root = doc.bundles[0].get_tree()

    # The comparisons below are exact, so every character of these literals
    # (including spaces and the trailing empty line) is significant.
    expected1 = ("# sent_id = a-mf920901-001-p1s1A\n"
                 "# text = Slovenská ústava: pro i proti\n"
                 "─┮\n"
                 " │ ╭─╼ Slovenská ADJ amod\n"
                 " ╰─┾ ústava NOUN root\n"
                 " ┡─╼ : PUNCT punct\n"
                 " ╰─┮ pro ADP appos\n"
                 " ┡─╼ i CONJ cc\n"
                 " ╰─╼ proti ADP conj\n"
                 "\n")

    expected2 = (
        "─┮\n"
        " │ ╭─╼ Slovenská Case=Nom|Degree=Pos|Gender=Fem|Negative=Pos|Number=Sing _\n"
        " ╰─┾ ústava Case=Nom|Gender=Fem|Negative=Pos|Number=Sing SpaceAfter=No\n"
        " ┡─╼ : _ _\n"
        " ╰─┮ pro AdpType=Prep|Case=Acc LId=pro-1\n"
        " ┡─╼ i _ LId=i-1\n"
        " ╰─╼ proti AdpType=Prep|Case=Dat LId=proti-1\n"
        "\n")

    # test non-projective tree
    root3 = Root()
    for i in range(1, 5):
        root3.create_child(form=str(i))
    nodes = root3.descendants(add_self=1)
    nodes[1].parent = nodes[3]
    nodes[4].parent = nodes[2]
    expected3 = ("─┮\n"
                 " │ ╭─╼ 1\n"
                 " ┡─╪───┮ 2\n"
                 " ╰─┶ 3 │\n"
                 " ╰─╼ 4\n"
                 "\n")

    # (tree to print, extra print_subtree() options, expected output)
    cases = (
        (root, {}, expected1),
        (root,
         {'attributes': 'form,feats,misc', 'print_sent_id': False, 'print_text': False},
         expected2),
        (root3,
         {'attributes': 'form', 'print_sent_id': 0, 'print_text': 0},
         expected3),
    )

    # Restore the stream that was active before the test rather than
    # sys.__stdout__: a test runner may have installed its own replacement,
    # and sys.__stdout__ can be None.
    saved_stdout = sys.stdout
    try:
        for tree, options, expected in cases:
            # A fresh buffer per case keeps the drawings from accumulating.
            sys.stdout = capture = io.StringIO()
            tree.print_subtree(color=False, **options)
            self.assertEqual(capture.getvalue(), expected)
    finally:
        sys.stdout = saved_stdout  # pylint: disable=redefined-variable-type