def fall_back_left_branching_token(clean_tokens):
    """
    Build a chain-shaped fallback tree over already constructed tokens.

    Each token receives the edge label '_' and is added under its position
    in the sequence. The first token becomes the root and every later token
    is attached as the child of its direct predecessor.

    :param clean_tokens: tokens of the sentence, in sentence order
    :return: the chain-shaped tree
    :rtype: HybridTree
    """
    tree = HybridTree()
    previous = None
    for position, token in enumerate(clean_tokens):
        token.set_edge_label('_')
        tree.add_node(position, token, True)
        if previous is None:
            # Only the very first token has no predecessor.
            tree.add_to_root(position)
        else:
            tree.add_child(previous, position)
        previous = position
    return tree
def fall_back_left_branching(forms, poss):
    """
    Build a chain-shaped fallback tree from word forms and POS tags.

    A token is constructed for every (form, pos) pair and labelled with the
    edge label '_'. The first token becomes the root and every later token
    is attached as the child of its direct predecessor.

    :param forms: word forms of the sentence
    :param poss: POS tags of the sentence, aligned with forms
    :return: the chain-shaped tree
    :rtype: HybridTree
    """
    tree = HybridTree()
    previous = None
    for position, (form, pos) in enumerate(zip(forms, poss)):
        token = construct_conll_token(form, pos)
        token.set_edge_label('_')
        tree.add_node(position, token, True)
        if previous is None:
            # Only the very first token has no predecessor.
            tree.add_to_root(position)
        else:
            tree.add_child(previous, position)
        previous = position
    return tree
def multi_dep_tree():
    """
    Construct a small example dependency tree with two roots.

    The tree is labelled 'multi' and has five nodes with forms A-E. Nodes
    '2' and '1' are roots; '21' and '211' are children of '2' and '11' is
    the child of '1'. The tree is reordered before it is returned.

    :rtype: HybridTree
    """
    tree = HybridTree('multi')

    # (node id, form letter) in insertion order; POS and deprel are derived
    # from the letter as 'p<letter>' and 'd<letter>'.
    node_specs = (('1', 'A'), ('211', 'B'), ('11', 'C'), ('2', 'D'), ('21', 'E'))
    for node_id, letter in node_specs:
        pos_tag = 'p' + letter
        token = CoNLLToken(letter, '_', pos_tag, pos_tag, '_', 'd' + letter)
        tree.add_node(node_id, token, True)

    tree.add_to_root('2')
    tree.add_to_root('1')

    tree.add_child('2', '21')
    tree.add_child('2', '211')
    tree.add_child('1', '11')

    tree.reorder()
    return tree
def disconnect_punctuation(trees):
    """
    Lazily disconnect punctuation from each hybrid tree in a corpus.

    For every input tree a new tree with the same sentence label is built.
    Punctuation tokens are kept as nodes but added with the last add_node
    flag set to False. Every other token is re-attached to its closest
    ancestor that is not punctuation; the upward search stops at the roots
    of the input tree. If the ancestor found that way is itself punctuation,
    the token becomes a root of the new tree.

    Trees that consist only of punctuation are skipped. If a result fails
    the sanity checks, diagnostics are printed and an Exception is raised.

    :param trees: corpus of hybrid trees
    :type trees: __generator[HybridTree]
    :return: corpus of hybrid trees
    :rtype: __generator[GeneralHybridTree]
    :raise Exception: a resulting tree is unrooted or its node counts and
                      yields are inconsistent
    """
    for tree in trees:
        stripped = HybridTree(tree.sent_label())

        # Roots of the input stay roots unless they are punctuation.
        for root_id in tree.root:
            if not is_punctuation(tree.node_token(root_id).form()):
                stripped.add_to_root(root_id)

        for node_id in tree.full_yield():
            token = tree.node_token(node_id)

            if is_punctuation(token.form()):
                stripped.add_node(node_id, token, True, False)
                continue

            # Climb over punctuation ancestors, but never beyond a root.
            ancestor = tree.parent(node_id)
            while ancestor and ancestor not in tree.root and is_punctuation(
                    tree.node_token(ancestor).form()):
                ancestor = tree.parent(ancestor)

            if ancestor and is_punctuation(tree.node_token(ancestor).form()):
                stripped.add_to_root(node_id)
            else:
                # NOTE(review): a falsy ancestor is passed to add_child
                # unchanged here — confirm that add_child tolerates it.
                stripped.add_child(ancestor, node_id)
            stripped.add_node(node_id, token, True, True)

        if stripped:
            # basic sanity checks
            if (not stripped.root
                    and len(stripped.id_yield()) == 0
                    and len(stripped.nodes()) == len(stripped.full_yield())):
                # Tree consists only of punctuation
                continue
            elif (not stripped.root
                    or stripped.n_nodes() != len(stripped.id_yield())
                    or len(stripped.nodes()) != len(stripped.full_yield())):
                print(tree)
                print(stripped)
                print(stripped.sent_label())
                print("Root:", stripped.root)
                print("Nodes: ", stripped.n_nodes())
                print("Id_yield:", len(stripped.id_yield()), stripped.id_yield())
                print("Nodes: ", len(stripped.nodes()))
                print("full yield: ", len(stripped.full_yield()))
                raise Exception()
            yield stripped
def hybrid_tree_2():
    """
    Construct the example dependency tree for "Piet Marie helpen leren lezen".

    'v' (helpen) is the root with children 'v2' (leren) and 'v1' (Piet);
    'v21' (lezen) hangs below 'v2' and 'v211' (Marie) below 'v21'. The tree
    is reordered before it is returned.

    :rtype: HybridTree
    """
    tree = HybridTree()

    # (node id, form, POS, dependency relation) in insertion order.
    node_specs = (
        ('v1', 'Piet', 'NP', 'SBJ'),
        ('v211', 'Marie', 'N', 'OBJ'),
        ('v', 'helpen', 'V', 'ROOT'),
        ('v2', 'leren', 'V', 'VBI'),
        ('v21', 'lezen', 'V', 'VFIN'),
    )
    for node_id, form, pos, deprel in node_specs:
        tree.add_node(node_id, CoNLLToken(form, '_', pos, pos, '_', deprel), True)

    edges = (('v', 'v2'), ('v', 'v1'), ('v2', 'v21'), ('v21', 'v211'))
    for parent, child in edges:
        tree.add_child(parent, child)

    tree.add_to_root('v')
    tree.reorder()
    return tree
def test_recursive_partitioning_transformation(self):
    """
    Check yield and recursive partitioning of a flat four-node tree.

    Node 'a' is the root and 'b', 'c', 'd' are its children. The tree, its
    recursive partitioning and the result of the 'fanout-1' partitioning
    are printed for manual inspection.
    """
    ids = ['a', 'b', 'c', 'd']
    root_id = 'a'

    tree = HybridTree("mytree")
    for node_id in ids:
        token = CoNLLToken(node_id, '_', '_', '_', '_', '_')
        tree.add_node(node_id, token, True, True)
        if node_id != root_id:
            tree.add_child(root_id, node_id)
    tree.add_to_root(root_id)
    print(tree)

    forms = [token.form() for token in tree.token_yield()]
    self.assertEqual(forms, ids)

    expected = ({0, 1, 2, 3}, [({0}, []), ({1}, []), ({2}, []), ({3}, [])])
    self.assertEqual(tree.recursive_partitioning(), expected)
    print(tree.recursive_partitioning())

    # The factory is expected to return exactly one partitioning here.
    (fanout_1,) = the_recursive_partitioning_factory().get_partitioning('fanout-1')
    print(fanout_1(tree))
def multi_const_tree():
    """
    Construct a small example constituent tree with two roots.

    The tree is labelled "multi". Category nodes '2' (F) and '1' (E) are
    roots; each root p has the terminal children 'p.1' and 'p.2'.

    :rtype: HybridTree
    """
    tree = HybridTree("multi")

    # (node id, form, POS) of the terminals in insertion order.
    terminals = (('1.1', 'A', 'pA'), ('2.1', 'B', 'pB'),
                 ('1.2', 'C', 'pC'), ('2.2', 'D', 'pD'))
    for node_id, form, pos in terminals:
        tree.add_node(node_id, ConstituentTerminal(form, pos), True, True)

    tree.add_node('1', ConstituentCategory('E'), False, True)
    tree.add_node('2', ConstituentCategory('F'), False, True)

    for parent in ('2', '1'):
        tree.add_to_root(parent)
        for suffix in ('1', '2'):
            tree.add_child(parent, parent + '.' + suffix)
    return tree
def derivation_to_hybrid_tree(der, poss, ordered_labels, construct_token, disconnected=None):
    """
    Turn a derivation tree into a hybrid tree.

    Assumes poss and ordered_labels to have equal length. Disconnected
    positions get the node id "d<position>" (0-based position). All other
    positions get "c<k>", where k counts the connected positions starting
    from 1. Every rule of the derivation becomes an inner node labelled
    with the nonterminal on the rule's left-hand side.

    :param der: the derivation
    :type der: LCFRSDerivation
    :param poss: list of POS-tags
    :type poss: list[str]
    :param ordered_labels: list of words
    :type ordered_labels: list[str]
    :param construct_token: factory called as construct_token(label, pos, True)
                            for words and as construct_token(nont, '_', False)
                            for rule nodes
    :param disconnected: list of positions in ordered_labels that are disconnected
    :type disconnected: list[object]
    :rtype: GeneralHybridTree
    """
    disconnected = disconnected or []

    tree = HybridTree()

    connected_count = 1
    for position, label in enumerate(ordered_labels):
        token = construct_token(label, poss[position], True)
        if position in disconnected:
            tree.add_node("d" + str(position), token, True, False)
        else:
            tree.add_node("c" + str(connected_count), token, True, True)
            connected_count += 1

    for rule_id in der.ids():
        nonterminal = der.getRule(rule_id).lhs().nont()
        tree.add_node(rule_id, construct_token(nonterminal, '_', False))
        for child_id in der.child_ids(rule_id):
            tree.add_child(rule_id, child_id)
        # Terminal positions refer to the "c<k>" ids assigned above.
        for terminal_position in der.terminal_positions(rule_id):
            tree.add_child(rule_id, "c" + str(terminal_position))

    tree.add_to_root(der.root_id())
    tree.reorder()
    return tree
def query_result_tree(connection, exp, tree_id):
    """
    Load the result tree of an experiment for a given gold tree.

    If a result entry with status "parse" or "fallback" exists, the stored
    dependency structure (heads and dependency relations) is returned.
    Otherwise a left-branching tree without labels is built from the nodes
    of the gold tree: the node at sentence position 1 is the root and every
    other node is the child of the node at the preceding position. If no
    result entry exists at all, the status "simple_fallback" is reported.

    :param connection: open database connection offering cursor()
    :param exp: id of the experiment
    :param tree_id: id of the gold tree
    :rtype: str, HybridTree
    :return: status of the result and the corresponding tree
    :raise AssertionError: more than one result entry exists, or the
                           constructed tree has no root
    """
    cursor = connection.cursor()
    result_tree_ids = cursor.execute(
        '''SELECT rt_id, status FROM result_trees WHERE exp_id = ? AND t_id = ?''',
        (exp, tree_id)).fetchall()

    # parse:
    if result_tree_ids:
        assert (len(result_tree_ids) == 1)
        result_tree_id, status = result_tree_ids[0]
        if status in ["parse", "fallback"]:
            name = cursor.execute('''SELECT name FROM trees WHERE t_id = ?''',
                                  (tree_id, )).fetchall()[0][0]
            tree_nodes = cursor.execute((
                ' SELECT tree_nodes.sent_position, label, pos, result_tree_nodes.head, result_tree_nodes.deprel FROM result_tree_nodes\n'
                ' JOIN result_trees\n'
                ' ON result_tree_nodes.rt_id = result_trees.rt_id\n'
                ' JOIN tree_nodes\n'
                ' ON result_trees.t_id = tree_nodes.t_id\n'
                ' AND result_tree_nodes.sent_position = tree_nodes.sent_position\n'
                ' WHERE result_tree_nodes.rt_id = ?'), (result_tree_id, ))
            tree = HybridTree(name)
            for i, label, pos, head, deprel in tree_nodes:
                if deprel is None:
                    deprel = 'UNKNOWN'
                token = CoNLLToken(label, '_', pos, pos, '_', deprel)
                tree.add_node(str(i), token, True, True)
                # A head of 0 marks a root of the tree.
                if head == 0:
                    tree.add_to_root(str(i))
                else:
                    tree.add_child(str(head), str(i))
            # `tree.root is not []` compared identity with a fresh list and
            # was always true; check for a non-empty root collection instead.
            assert tree.root
            return status, tree

    # legacy: no entry found
    else:
        status = "simple_fallback"

    # Create a left branching tree without labels as default strategy
    tree_nodes = cursor.execute(
        ''' SELECT tree_nodes.sent_position, label, pos FROM tree_nodes WHERE tree_nodes.t_id = ?''',
        (tree_id, )).fetchall()

    tree = HybridTree()
    for i, label, pos in tree_nodes:
        token = CoNLLToken(label, '_', pos, pos, '_', '_')
        tree.add_node(str(i), token, True, True)
        # Left branching: the parent is the preceding position; the node
        # whose predecessor would be position 0 becomes the root.
        parent = i - 1
        if parent == 0:
            tree.add_to_root(str(i))
        else:
            tree.add_child(str(parent), str(i))
    # See above: the check must test emptiness, not identity.
    assert tree.root
    return status, tree
def parse_conll_corpus(path, ignore_punctuation, limit=sys.maxsize, start=0):
    """
    Lazily parse a dependency corpus (in CoNLL format) and generate hybrid trees.

    Lines starting with '#' are skipped. A new tree is started whenever a
    token line with id '1' is read; trees are numbered consecutively from 1
    and labelled 'tree<number>'. Token lines whose id matches MULTI_TOKEN
    are ignored. After the token lines of a sentence, a line matching
    EMPTY_LINE (or the end of the file) is expected.

    Note that the filtering of punctuation tokens is currently commented
    out: ignore_punctuation only decides whether a tree that fails the
    sanity checks is skipped silently or causes an exception.

    :param path: path to corpus
    :type path: str
    :param ignore_punctuation: skip (instead of raising on) trees that fail
                               the sanity checks
    :type ignore_punctuation: bool
    :param limit: stop reading once the tree counter reaches limit; trees
                  before start are counted as well
    :type limit: int
    :param start: only trees whose number is greater than start are generated
    :type start: int
    :return: a series of hybrid trees read from file
    :rtype: __generator[HybridTree]
    :raise Exception: unexpected input in corpus file, or a malformed tree
                      while ignore_punctuation is False
    """
    # print path
    with open(path) as file_content:
        tree_count = 0

        while tree_count < limit:
            tree = None

            # Read the first non-comment line of the next sentence; stop at
            # the end of the file.
            try:
                line = next(file_content)
                while line.startswith('#'):
                    line = next(file_content)
            except StopIteration:
                break

            match = CONLL_LINE.match(line)
            # Consume all consecutive token lines of the current sentence.
            while match:
                # NOTE(review): tree stays None if the first token line of a
                # sentence does not carry the id '1' — confirm that the
                # corpus format guarantees this.
                if match.group(1) == '1':
                    tree_count += 1
                    tree = HybridTree('tree' + str(tree_count))

                node_id = match.group(1)
                form = match.group(2)
                lemma = match.group(3)
                cpos = match.group(4)
                pos = match.group(5)
                feats = match.group(6)
                parent = match.group(7)
                deprel = match.group(8)

                # We ignore information about multi-tokens as present in the UD version of Prague Dep. TB
                if MULTI_TOKEN.search(node_id):
                    pass
                else:
                    # If punctuation is to be ignored, we
                    # remove it from the hybrid tree
                    # Punctuation according to definition
                    # cf. http://ilk.uvt.nl/conll/software.html#eval
                    # if not ignore_punctuation or form.translate(no_translation, string.punctuation):
                    tree.add_node(node_id, CoNLLToken(form, lemma, cpos, pos, feats, deprel), True, True)
                    if parent != '0':
                        tree.add_child(parent, node_id)
                    # else:
                    #    tree.add_node(node_id, CoNLLToken(form, lemma, pos, fine_grained_pos, feats, deprel), True, False)

                    # TODO: If punctuation is ignored and the root is punctuation,
                    # TODO: it is added to the tree anyhow.
                    # A parent of '0' marks a root of the tree.
                    if parent == '0':
                        tree.add_to_root(node_id)

                # Advance to the next non-comment line; the end of the file
                # is treated like an empty line terminating the sentence.
                try:
                    line = next(file_content)
                    while line.startswith('#'):
                        line = next(file_content)
                    match = CONLL_LINE.search(line)
                except StopIteration:
                    line = ''
                    match = None

            # Assume empty line, otherwise raise exception
            match = EMPTY_LINE.match(line)
            if not match:
                raise Exception("Unexpected input in CoNLL corpus file.")

            if tree:
                # basic sanity checks
                if not tree.root:
                    # FIXME: ignoring punctuation may lead to malformed trees
                    print("non-rooted")
                    if ignore_punctuation:
                        continue
                    raise Exception
                # elif root > 1:
                # FIXME: turkish corpus contains trees with more than one root
                # FIXME: currently, they are ignored
                #    continue
                elif tree.n_nodes() != len(tree.id_yield()) or len(tree.nodes()) != len(tree.full_yield()):
                    # FIXME: ignoring punctuation may lead to malformed trees
                    if ignore_punctuation:
                        continue
                    raise Exception(
                        '{4}: connected nodes: {0}, total nodes: {1}, full yield: {2}, connected yield: {3}'.format(
                            str(tree.n_nodes()), str(len(tree.nodes())), str(len(tree.full_yield())),
                            str(len(tree.id_yield())), tree.sent_label()))
                # Trees up to and including number `start` are read but not generated.
                if tree_count > start:
                    yield tree
def parse_with_pgf(grammar, forms, poss, bin):
    """
    Parse a POS sequence with a PGF grammar and convert the parse to a dependency tree.

    The escaped POS tags are parsed with the concrete grammar named
    ``bin + 'grammargfconcrete'``. The graphviz rendering of the first parse
    is read back into a constituent-like hybrid tree, which is then
    converted into a dependency tree with the node ids 1..n in sentence order.

    :param grammar: the grammar; grammar.languages is indexed by concrete name
    :type grammar: PGF
    :param forms: word forms of the sentence
    :param poss: POS tags of the sentence, aligned with forms
    :param bin: prefix of the name of the concrete grammar
    :return: the dependency tree, or None if the sentence cannot be parsed
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        parses = lcfrs.parse(sentence, n=1)
        _, expression = next(parses)
    except (StopIteration, pgf.ParseError):
        return None

    graph = lcfrs.graphvizParseTree(expression)
    assert isinstance(graph, str)

    # Patterns for node declarations and edges in the graphviz output,
    # looked up once instead of once per line.
    node_line = re.compile(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$')
    edge_line = re.compile(r'^ (n\d+) -- (n\d+)\s*$')
    binarization_label = re.compile(r'\d+X\d+$')

    tree = HybridTree()

    position = 0
    for line in graph.splitlines():
        node_match = node_line.search(line)
        if node_match:
            node_id = node_match.group(1)
            label = node_match.group(2)
            # Node numbers from 100000 upwards are handled as the ordered
            # (terminal) nodes of the tree.
            is_terminal = int(node_id[1:]) >= 100000
            if is_terminal:
                assert escape(poss[position]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[position], pos=poss[position], terminal=True),
                    True)
                position += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label, pos='_', terminal=False),
                    False)
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue

        edge_match = edge_line.search(line)
        if edge_match:
            tree.add_child(edge_match.group(1), edge_match.group(2))

    assert poss == [token.pos() for token in tree.token_yield()]

    dep_tree = HybridTree()
    # Inner node -> dependency node id (1-based) of the word heading it.
    head_table = defaultdict(lambda: None)
    # Terminal node -> inner node that the terminal is the head of.
    attachment_point = defaultdict(lambda: None)

    for index, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))

        # Start the search at the grandparent of the terminal and walk up to
        # the first node whose category does not end in '<digits>X<digits>'.
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            if not binarization_label.search(current_label):
                edge_label = un_escape(current_label)
                if edge_label == 'TOP1':
                    edge_label = 'ROOT1'
                # The last character of the category is dropped.
                dep_token.set_edge_label(edge_label[:-1])
                head_table[current] = index + 1
                attachment_point[node] = current
                break
            current = tree.parent(current)
        dep_tree.add_node(index + 1, dep_token, order=True)

    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        # The head of a word is the word heading the closest ancestor of
        # its attachment point that has an entry in head_table.
        ancestor = tree.parent(attachment_point[node])
        while ancestor:
            if head_table[ancestor]:
                dep_tree.add_child(head_table[ancestor], dep_node)
                break
            ancestor = tree.parent(ancestor)
        if not ancestor:
            dep_tree.add_to_root(dep_node)

    return dep_tree
class GeneralHybridTreeTestCase(unittest.TestCase):
    """
    Tests of the basic HybridTree queries on a small discontinuous tree.

    The fixture has the ordered nodes v1 (Piet), v21 (Marie), v (helpen)
    and v2 (lezen), plus v3 ('.'), which is added with its last add_node
    flag set to False. 'v' is the root with children v2 and v1; v21 is the
    child of v2.
    """
    tree = None

    def setUp(self):
        tree = HybridTree()

        # (node id, form, POS) in insertion order.
        words = (("v1", "Piet", "NP"), ("v21", "Marie", "N"),
                 ("v", "helpen", "VP"), ("v2", "lezen", "V"))
        for node_id, form, pos in words:
            tree.add_node(node_id, construct_conll_token(form, pos), True)

        for parent, child in (("v", "v2"), ("v", "v1"), ("v2", "v21")):
            tree.add_child(parent, child)

        tree.add_node("v3", construct_conll_token(".", "Punc"), True, False)
        tree.add_to_root("v")
        self.tree = tree

    def _reordered(self):
        """Reorder the fixture tree and return it."""
        self.tree.reorder()
        return self.tree

    def test_children(self):
        self.assertListEqual(self.tree.children('v'), ['v2', 'v1'])
        tree = self._reordered()
        self.assertListEqual(tree.children('v'), ['v1', 'v2'])

    def test_fringe(self):
        tree = self._reordered()
        self.assertListEqual(tree.fringe('v'), [2, 0, 3, 1])
        self.assertListEqual(tree.fringe('v2'), [3, 1])

    def test_n_spans(self):
        tree = self._reordered()
        self.assertEqual(tree.n_spans('v'), 1)
        self.assertEqual(tree.n_spans('v2'), 2)

    def test_n_gaps(self):
        tree = self._reordered()
        self.assertEqual(tree.n_gaps(), 1)

    def test_node_ids(self):
        tree = self._reordered()
        expected = sorted(['v', 'v1', 'v2', 'v21', 'v3'])
        self.assertListEqual(sorted(tree.nodes()), expected)

    def test_complete(self):
        tree = self._reordered()
        self.assertEqual(tree.complete(), True)

    def test_unlabelled_structure(self):
        tree = self._reordered()
        expected = ({0, 1, 2, 3}, [({0}, []), ({1, 3}, [({1}, [])])])
        self.assertTupleEqual(tree.unlabelled_structure(), expected)

    def test_max_n_spans(self):
        tree = self._reordered()
        self.assertEqual(tree.max_n_spans(), 2)

    def test_labelled_yield(self):
        tree = self._reordered()
        forms = [token.form() for token in tree.token_yield()]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen'])

    def test_full_labelled_yield(self):
        tree = self._reordered()
        forms = [token.form() for token in tree.full_token_yield()]
        self.assertListEqual(forms, ['Piet', 'Marie', 'helpen', 'lezen', '.'])

    def test_full_yield(self):
        tree = self._reordered()
        self.assertListEqual(tree.full_yield(), ['v1', 'v21', 'v', 'v2', 'v3'])

    # def test_labelled_spans(self):
    #     self.tree.reorder()
    #     self.assertListEqual(self.tree.labelled_spans(), [])

    def test_pos_yield(self):
        tree = self._reordered()
        tags = [token.pos() for token in tree.token_yield()]
        self.assertListEqual(tags, ['NP', 'N', 'VP', 'V'])

    def test_recursive_partitioning(self):
        tree = self._reordered()
        expected = ({0, 1, 2, 3},
                    [({0}, []), ({1, 3}, [({1}, []), ({3}, [])]), ({2}, [])])
        self.assertEqual(tree.recursive_partitioning(), expected)