def disconnect_punctuation(trees):
    """
    Lazily disconnect punctuation from each hybrid tree in a corpus of hybrid trees.

    Each non-punctuation node is attached to its closest ancestor that is a root
    of the original tree or is not punctuation. If that ancestor is itself
    punctuation, the node becomes a root of the new tree instead. Punctuation
    nodes are still added to the new tree, but with a different final flag.
    Trees that turn out to consist of punctuation only are skipped.

    :param trees: corpus of hybrid trees
    :type trees: __generator[HybridTree]
    :return: corpus of hybrid trees
    :rtype: __generator[GeneralHybridTree]
    :raise Exception: if a rewritten tree fails the basic sanity checks
    """
    for source in trees:
        result = HybridTree(source.sent_label())

        # Only non-punctuation roots are carried over as roots.
        for root_id in source.root:
            if not is_punctuation(source.node_token(root_id).form()):
                result.add_to_root(root_id)

        for node_id in source.full_yield():
            token = source.node_token(node_id)

            if is_punctuation(token.form()):
                # NOTE(review): the two booleans are presumably the (order, connected)
                # flags of HybridTree.add_node -- confirm against its signature.
                result.add_node(node_id, token, True, False)
                continue

            # Climb past punctuation ancestors, but never beyond a root of the source tree.
            ancestor = source.parent(node_id)
            while (ancestor
                   and ancestor not in source.root
                   and is_punctuation(source.node_token(ancestor).form())):
                ancestor = source.parent(ancestor)

            if ancestor and is_punctuation(source.node_token(ancestor).form()):
                # The climb stopped at a punctuation root: promote the node to root.
                result.add_to_root(node_id)
            else:
                # ancestor may be None here (the node had no parent in the source tree).
                result.add_child(ancestor, node_id)
            result.add_node(node_id, token, True, True)

        if not result:
            continue

        # basic sanity checks
        if not result.root:
            only_punctuation = (len(result.id_yield()) == 0
                                and len(result.nodes()) == len(result.full_yield()))
            if only_punctuation:
                # Tree consists only of punctuation
                continue
            malformed = True
        else:
            malformed = (result.n_nodes() != len(result.id_yield())
                         or len(result.nodes()) != len(result.full_yield()))

        if malformed:
            # Dump both trees and the mismatching counts before giving up.
            print(source)
            print(result)
            print(result.sent_label())
            print("Root:", result.root)
            print("Nodes: ", result.n_nodes())
            print("Id_yield:", len(result.id_yield()), result.id_yield())
            print("Nodes: ", len(result.nodes()))
            print("full yield: ", len(result.full_yield()))
            raise Exception()

        yield result
def _next_content_line(lines):
    """
    Fetch the next line that is not a comment.

    :param lines: iterator over the lines of a corpus file
    :return: next line that does not start with '#', or None once the iterator is exhausted
    :rtype: str | None
    """
    for line in lines:
        if not line.startswith('#'):
            return line
    return None


def parse_conll_corpus(path, ignore_punctuation, limit=sys.maxsize, start=0):
    """
    Lazily parses a dependency corpus (in CoNLL format) and generates GeneralHybridTrees.

    Lines starting with '#' are skipped. A token whose id is '1' starts a new
    tree; a line that neither matches CONLL_LINE nor EMPTY_LINE aborts parsing.
    Trees are counted from 1 and only yielded once the count exceeds ``start``.

    :param path: path to corpus
    :type path: str
    :param ignore_punctuation: if true, trees failing the sanity checks are skipped
                               silently instead of raising
    :type ignore_punctuation: bool
    :param limit: stop generation after limit trees
    :type limit: int
    :param start: start generation with start'th tree
    :type start: int
    :return: a series of hybrid trees read from file
    :rtype: __generator[HybridTree]
    :raise Exception: unexpected input in corpus file, or a malformed tree while
                      ``ignore_punctuation`` is false
    """
    with open(path) as file_content:
        tree_count = 0

        while tree_count < limit:
            tree = None

            line = _next_content_line(file_content)
            if line is None:
                break

            match = CONLL_LINE.match(line)
            while match:
                node_id = match.group(1)
                form = match.group(2)
                lemma = match.group(3)
                cpos = match.group(4)
                pos = match.group(5)
                feats = match.group(6)
                parent = match.group(7)
                deprel = match.group(8)

                if node_id == '1':
                    tree_count += 1
                    tree = HybridTree('tree' + str(tree_count))

                # We ignore information about multiple tokens as present in the
                # UD version of Prague Dep. TB.
                if not MULTI_TOKEN.search(node_id):
                    if tree is None:
                        # A token line before any sentence-initial token '1':
                        # previously this surfaced as an AttributeError on None.
                        raise Exception(
                            "Unexpected input in CoNLL corpus file: token {0!r} "
                            "precedes the first token of a sentence.".format(node_id))

                    # TODO: If punctuation is ignored and the root is punctuation,
                    # TODO: it is added to the tree anyhow.
                    # Punctuation according to definition
                    # cf. http://ilk.uvt.nl/conll/software.html#eval
                    tree.add_node(node_id, CoNLLToken(form, lemma, cpos, pos, feats, deprel), True, True)
                    if parent != '0':
                        tree.add_child(parent, node_id)
                    else:
                        tree.add_to_root(node_id)

                line = _next_content_line(file_content)
                if line is None:
                    # End of file ends the current tree like an empty line would.
                    line = ''
                    match = None
                else:
                    match = CONLL_LINE.search(line)

            # Assume empty line, otherwise raise exception
            if not EMPTY_LINE.match(line):
                raise Exception("Unexpected input in CoNLL corpus file.")

            if tree:
                # basic sanity checks
                if not tree.root:
                    # FIXME: ignoring punctuation may lead to malformed trees
                    print("non-rooted")
                    if ignore_punctuation:
                        continue
                    raise Exception('{0}: tree has no root'.format(tree.sent_label()))
                # FIXME: turkish corpus contains trees with more than one root;
                # FIXME: they are not treated specially here.
                elif tree.n_nodes() != len(tree.id_yield()) or len(tree.nodes()) != len(tree.full_yield()):
                    # FIXME: ignoring punctuation may lead to malformed trees
                    if ignore_punctuation:
                        continue
                    raise Exception(
                        '{4}: connected nodes: {0}, total nodes: {1}, full yield: {2}, connected yield: {3}'.format(
                            str(tree.n_nodes()), str(len(tree.nodes())), str(len(tree.full_yield())),
                            str(len(tree.id_yield())), tree.sent_label()))
                if tree_count > start:
                    yield tree
def build_score_validator(baseline_grammar, grammarInfo, nont_map, storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """
    Build a PyCandidateScoreValidator from the k-best parses of a validation corpus.

    For every gold tree the parser is run on the tree's token yield, its k-best
    derivations are converted to hypergraphs, and each derivation is scored by
    comparing the dependency tree it evaluates to with the gold tree position
    by position:

    * ``"UAS"``: number of positions with the same head,
    * ``"LAC"``: number of positions with the same dependency label,
    * ``"LAS"``: number of positions where both agree.

    The maximum score of a tree is the length of the gold tree's id yield.

    :param baseline_grammar: grammar handed to each PyDerivationManager
    :param grammarInfo: passed through to PyCandidateScoreValidator
    :param nont_map: passed through to PyDerivationManager
    :param storageManager: passed through to PyCandidateScoreValidator
    :param term_labelling: provides ``prepare_parser_input`` for a token yield
    :param parser: parser offering ``set_input``, ``parse``, ``k_best_derivation_trees`` and ``clear``
    :param corpus_validation: object whose ``get_trees()`` yields the gold trees
    :param validationMethod: ``"LAS"``, ``"UAS"`` or ``"LAC"``
    :type validationMethod: str
    :return: the populated validator
    :rtype: PyCandidateScoreValidator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager, validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()

        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(
            entry[1] for entry in parser.k_best_derivation_trees())

        # Gold heads (1-based position in the id yield, 0 for a root) and labels per position.
        gold_ids = gold_tree.id_yield()
        gold_labels = {}
        gold_heads = {}
        for position, node_id in enumerate(gold_ids):
            parent_id = gold_tree.parent(node_id)
            gold_labels[position] = gold_tree.node_token(node_id).deprel()
            if parent_id is None:
                assert node_id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_ids.index(parent_id) + 1

        scores = []
        for _, der in parser.k_best_derivation_trees():
            der_count += 1

            # Evaluate the derivation into a fresh hybrid tree over copies of the gold tokens.
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False, construct_conll_token)

            las, uas, lac = 0, 0, 0
            predicted_ids = h_tree.id_yield()
            for position, node_id in enumerate(predicted_ids):
                parent_id = h_tree.parent(node_id)
                if parent_id is None:
                    assert node_id in h_tree.root
                    head = 0
                else:
                    head = predicted_ids.index(parent_id) + 1
                label = h_tree.node_token(node_id).deprel()

                same_head = gold_heads[position] == head
                same_label = gold_labels[position] == label
                if same_head:
                    uas += 1
                if same_label:
                    lac += 1
                if same_head and same_label:
                    las += 1

            # NOTE(review): any other validationMethod records no score for the
            # derivation, so scores may end up shorter than the derivation list.
            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_ids)
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # An empty validation corpus used to raise ZeroDivisionError here.
    average_derivations = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with", average_derivations, "derivations on average")
    return validator
def parse_with_pgf(grammar, forms, poss, bin):
    """
    Parse a POS sequence with a PGF grammar and turn the best parse into a dependency tree.

    The escaped POS tags are joined by blanks and parsed with the concrete
    language ``bin + 'grammargfconcrete'``. The graphviz rendering of the first
    parse is read line by line to rebuild a constituent tree, which is then
    converted into a dependency tree with one node per leaf (ids 1..n).

    :param grammar: grammar whose ``languages`` mapping holds the concrete language
    :type grammar: PGF
    :param forms: word forms, aligned with ``poss``
    :param poss: POS tags of the sentence
    :param bin: prefix of the concrete language's name
    :return: the dependency tree, or None if the parser yields no result or
             raises ``pgf.ParseError``
    :rtype: HybridTree | None
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # e.g. "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        _, expr = next(lcfrs.parse(sentence, n=1))
    except (StopIteration, pgf.ParseError):
        return None

    dot = lcfrs.graphvizParseTree(expr)
    assert isinstance(dot, str)

    node_pattern = re.compile(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$')
    # Parent-child edges. The prefix is deliberately tight: one or two blanks only.
    # NOTE(review): the GF runtime appears to indent parent-child edges by two
    # blanks and same-rank ordering chains by four, so a looser prefix such as
    # \s* would also pick up ordering chains of length two -- confirm against
    # the pgf runtime in use.
    edge_pattern = re.compile(r'^ {1,2}(n\d+) -- (n\d+)\s*$')
    binarization_pattern = re.compile(r'\d+X\d+$')

    # --- rebuild the constituent tree from the graphviz dump ---
    tree = HybridTree()
    leaf_index = 0
    for line in dot.splitlines():
        match = node_pattern.search(line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            # Node ids from 100000 upwards are treated as leaves, consumed in sentence order.
            if int(node_id[1:]) >= 100000:
                assert escape(poss[leaf_index]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[leaf_index], pos=poss[leaf_index], terminal=True),
                    True)
                leaf_index += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label, pos='_', terminal=False),
                    False)
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue

        match = edge_pattern.search(line)
        if match:
            tree.add_child(match.group(1), match.group(2))

    assert poss == [token.pos() for token in tree.token_yield()]

    # --- convert the constituent tree into a dependency tree ---
    dep_tree = HybridTree()
    head_table = {}        # constituent node -> dependency id of the leaf it was assigned to
    attachment_point = {}  # leaf -> constituent node that supplied its edge label
    for index, leaf in enumerate(tree.id_yield()):
        token = tree.node_token(leaf)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))

        # Start at the leaf's grandparent and climb to the first ancestor whose
        # category does not end in <digits>X<digits>.
        current = tree.parent(tree.parent(leaf))
        while current:
            current_label = tree.node_token(current).category()
            if not binarization_pattern.search(current_label):
                edge_label = un_escape(current_label)
                if edge_label == 'TOP1':
                    edge_label = 'ROOT1'
                # The last character of the category is dropped to obtain the edge label.
                dep_token.set_edge_label(edge_label[:-1])
                head_table[current] = index + 1
                attachment_point[leaf] = current
                break
            current = tree.parent(current)
        dep_tree.add_node(index + 1, dep_token, order=True)

    for leaf, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        # The head is the leaf recorded for the closest proper ancestor of the attachment point.
        ancestor = tree.parent(attachment_point.get(leaf))
        while ancestor:
            if head_table.get(ancestor):
                dep_tree.add_child(head_table[ancestor], dep_node)
                break
            ancestor = tree.parent(ancestor)
        if not ancestor:
            dep_tree.add_to_root(dep_node)

    return dep_tree