def ptb_read_tree(source, return_empty=False, allow_empty_labels=False,
                  allow_empty_words=False, blank_line_coverage=False):
    """Read a single tree from the given PTB file.

    The function reads a character at a time, stopping as soon as a tree can
    be constructed, so multiple trees on a single line are manageable.

    Args:
        source: Object with a `read(n)` method that returns '' once the
            input is exhausted (for example an open text file or StringIO).
        return_empty: When true, a balanced span containing '()' makes the
            function return the string "Empty"; otherwise that span is
            discarded and reading continues.
        allow_empty_labels: Passed through to `tree_from_text`.
        allow_empty_words: Passed through to `tree_from_text`.
        blank_line_coverage: When true, return "Empty" if a newline is read
            while the text gathered so far is exactly one space.

    Returns:
        None if the input runs out before a complete bracketing is read,
        the string "Empty" in the two cases described above, and otherwise
        the tree built by `tree_from_text` after `ptb_cleaning` has been
        applied to it.

    >>> from io import StringIO
    >>> file_text = '''(ROOT (S
    ...   (NP-SBJ (NNP Scotty) )
    ...   (VP (VBD did) (RB not)
    ...     (VP (VB go)
    ...       (ADVP (RB back) )
    ...       (PP (TO to)
    ...         (NP (NN school) ))))
    ...   (. .) ))'''
    >>> in_file = StringIO(file_text)
    >>> ptb_read_tree(in_file)
    (ROOT (S (NP-SBJ (NNP Scotty)) (VP (VBD did) (RB not) (VP (VB go) (ADVP (RB back)) (PP (TO to) (NP (NN school))))) (. .)))"""
    cur_text = ''
    depth = 0
    while True:
        char = source.read(1)
        if char == '':
            # Ran out of input before a complete tree was seen.
            return None
        if char == '\n' and cur_text == ' ' and blank_line_coverage:
            return "Empty"
        # Newlines and tabs are normalised to spaces in the collected text.
        if char in '\n\t':
            char = ' '
        cur_text += char
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        if depth == 0:
            # Brackets are balanced: either we hold an empty tree, a
            # complete tree, or only inter-tree whitespace so far.
            if '()' in cur_text:
                if return_empty:
                    return "Empty"
                cur_text = ''
                continue
            if '(' in cur_text:
                break

    tree = tree_from_text(cur_text, allow_empty_labels, allow_empty_words)
    ptb_cleaning(tree)
    return tree
def conll_read_tree(source, return_empty=False, allow_empty_labels=False,
                    allow_empty_words=False, blank_line_coverage=False):
    """Read a single tree from the given CoNLL Shared Task OntoNotes data file.

    Lines are consumed from `source` up to the next blank line (or the end
    of input). Lines starting with '#' are ignored. Every other line is
    split on whitespace; field 3 is taken as the word, field 4 as the POS
    tag and field 5 as the parse bit, whose '*' is replaced by
    "(POS word)". The pieces are concatenated and handed to
    `tree_from_text`.

    Args:
        source: Object with a `readline()` method that returns '' once the
            input is exhausted.
        return_empty: Currently unused by this reader.
        allow_empty_labels: Currently unused by this reader.
        allow_empty_words: Currently unused by this reader.
        blank_line_coverage: Currently unused by this reader.

    Returns:
        The value returned by `tree_from_text` for the assembled text, or
        None when the input is exhausted and no token lines are pending.

    Raises:
        ValueError: If a token line has no POS tag column (field 4).

    >>> from io import StringIO
    >>> in_file = StringIO(CONLL_EXAMPLE)
    >>> tree = conll_read_tree(in_file)
    >>> print(tree)
    (TOP (S (NP (PRP They)) (VP (MD will) (VP (VB remain) (PP (IN on) (NP (NP (DT a) (NML (JJR lower) (HYPH -) (NN priority)) (NN list)) (SBAR (WHNP (WDT that)) (S (VP (VBZ includes) (NP (CD 17) (JJ other) (NNS countries))))))))) (. .)))"""
    token_lines = []
    while True:
        line = source.readline()
        # Check if we are out of input
        if line == '':
            # A final sentence that is not followed by a blank line must
            # still be parsed; only report end of input when nothing but
            # comment lines (e.g. "#end document") is pending.
            if not token_lines:
                return None
            break
        # Strip whitespace and see if this is the end of the parse
        line = line.strip()
        if line == '':
            break
        if line[0] != '#':
            token_lines.append(line)

    pieces = []
    for line in token_lines:
        fields = line.split()
        # Escape parentheses to avoid malformed trees
        word = fields[3].replace('(', '-LRB-').replace(')', '-RRB-')
        try:
            pos = fields[4].replace('(', '[').replace(')', ']')
        except IndexError:
            raise ValueError('conll file does not contain a POS tag column.')
        parse_bits = fields[5].split('*')
        pieces.append(
            '%s(%s %s)%s' % (parse_bits[0], pos, word, parse_bits[1]))
    return tree_from_text(''.join(pieces))
elif test_text == '': mprint("End of test input", out, 'err') break mprint("Sentence %d:" % sent_no, out, 'all') gold_text = gold_text.strip() test_text = test_text.strip() if len(gold_text) == 0: mprint("No gold tree", out, 'all') continue elif len(test_text) == 0: mprint("Not parsed", out, 'all') continue gold_complete_tree = pstree.tree_from_text(gold_text) treebanks.ptb_cleaning(gold_complete_tree) gold_tree = treebanks.apply_collins_rules(gold_complete_tree, False) if gold_tree is None: mprint("Empty gold tree", out, 'all') mprint(gold_complete_tree.__repr__(), out, 'all') mprint(gold_tree.__repr__(), out, 'all') continue if '()' in test_text: mprint("() test tree", out, 'all') continue test_complete_tree = pstree.tree_from_text(test_text) treebanks.ptb_cleaning(test_complete_tree) test_tree = treebanks.apply_collins_rules(test_complete_tree, False) if test_tree is None:
gold_text = gold_text.strip() test_text = test_text.strip() gold_relaxed_text = flatten_edited_nodes(gold_text) test_relaxed_text = flatten_edited_nodes(test_text) print test_text print test_relaxed_text if len(gold_text) == 0: mprint("No gold tree", out, 'all') continue elif len(test_text) == 0: mprint("Not parsed", out, 'all') continue gold_complete_tree = pstree.tree_from_text(gold_text, allow_empty_labels=True) gold_complete_tree = treebanks.homogenise_tree(gold_complete_tree) treebanks.ptb_cleaning(gold_complete_tree) gold_tree = treebanks.apply_collins_rules(gold_complete_tree, False) gold_relaxed_tree = pstree.tree_from_text(gold_relaxed_text, allow_empty_labels=True) gold_relaxed_tree = treebanks.homogenise_tree(gold_relaxed_tree) treebanks.ptb_cleaning(gold_relaxed_tree) gold_relaxed_tree = treebanks.apply_collins_rules( gold_relaxed_tree, False) if gold_tree is None: mprint("Empty gold tree", out, 'all') mprint(gold_complete_tree.__repr__(), out, 'all') mprint(gold_tree.__repr__(), out, 'all') continue
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set ts=2 sw=2 noet:

import sys
from nlp_util import pstree, head_finder


def headed_tree(tree, head_map, depth=0):
    """Render `tree` as a bracketed string that includes head information.

    Each node is written as "(label head" followed by " word" when the node
    has a word, then its rendered subtrees, then ")". "head" is the string
    form of element 1 of `head_finder.get_head(head_map, tree)`. Nodes
    below the root start on a new line indented with one tab per level.
    """
    prefix = '\n' + depth * '\t' if depth > 0 else ''
    rendered = prefix + '(' + tree.label + ' ' + str(
        head_finder.get_head(head_map, tree)[1])
    if tree.word is not None:
        rendered += ' ' + tree.word
    children = [
        headed_tree(child, head_map, depth + 1) for child in tree.subtrees
    ]
    return rendered + ''.join(children) + ')'


if __name__ == '__main__':
    print("Running doctest")
    import doctest
    doctest.testmod()

    tree = pstree.tree_from_text("(ROOT (SINV (S (NP (PRP It)) (VP (AUX 's) (NP (NP (DT a) (NN problem)) (SBAR (WHNP (WDT that)) (S (ADVP (RB clearly)) (VP (AUX has) (S (VP (TO to) (VP (VB be) (VP (VBN resolved))))))))))) (VP (VBD said)) (NP (NP (NNP David) (NNP Cooke)) (NP (NP (JJ executive) (NN director)) (PP (IN of) (NP (DT the) (NNP RTC)))))))")
    head_map = head_finder.pennconverter_find_heads(tree)
    print(headed_tree(tree, head_map))
def compute_overall_score(gold_file, test_file):
    """Print per-sentence and overall strict and relaxed scores.

    The two files are read line by line in parallel, one bracketed tree
    per line. Pairs where either line is blank are skipped. Every other
    pair is parsed with `pstree.tree_from_text`, passed through
    `treebanks.homogenise_tree` and `treebanks.ptb_cleaning`, and scored
    with `relaxed_parse_errors.counts_for_prf` and
    `relaxed_parse_errors.relaxed_counts_for_prf`. The counts are turned
    into precision / recall / F-score by `nlp_eval.calc_prf` and printed as
    percentages, per sentence and summed over the whole file.

    Args:
        gold_file: Path of the file holding the gold trees.
        test_file: Path of the file holding the trees to evaluate.

    Returns:
        None; all results are printed.

    Raises:
        AssertionError: If the two files do not have the same number of
            lines.
    """
    # Context managers close both handles as soon as the lines are read,
    # rather than leaving that to garbage collection.
    with open(gold_file) as gold_handle:
        gold_in = gold_handle.readlines()
    with open(test_file) as test_handle:
        test_in = test_handle.readlines()

    stats = {'out_evalb': [0, 0, 0], 'out_relaxed': [0, 0, 0]}
    assert len(gold_in) == len(test_in)

    for i, (gold_text, test_text) in enumerate(zip(gold_in, test_in)):
        print("Sent: " + str(i))

        # readlines() never yields an empty string (a blank line is '\n'),
        # so no end-of-input check is needed before stripping.
        gold_text = gold_text.strip()
        test_text = test_text.strip()
        if len(gold_text) == 0 or len(test_text) == 0:
            continue

        # Trees are scored after homogenising and PTB cleaning only;
        # treebanks.apply_collins_rules is deliberately not applied here.
        gold_tree = pstree.tree_from_text(gold_text, allow_empty_labels=True)
        gold_tree = treebanks.homogenise_tree(gold_tree)
        treebanks.ptb_cleaning(gold_tree)

        test_tree = pstree.tree_from_text(test_text, allow_empty_labels=True)
        test_tree = treebanks.homogenise_tree(test_tree)
        treebanks.ptb_cleaning(test_tree)

        gold_words = gold_tree.word_yield()
        test_words = test_tree.word_yield()
        if len(test_words.split()) != len(gold_words.split()):
            # Only a warning: the pair is still scored below.
            print("Sentence lengths do not match in sentence..." + str(i))
            print("Gold: " + repr(gold_words))
            print("Test: " + repr(test_words))

        match_strict, gold_strict, test_strict, _, _ = \
            relaxed_parse_errors.counts_for_prf(test_tree, gold_tree)
        match_relaxed, gold_relaxed, test_relaxed, _, _ = \
            relaxed_parse_errors.relaxed_counts_for_prf(test_tree, gold_tree)

        stats['out_evalb'][0] += match_strict
        stats['out_evalb'][1] += gold_strict
        stats['out_evalb'][2] += test_strict
        p, r, f = nlp_eval.calc_prf(match_strict, gold_strict, test_strict)
        print("Eval--Strict Evalb: %.2f %.2f %.2f" % (p * 100, r * 100, f * 100))

        stats['out_relaxed'][0] += match_relaxed
        stats['out_relaxed'][1] += gold_relaxed
        stats['out_relaxed'][2] += test_relaxed
        p, r, f = nlp_eval.calc_prf(match_relaxed, gold_relaxed, test_relaxed)
        print("Eval--Relaxed Edit: %.2f %.2f %.2f" % (p * 100, r * 100, f * 100))

    match, gold, test = stats['out_evalb']
    p, r, f = nlp_eval.calc_prf(match, gold, test)
    print("Overall--Standard EVALB %s: %.2f %.2f %.2f" % ('out', p * 100, r * 100, f * 100))

    match, gold, test = stats['out_relaxed']
    p, r, f = nlp_eval.calc_prf(match, gold, test)
    print("Overall--Relaxed EDIT %s: %.2f %.2f %.2f" % ('out', p * 100, r * 100, f * 100))