示例#1
0
def ptb_read_tree(source,
                  return_empty=False,
                  allow_empty_labels=False,
                  allow_empty_words=False,
                  blank_line_coverage=False):
    """Read a single tree from the given PTB file.

    The function reads a character at a time, stopping as soon as a tree can
    be constructed, so multiple trees on a single line are manageable.

    Args:
        source: Object providing ``read(1)`` that returns one character, or
            ``''`` once the input is exhausted.
        return_empty: If true, return the string ``"Empty"`` when a balanced
            span containing ``()`` is read; otherwise that span is discarded
            and reading continues.
        allow_empty_labels: Forwarded to ``tree_from_text``.
        allow_empty_words: Forwarded to ``tree_from_text``.
        blank_line_coverage: If true, return ``"Empty"`` when a newline is
            read while the buffer holds exactly one space character.

    Returns:
        ``None`` if the input ends before a complete bracketing is read,
        the string ``"Empty"`` in the two cases described above, and
        otherwise the object produced by ``tree_from_text`` after it has been
        passed through ``ptb_cleaning``.

    >>> from io import StringIO
    >>> file_text = '''(ROOT (S
    ...   (NP-SBJ (NNP Scotty) )
    ...   (VP (VBD did) (RB not)
    ...     (VP (VB go)
    ...       (ADVP (RB back) )
    ...       (PP (TO to)
    ...         (NP (NN school) ))))
    ...   (. .) ))'''
    >>> in_file = StringIO(file_text)
    >>> ptb_read_tree(in_file)
    (ROOT (S (NP-SBJ (NNP Scotty)) (VP (VBD did) (RB not) (VP (VB go) (ADVP (RB back)) (PP (TO to) (NP (NN school))))) (. .)))
    """
    cur_text = ''
    depth = 0
    while True:
        char = source.read(1)
        if char == '':
            # Input exhausted before a complete bracketing was seen.
            return None
        if blank_line_coverage and char == '\n' and cur_text == ' ':
            return "Empty"
        # Newlines and tabs are normalised to spaces in the buffered text.
        if char in '\n\t':
            char = ' '
        cur_text += char
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        if depth != 0:
            # Still inside a bracketing; keep accumulating.
            continue
        if '()' in cur_text:
            if return_empty:
                return "Empty"
            # Drop the empty bracketing and start over with a clean buffer.
            cur_text = ''
            continue
        if '(' in cur_text:
            # Balanced and contains at least one bracket: a full tree.
            break

    tree = tree_from_text(cur_text, allow_empty_labels, allow_empty_words)
    ptb_cleaning(tree)
    return tree
示例#2
0
def conll_read_tree(source,
                    return_empty=False,
                    allow_empty_labels=False,
                    allow_empty_words=False,
                    blank_line_coverage=False):
    """Read a single tree from the given CoNLL Shared Task OntoNotes data file.

    Lines are consumed up to the first blank line. Lines starting with ``#``
    are ignored; for every other line the whitespace-separated columns at
    index 3 (word), 4 (POS tag) and 5 (parse bit containing ``*``) are
    combined into a bracketed string that is handed to ``tree_from_text``.

    Args:
        source: Object providing ``readline()`` that returns ``''`` at end of
            input.
        return_empty, allow_empty_labels, allow_empty_words,
        blank_line_coverage: Accepted but not used by this function.

    Returns:
        ``None`` if the input ends before a terminating blank line is read
        (any lines collected so far are discarded); otherwise the result of
        ``tree_from_text`` on the assembled bracketed text.

    Raises:
        ValueError: If a data line has no POS tag column (index 4).
        IndexError: If a data line lacks the word column (index 3), the
            parse column (index 5), or the parse bit contains no ``*``.

    >>> from io import StringIO
    >>> in_file = StringIO(CONLL_EXAMPLE)
    >>> tree = conll_read_tree(in_file)
    >>> print(tree)
    (TOP (S (NP (PRP They)) (VP (MD will) (VP (VB remain) (PP (IN on) (NP (NP (DT a) (NML (JJR lower) (HYPH -) (NN priority)) (NN list)) (SBAR (WHNP (WDT that)) (S (VP (VBZ includes) (NP (CD 17) (JJ other) (NNS countries))))))))) (. .)))
    """
    rows = []
    while True:
        line = source.readline()
        # Check if we are out of input
        if line == '':
            return None
        # A line that is blank after stripping marks the end of the parse
        line = line.strip()
        if line == '':
            break
        rows.append(line)

    pieces = []
    for row in rows:
        if row.startswith('#'):
            continue
        fields = row.split()
        # Escape parentheses to avoid malformed trees. The closing bracket
        # maps to -RRB- so it stays distinguishable from the opening one.
        word = fields[3].replace('(', '-LRB-').replace(')', '-RRB-')
        try:
            pos = fields[4].replace('(', '[').replace(')', ']')
        except IndexError:
            raise ValueError('conll file does not contain a POS tag column.')
        # The parse bit looks like "<opening>*<closing>"; the leaf goes where
        # the star is.
        parse_bit = fields[5].split('*')
        pieces.append('%s(%s %s)%s' % (parse_bit[0], pos, word, parse_bit[1]))
    return tree_from_text(''.join(pieces))
示例#3
0
        elif test_text == '':
            mprint("End of test input", out, 'err')
            break

        mprint("Sentence %d:" % sent_no, out, 'all')

        gold_text = gold_text.strip()
        test_text = test_text.strip()
        if len(gold_text) == 0:
            mprint("No gold tree", out, 'all')
            continue
        elif len(test_text) == 0:
            mprint("Not parsed", out, 'all')
            continue

        gold_complete_tree = pstree.tree_from_text(gold_text)
        treebanks.ptb_cleaning(gold_complete_tree)
        gold_tree = treebanks.apply_collins_rules(gold_complete_tree, False)
        if gold_tree is None:
            mprint("Empty gold tree", out, 'all')
            mprint(gold_complete_tree.__repr__(), out, 'all')
            mprint(gold_tree.__repr__(), out, 'all')
            continue

        if '()' in test_text:
            mprint("() test tree", out, 'all')
            continue
        test_complete_tree = pstree.tree_from_text(test_text)
        treebanks.ptb_cleaning(test_complete_tree)
        test_tree = treebanks.apply_collins_rules(test_complete_tree, False)
        if test_tree is None:
        gold_text = gold_text.strip()
        test_text = test_text.strip()

        gold_relaxed_text = flatten_edited_nodes(gold_text)
        test_relaxed_text = flatten_edited_nodes(test_text)
        print test_text
        print test_relaxed_text
        if len(gold_text) == 0:
            mprint("No gold tree", out, 'all')
            continue
        elif len(test_text) == 0:
            mprint("Not parsed", out, 'all')
            continue

        gold_complete_tree = pstree.tree_from_text(gold_text,
                                                   allow_empty_labels=True)
        gold_complete_tree = treebanks.homogenise_tree(gold_complete_tree)
        treebanks.ptb_cleaning(gold_complete_tree)
        gold_tree = treebanks.apply_collins_rules(gold_complete_tree, False)

        gold_relaxed_tree = pstree.tree_from_text(gold_relaxed_text,
                                                  allow_empty_labels=True)
        gold_relaxed_tree = treebanks.homogenise_tree(gold_relaxed_tree)
        treebanks.ptb_cleaning(gold_relaxed_tree)
        gold_relaxed_tree = treebanks.apply_collins_rules(
            gold_relaxed_tree, False)
        if gold_tree is None:
            mprint("Empty gold tree", out, 'all')
            mprint(gold_complete_tree.__repr__(), out, 'all')
            mprint(gold_tree.__repr__(), out, 'all')
            continue
示例#5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set ts=2 sw=2 noet:

import sys
from nlp_util import pstree, head_finder

def headed_tree(tree, head_map, depth=0):
	"""Render `tree` as a bracketed string annotated with head information.

	Each node is written as "(<label> <head>[ <word>]" followed by its
	children and a closing ")". Every node below the root starts on a new
	line indented with one tab per level of depth.

	<head> is str() of element [1] of head_finder.get_head(head_map, tree)
	(presumably the head description for the node - confirm against
	head_finder).
	"""
	head_info = head_finder.get_head(head_map, tree)[1]
	indent = ('\n' + depth * '\t') if depth > 0 else ''
	parts = [indent + '(' + tree.label + ' ' + str(head_info)]
	if tree.word is not None:
		parts.append(' ' + tree.word)
	for child in tree.subtrees:
		parts.append(headed_tree(child, head_map, depth + 1))
	parts.append(')')
	return ''.join(parts)

if __name__ == '__main__':
	# Script entry point: run the module doctests, then print the headed
	# rendering of one hard-coded example tree.
	# A single parenthesised argument prints the same text whether print is
	# a statement or a function.
	print("Running doctest")
	import doctest
	doctest.testmod()

	example_text = "(ROOT (SINV (S (NP (PRP It)) (VP (AUX 's) (NP (NP (DT a) (NN problem)) (SBAR (WHNP (WDT that)) (S (ADVP (RB clearly)) (VP (AUX has) (S (VP (TO to) (VP (VB be) (VP (VBN resolved))))))))))) (VP (VBD said)) (NP (NP (NNP David) (NNP Cooke)) (NP (NP (JJ executive) (NN director)) (PP (IN of) (NP (DT the) (NNP RTC)))))))"
	tree = pstree.tree_from_text(example_text)
	head_map = head_finder.pennconverter_find_heads(tree)
	print(headed_tree(tree, head_map))
		elif test_text == '':
			mprint("End of test input", out, 'err')
			break

		mprint("Sentence %d:" % sent_no, out, 'all')

		gold_text = gold_text.strip()
		test_text = test_text.strip()
		if len(gold_text) == 0:
			mprint("No gold tree", out, 'all')
			continue
		elif len(test_text) == 0:
			mprint("Not parsed", out, 'all')
			continue

		gold_complete_tree = pstree.tree_from_text(gold_text)
		treebanks.ptb_cleaning(gold_complete_tree)
		gold_tree = treebanks.apply_collins_rules(gold_complete_tree, False)
		if gold_tree is None:
			mprint("Empty gold tree", out, 'all')
			mprint(gold_complete_tree.__repr__(), out, 'all')
			mprint(gold_tree.__repr__(), out, 'all')
			continue

		if '()' in test_text:
			mprint("() test tree", out, 'all')
			continue
		test_complete_tree = pstree.tree_from_text(test_text)
		treebanks.ptb_cleaning(test_complete_tree)
		test_tree = treebanks.apply_collins_rules(test_complete_tree, False)
		if test_tree is None:
示例#7
0
def _load_clean_tree(text):
    """Parse one bracketed tree, homogenise it and apply PTB cleaning."""
    tree = pstree.tree_from_text(text, allow_empty_labels=True)
    tree = treebanks.homogenise_tree(tree)
    treebanks.ptb_cleaning(tree)
    return tree


def compute_overall_score(gold_file, test_file):
    """Print per-sentence and overall strict and relaxed P/R/F scores.

    The two files are read line by line in parallel, one tree per line.
    Pairs where either line is blank after stripping are skipped. For every
    remaining pair the trees are parsed and cleaned (apply_collins_rules is
    not applied), scored with relaxed_parse_errors.counts_for_prf and
    relaxed_parse_errors.relaxed_counts_for_prf, and the match/gold/test
    counts are accumulated for the overall figures printed at the end.

    Args:
        gold_file: Path of the file holding the gold trees.
        test_file: Path of the file holding the trees to evaluate.

    Returns:
        None. All results are written to standard output.

    Raises:
        AssertionError: If the two files have a different number of lines.
    """
    # Context managers make sure both handles are closed after reading.
    with open(gold_file) as gold_handle:
        gold_in = gold_handle.readlines()
    with open(test_file) as test_handle:
        test_in = test_handle.readlines()

    # Raised explicitly so the check is not stripped when running with -O.
    if len(gold_in) != len(test_in):
        raise AssertionError(
            "gold and test files differ in length: %d vs %d lines"
            % (len(gold_in), len(test_in)))

    # Each entry holds [match, gold, test] counts.
    stats = {'out_evalb': [0, 0, 0], 'out_relaxed': [0, 0, 0]}

    # readlines() never yields an empty string, so no end-of-input test is
    # needed inside the loop; blank lines are caught after stripping.
    for i, (gold_line, test_line) in enumerate(zip(gold_in, test_in)):
        print("Sent: " + str(i))

        gold_text = gold_line.strip()
        test_text = test_line.strip()
        if not gold_text or not test_text:
            continue

        gold_tree = _load_clean_tree(gold_text)
        test_tree = _load_clean_tree(test_text)

        gold_words = gold_tree.word_yield()
        test_words = test_tree.word_yield()
        if len(test_words.split()) != len(gold_words.split()):
            # Only reported; the pair is still scored below.
            print("Sentence lengths do not match in sentence..." + str(i))
            print("Gold: " + repr(gold_words))
            print("Test: " + repr(test_words))

        match_strict, gold_strict, test_strict, _, _ = \
            relaxed_parse_errors.counts_for_prf(test_tree, gold_tree)
        match_relaxed, gold_relaxed, test_relaxed, _, _ = \
            relaxed_parse_errors.relaxed_counts_for_prf(test_tree, gold_tree)

        stats['out_evalb'][0] += match_strict
        stats['out_evalb'][1] += gold_strict
        stats['out_evalb'][2] += test_strict
        p, r, f = nlp_eval.calc_prf(match_strict, gold_strict, test_strict)
        print("Eval--Strict Evalb: %.2f  %.2f  %.2f" % (p * 100, r * 100,
                                                        f * 100))

        stats['out_relaxed'][0] += match_relaxed
        stats['out_relaxed'][1] += gold_relaxed
        stats['out_relaxed'][2] += test_relaxed
        p, r, f = nlp_eval.calc_prf(match_relaxed, gold_relaxed, test_relaxed)
        print("Eval--Relaxed Edit: %.2f  %.2f  %.2f" % (p * 100, r * 100,
                                                        f * 100))

    overall_reports = (
        ('out_evalb', "Overall--Standard EVALB %s: %.2f  %.2f  %.2f"),
        ('out_relaxed', "Overall--Relaxed EDIT %s: %.2f  %.2f  %.2f"),
    )
    for key, template in overall_reports:
        match, gold, test = stats[key]
        p, r, f = nlp_eval.calc_prf(match, gold, test)
        print(template % ('out', p * 100, r * 100, f * 100))