Пример #1
0
def test_punct():
    """Check that moving punctuation yields the same fan-out as removing it.

    The sample corpus is read with ``punct='move'`` and with
    ``punct='remove'``; the phrasal subtrees of each pair of trees are
    sorted by their lowest leaf index and compared pairwise on fan-out.
    A third, unmodified reading is only printed as context on a mismatch.
    """
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        # Non-empty node whose first child is itself a Tree.
        return node and isinstance(node[0], Tree)

    def firstleaf(node):
        return min(node.leaves())

    path = 'alpinosample.export'
    moved = NegraCorpusReader(path, punct='move')
    removed = list(NegraCorpusReader(path, punct='remove').trees().values())
    untouched = list(NegraCorpusReader(
        path, headrules=None, encoding='iso-8859-1').trees().values())

    rows = zip(count(), moved.trees().values(), moved.sents().values(),
               removed, untouched)
    for num, movedtree, words, removedtree, plaintree in rows:
        print(num, end='. ')
        left = sorted(addbitsets(movedtree).subtrees(phrasal), key=firstleaf)
        right = sorted(addbitsets(removedtree).subtrees(phrasal),
                       key=firstleaf)
        for a, b in zip(left, right):
            fana, fanb = fanout(a), fanout(b)
            if fana != fanb:
                # Dump the whole sentence before the assertion fires.
                print(' '.join(words))
                print(movedtree)
                print(removedtree)
                print(plaintree)
            assert fana == fanb, '%d %d\n%s\n%s' % (fana, fanb, a, b)
    print()
Пример #2
0
def test_optimalbinarize():
    """Compare optimal binarization against plain binarization.

    For every selected tree that contains a node with fan-out other than 1,
    the maximum ``complexityfanout`` over the optimally binarized tree must
    not exceed that of the ordinary binarization, both for the
    non-head-driven and for the head-driven (h=1) setting.
    """
    from discodop.treetransforms import optimalbinarize, complexityfanout
    from discodop.treebank import NegraCorpusReader

    def worst(binarized):
        # Highest complexity/fan-out value among all subtrees.
        return max(map(complexityfanout, binarized.subtrees()))

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    total = violations = violationshd = 0
    # NOTE(review): [:-2000] drops the final 2000 trees, so a corpus with
    # 2000 trees or fewer produces no iterations at all -- confirm intended.
    selected = list(corpus.trees().values())[:-2000]
    for n, (tree, sent) in enumerate(zip(selected, corpus.sents().values())):
        if all(fanout(node) == 1 for node in addbitsets(tree).subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1

        optbin = optimalbinarize(
            tree.copy(True), headdriven=False, h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        optmax, normmax = worst(optbin), worst(normbin)
        if optmax > normmax:
            print('non-hd\n', tree)
            print(optmax, optbin)
            print(normmax, normbin, '\n')
            violations += 1

        optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        optmax, normmax = worst(optbin), worst(normbin)
        if optmax > normmax:
            print('hd\n', tree)
            print(optmax, optbin)
            print(normmax, normbin, '\n')
            violationshd += 1

    summary = (violations, total, violationshd, total)
    print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % summary)
    assert violations == violationshd == 0
Пример #3
0
def test_transforms():
    """Test reversibility of Tiger transformations.

    Applies the 'S-RC', 'VP-GF' and 'NP' transformations to every tree of
    the sample corpus, reverses them, and compares the bracketings of the
    result with those of an independently read copy of the corpus (first
    100 sentences). Mismatches are printed in detail; a summary of matching
    and exactly matching sentences is printed at the end. The test only
    reports; it contains no assertion.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
      bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two separate readers: `nn` supplies the trees that get transformed,
    # `n` supplies the trees they are compared against.
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(),
                              nn.sents().values())
    ]
    print('\ntransformed')
    correct = exact = e = 0
    for a, b, c, d in islice(
            zip(n.trees().values(),
                n.sents().values(), trees, count()), 100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        # Debug hook: set z to a sentence number to force the detailed
        # report for that sentence even when it matches.
        z = -1  # 825
        if c1 != c2 or e == z:
            # NOTE(review): c1 comes from the untransformed tree, yet it is
            # used as the denominator for "precision" -- confirm the
            # precision/recall naming is as intended.
            precision = len(set(c1) & set(c2)) / len(set(c1))
            recall = len(set(c1) & set(c2)) / len(set(c2))
            if precision != 1.0 or recall != 1.0 or d == z:
                # encode() returns bytes on Python 3; decode back to str so
                # that str.join does not raise TypeError.
                print(
                    d, ' '.join(
                        ':'.join((str(idx),
                                  word.encode('unicode-escape').decode()))
                        for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2), 'gold-transformed',
                      set(c2) - set(c1), 'transformed-gold',
                      set(c1) - set(c2))
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
Пример #4
0
def test_grammar(debug=False):
    """Extract grammars from the sample corpus and parse with Double-DOP.

    Builds a treebank grammar (only when debugging), a DOP reduction from
    the first two trees, and a Double-DOP grammar from the first ten trees;
    asserts that the latter passes ``testgrammar`` and then parses every
    sentence of the corpus with it.

    :param debug: when true, print grammars, sentences and parser messages.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')

    reduction = dopreduction(trees[:2], sents[:2])
    grammar = Grammar(reduction[0], start=trees[0].label)
    if debug:
        print(grammar)
    grammar.testgrammar()  # result deliberately ignored for the reduction

    grammarx, _backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        None,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    ok, report = grammar.testgrammar()
    assert ok, 'RFE should sum to 1.\n%s' % report

    for gold, words in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (w.encode('unicode-escape').decode() for w in words)
            print('sentence:', ' '.join(escaped))
        chart, parsemsg = plcfrs.parse(words, grammar, exhaustive=True)
        if debug:
            print('\n', parsemsg, '\ngold ', gold, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # A unary chain paired with a ten-word sentence must still be accepted.
    chain = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))
Пример #5
0
 def test_transform(self):
     """Check that node-level transformations are undone by reversetransform.

     Each of 'FUNC-NODE', 'MORPH-NODE' and 'LEMMA-NODE' is tried on its
     own and then all three together; for the first 100 trees the
     bracketings before transforming and after reversing must be equal.
     """
     from discodop.treebanktransforms import transform, reversetransform, \
       bracketings
     from discodop.treebank import NegraCorpusReader
     reference = NegraCorpusReader('alpinosample.export')
     singles = [('FUNC-NODE', ), ('MORPH-NODE', ), ('LEMMA-NODE', )]
     combined = ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')
     for transformations in singles + [combined]:
         # A fresh reader per setting supplies the trees to be transformed.
         reader = NegraCorpusReader('alpinosample.export')
         transformed = []
         for tree, sent in zip(reader.trees().values(),
                               reader.sents().values()):
             transformed.append(transform(tree, sent, transformations))
         pairs = zip(reference.trees().values(), transformed)
         for gold, changed in islice(pairs, 100):
             before = bracketings(canonicalize(gold))
             restored = reversetransform(changed.copy(True), transformations)
             after = bracketings(canonicalize(restored))
             assert before == after, (
                 'mismatch with %r\nbefore: %r\nafter: %r' %
                 (transformations, before, after))
Пример #6
0
def test_splitdisc():
    """Check that splitdiscnodes followed by mergediscnodes is lossless.

    Every tree of the sample corpus is split and merged again; the result
    must compare equal to the tree it started from. Percentages are
    reported relative to the number of sentences in the corpus.
    """
    from discodop.treebank import NegraCorpusReader
    corpus = NegraCorpusReader('alpinosample.export')
    tally = {True: 0, False: 0}
    for tree in corpus.trees().values():
        roundtrip = mergediscnodes(splitdiscnodes(tree))
        tally[bool(roundtrip == tree)] += 1
    correct, wrong = tally[True], tally[False]
    total = len(corpus.sents())
    print('disc. split-merge: correct', correct, '=',
          100. * correct / total, '%')
    print('disc. split-merge: wrong', wrong, '=',
          100. * wrong / total, '%')
    assert wrong == 0
Пример #7
0
def test_grammar(debug=False):
    """Extract grammars from the sample corpus and recover Double-DOP parses.

    Builds a treebank grammar (only when debugging), a DOP reduction from
    the first two trees, and a Double-DOP grammar from the first ten trees.
    Each sentence is parsed with the latter; up to 1000 derivations are
    turned back into parse trees via ``recoverfragments`` and it is
    asserted that no parse tree lists the same (derivation, score) pair
    twice.

    :param debug: when true, print grammars, sentences, parser messages and
        the parse trees with their derivations.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')

    reduction = dopreduction(trees[:2], sents[:2])
    grammar = Grammar(reduction[0], start=trees[0].label)
    if debug:
        print(grammar)
    grammar.testgrammar()  # result deliberately ignored for the reduction

    grammarx, backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    byscore = itemgetter(1)
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (w.encode('unicode-escape').decode() for w in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # treeprob: parse tree string -> summed exp(-score);
            # treederivs: parse tree string -> its (derivation, score) pairs.
            treeprob, treederivs = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, score) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                treeprob[treestr] = treeprob.get(treestr, 0.0) + exp(-score)
                treederivs.setdefault(treestr, []).append((deriv, score))
            if debug:
                print(len(treeprob), 'parsetrees',
                      sum(len(found) for found in treederivs.values()),
                      'derivations')
            for treestr, prob in sorted(treeprob.items(), key=byscore):
                if debug:
                    print(prob, treestr, '\nmatch:', treestr == str(tree))
                found = treederivs[treestr]
                if len(set(found)) != len(found):
                    # Show the chart before failing on the duplicate.
                    print('chart:\n', chart)
                    assert len(set(found)) == len(found)
                if debug:
                    for deriv, score in sorted(found, key=byscore):
                        print(' <= %6g %s' % (exp(-score), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # A unary chain paired with a ten-word sentence must still be accepted.
    chain = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))
Пример #8
0
"""https://gist.github.com/andreasvc/7507135#file-tigersplit-py"""
""" The train-test split described in Hall & Nivre (2008),
Parsing Discontinuous Phrase Structure with Grammatical Functions.

Corpus is divided in Sections 0-9, where sentence i is allocated to section i mod 10.
For development train on sections 2-9; evaluate on section 1.
For final evaluation (test) train on sections 1-9; evaluate on section 0.
"""
import io
import os
from discodop.treebank import NegraCorpusReader

#corpus = NegraCorpusReader('tiger/corpus', 'tiger_release_aug07.export',
		#encoding='iso-8859-1')

corpus = NegraCorpusReader('tiger21/corpus/tiger_release_aug07.export',
		encoding='iso-8859-1')

#os.mkdir('tiger-split/')
# Sentences are numbered from 1; sentence n belongs to section n % 10.
# Each output file is paired with the predicate selecting its sections.
# The files are written in this order: dev-train, dev, test-train, test.
for filename, keep in (
		('tiger21/tigertraindev.export', lambda n: n % 10 > 1),
		('tiger21/tigerdev.export', lambda n: n % 10 == 1),
		('tiger21/tigertraintest.export', lambda n: n % 10 != 0),
		('tiger21/tigertest.export', lambda n: n % 10 == 0)):
	# `with` guarantees the file is flushed and closed even on error,
	# instead of relying on garbage collection of the file object.
	with io.open(filename, 'w', encoding='utf8') as out:
		out.writelines(
				a for n, a in enumerate(corpus.blocks().values(), 1)
				if keep(n))