def testpunct():
    """ Verify that punctuation movement does not increase fan-out.

    Reads 'alpinosample.export' from the current directory with punct='move'
    and with punct='remove', and checks sentence by sentence that the
    phrasal nodes of both versions have pairwise equal fan-out. Nodes are
    paired after sorting each side by its lowest leaf; zip() stops at the
    shorter side, so surplus nodes on either side are not compared.

    Raises AssertionError at the first pair with differing fan-out, after
    printing the sentence and the three versions of its tree. """
    from discodop.treetransforms import addbitsets, fanout
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        """ Truthy for non-empty nodes whose first child is a Tree. """
        return len(node) and isinstance(node[0], Tree)

    def lowestleaf(node):
        """ Sort key: the smallest leaf found under node. """
        return min(node.leaves())

    filename = 'alpinosample.export'
    movedcorpus = NegraCorpusReader('.', filename, punct='move')
    strippedtrees = list(NegraCorpusReader(
        '.', filename, punct='remove').parsed_sents().values())
    # Third reading of the same file, only used for the diagnostic printout.
    originals = list(NegraCorpusReader(
        '.', filename, headrules=None,
        encoding='iso-8859-1').parsed_sents().values())

    rows = zip(
        movedcorpus.parsed_sents().values(),
        movedcorpus.sents().values(),
        strippedtrees,
        originals)
    for n, (mangled, sent, stripped, original) in enumerate(rows):
        # Progress indicator: sentence numbers are written on a single line.
        print(n, end='')
        movednodes = sorted(
            addbitsets(mangled).subtrees(phrasal), key=lowestleaf)
        strippednodes = sorted(
            addbitsets(stripped).subtrees(phrasal), key=lowestleaf)
        for a, b in zip(movednodes, strippednodes):
            fanouta, fanoutb = fanout(a), fanout(b)
            differs = fanouta != fanoutb
            if differs:
                # Dump the context before the assertion below fires.
                print(' '.join(sent))
                print(mangled)
                print(stripped)
                print(original)
            assert not differs, '%d %d\n%s\n%s' % (fanouta, fanoutb, a, b)
    # Terminate the line of sentence numbers.
    print()
def testminbin():
    """ Verify that all optimal parsing complexities are lower than or equal
    to the complexities of right-to-left binarizations.

    For every tree that has at least one node with fan-out other than 1,
    the result of optimalbinarize() is compared against binarize(), once
    without head-driven markovization and once with headdriven=True, h=1.
    A violation is counted whenever the maximum complexityfanout() over the
    subtrees of the optimal binarization exceeds that of the regular one.

    Raises AssertionError when any violation was counted. """
    from discodop.treebank import NegraCorpusReader

    def worst(binarized):
        """ Highest complexityfanout() value among all subtrees. """
        return max(map(complexityfanout, binarized.subtrees()))

    corpus = NegraCorpusReader('.', 'alpinosample.export', punct='move')
    # NOTE(review): the slice drops the last 2000 trees; with a corpus of
    # 2000 trees or fewer it is empty and the loop never runs -- confirm
    # that this is intended for the sample file.
    trees = list(corpus.parsed_sents().values())[:-2000]
    total = violations = violationshd = 0
    for n, (tree, sent) in enumerate(zip(trees, corpus.sents().values())):
        # Trees in which every node has fan-out 1 are skipped.
        discontinuous = any(
            fanout(node) != 1 for node in addbitsets(tree).subtrees())
        if not discontinuous:
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1

        # Plain (non head-driven) comparison.
        optbin = optimalbinarize(
            tree.copy(True), headdriven=False, h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        optworst, normworst = worst(optbin), worst(normbin)
        if optworst > normworst:
            print('non-hd\n', tree)
            print(optworst, optbin)
            print(normworst, normbin, '\n')
            violations += 1

        # Head-driven comparison with horizontal markovization of 1.
        optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        optworst, normworst = worst(optbin), worst(normbin)
        if optworst > normworst:
            print('hd\n', tree)
            print(optworst, optbin)
            print(normworst, normbin, '\n')
            violationshd += 1

    print('violations normal: %d / %d; hd: %d / %d' % (
        violations, total, violationshd, total))
    assert violations == violationshd == 0
def testtransforms():
    """ Test whether the Tiger transformations (transform / reversetransform)
    are reversible.

    Applies transform() with the 'S-RC', 'VP-GF' and 'NP' transformations to
    every tree of 'alpinosample.export', then reverses the transformation on
    at most the first 100 trees and compares the bracketings of the
    canonicalized result with those of the canonicalized untransformed tree.
    Mismatches are printed; nothing is asserted. A summary of matching and
    exactly equal trees is printed at the end. """
    from discodop.treetransforms import canonicalize
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two independent readers: trees from `nn` are handed to transform(),
    # trees from `n` serve as the untouched reference.
    n = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [transform(tree, sent, transformations)
             for tree, sent in zip(nn.parsed_sents().values(),
                                   nn.sents().values())]
    print('\ntransformed')
    correct = exact = d = 0
    # Debugging aid: set to a tree index to force the verbose report for
    # that tree even when it matches; -1 never equals an index.
    z = -1  # 825
    for a, b, c in islice(zip(n.parsed_sents().values(),
                              trees, n.sents().values()), 100):
        transformb = reversetransform(b.copy(True), transformations)
        b1 = bracketings(canonicalize(a))
        b2 = bracketings(canonicalize(transformb))
        if b1 != b2 or d == z:
            shared = len(set(b1) & set(b2))
            # NOTE(review): `precision` is normalised by the brackets of the
            # untransformed tree and `recall` by those of the reversed tree;
            # the names look swapped, but this only affects the printout.
            precision = shared / len(set(b1))
            recall = shared / len(set(b2))
            if precision != 1.0 or recall != 1.0 or d == z:
                # encode() yields bytes; decode back to text so the word can
                # be joined with its index (str + bytes fails on Python 3).
                print(d, ' '.join(
                    ':'.join((str(idx),
                              word.encode('unicode-escape').decode()))
                    for idx, word in enumerate(c)))
                print('no match', precision, recall)
                print(len(b1), len(b2),
                      'gold-transformed', set(b2) - set(b1),
                      'transformed-gold', set(b1) - set(b2))
                print(a)
                print(transformb)
                handlefunctions('add', a)
                print(a)
                print(b)
                print()
            else:
                # Same set of distinct brackets although b1 != b2.
                correct += 1
        else:
            exact += 1
            correct += 1
        d += 1
    print('matches', correct, '/', d, 100 * correct / d, '%')
    print('exact', exact)
def testsplit():
    """ Verify that splitting and merging discontinuities gives the same
    trees for a treebank.

    Every tree of 'alpinosample.export' is passed through splitdiscnodes()
    and mergediscnodes() and compared with the tree it started from. Only
    the counts and percentages are printed; nothing is asserted. """
    from discodop.treebank import NegraCorpusReader
    corpus = NegraCorpusReader('.', 'alpinosample.export')
    roundtrips = [
        bool(mergediscnodes(splitdiscnodes(tree)) == tree)
        for tree in corpus.parsed_sents().values()]
    correct = roundtrips.count(True)
    wrong = roundtrips.count(False)
    # Percentages are relative to the number of sentences in the corpus.
    total = len(corpus.sents())
    print("correct", correct, "=", 100 * correct / total, "%")
    print("wrong", wrong, "=", 100 * wrong / total, "%")
def test():
    """ Run some tests.

    Builds three grammars from the first 10 binarized trees of
    'alpinosample.export' (a treebank grammar, a DOP reduction of the first
    two trees, and a double-DOP grammar), then parses every sentence of the
    corpus with the double-DOP grammar and prints, per sentence, the parse
    trees with their accumulated exp(-p) scores and whether each equals the
    gold tree. Passing '--debug' on the command line additionally prints the
    derivations of every parse tree.

    Raises AssertionError when testgrammar() of the double-DOP grammar is
    falsy, or when a parse tree lists the same derivation twice. """
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import binarize, unbinarize, \
        addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from discodop.fragments import getfragments
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    filename = "alpinosample.export"
    corpus = NegraCorpusReader('.', filename, punct='move')
    sents = list(corpus.sents().values())
    # Only the first 10 trees are used for training; `sents` holds all
    # sentences of the corpus.
    trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
             for a in list(corpus.parsed_sents().values())[:10]]
    print('plcfrs')
    lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
    print(lcfrs)
    print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    print(grammar)
    # The return value is ignored here, unlike for the double-DOP grammar
    # below.
    grammar.testgrammar()
    fragments = getfragments(trees, sents, 1)
    debug = '--debug' in sys.argv
    grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
    print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    # The grammar is mapped onto itself.
    # NOTE(review): presumably required before parsing -- confirm.
    grammar.getmapping(grammar, striplabelre=None,
                       neverblockre=re.compile(b'^#[0-9]+|.+}<'),
                       splitprune=False, markorigin=False)
    print(grammar)
    assert grammar.testgrammar(), "DOP1 should sum to 1."
    # Parse every sentence of the corpus, not only the 10 training ones.
    for tree, sent in zip(corpus.parsed_sents().values(), sents):
        print("sentence:", ' '.join(a.encode('unicode-escape').decode()
                                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        print('\n', msg, end='')
        print("\ngold ", tree)
        print("double dop", end='')
        if chart:
            mpp = {}  # parse tree (as string) -> summed exp(-p)
            parsetrees = {}  # parse tree (as string) -> [(derivation, p)]
            # NOTE(review): 1000 is presumably the number of derivations
            # requested -- confirm against lazykbest.
            derivations, _ = lazykbest(chart, 1000, b'}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
                # Expand the fragments of the derivation and undo the
                # binarization, so that derivations yielding the same tree
                # end up under the same key.
                r = Tree(recoverfragments(d.getkey(), chart,
                                          grammar, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            print(len(mpp), 'parsetrees', end='')
            print(sum(map(len, parsetrees.values())), 'derivations')
            # Listed in ascending order of accumulated score.
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                print(tp, '\n', t, end='')
                print("match:", t == str(tree))
                # No (derivation, p) pair may occur twice for a tree.
                assert len(set(parsetrees[t])) == len(parsetrees[t])
                if not debug:
                    continue
                for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                    print(' <= %6g %s' % (exp(-p), deriv))
        else:
            print("no parse")
            print(chart)
        print()
    # Finally, build a grammar from a single tree consisting of a unary
    # chain; the result is discarded, so this only checks that construction
    # completes.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))