Пример #1
0
def testpunct():
	""" Check that moving punctuation does not change the fan-out of
	phrasal constituents compared to removing punctuation altogether.

	Reads 'alpinosample.export' from the current directory three times:
	with punct='move', with punct='remove', and once without a punct
	argument (iso-8859-1 encoding, no head rules). A running sentence
	counter is printed; when two aligned constituents differ in fan-out,
	the sentence and the three versions of the tree are printed and an
	AssertionError is raised. """
	from discodop.treetransforms import addbitsets, fanout
	from discodop.treebank import NegraCorpusReader

	def isphrasal(node):
		""" Truthy for a non-empty node whose first child is a Tree. """
		return len(node) and isinstance(node[0], Tree)

	def firstleaf(node):
		""" Sort key: the smallest leaf dominated by a node. """
		return min(node.leaves())

	filename = 'alpinosample.export'
	movedcorpus = NegraCorpusReader('.', filename, punct='move')
	strippedtrees = list(NegraCorpusReader('.', filename,
			punct='remove').parsed_sents().values())
	originaltrees = list(NegraCorpusReader('.', filename, headrules=None,
			encoding='iso-8859-1').parsed_sents().values())
	aligned = zip(movedcorpus.parsed_sents().values(),
			movedcorpus.sents().values(), strippedtrees, originaltrees)
	for sentno, (moved, sent, stripped, original) in enumerate(aligned):
		print(sentno, end='')
		movednodes = sorted(
				addbitsets(moved).subtrees(isphrasal), key=firstleaf)
		strippednodes = sorted(
				addbitsets(stripped).subtrees(isphrasal), key=firstleaf)
		# zip() stops at the shorter list, so only constituents that have a
		# counterpart at the same sorted position are compared.
		for a, b in zip(movednodes, strippednodes):
			fanouta, fanoutb = fanout(a), fanout(b)
			if fanouta != fanoutb:
				# dump the context before the assertion below fires
				print(' '.join(sent))
				print(moved)
				print(stripped)
				print(original)
			assert fanouta == fanoutb, '%d %d\n%s\n%s' % (
				fanouta, fanoutb, a, b)
	print()
Пример #2
0
def testminbin():
	""" Check that optimal binarization never has a higher parsing
	complexity than the regular binarization it is compared with.

	For each selected tree of 'alpinosample.export' (punct='move') that has
	at least one subtree with a fan-out other than 1, the maximum of
	complexityfanout() over all subtrees is computed for the result of
	optimalbinarize() and for the result of binarize(); once without
	head-driven markovization and once with headdriven=True, h=1.
	Trees for which the optimal binarization scores higher are printed and
	counted; a summary is printed and the test asserts that both counts
	are zero. """
	from discodop.treebank import NegraCorpusReader

	def worstcase(binarized):
		""" Largest complexityfanout() value over all subtrees. """
		return max(map(complexityfanout, binarized.subtrees()))

	def exceeds(label, tree, optbin, normbin):
		""" Report and return True when optbin scores higher than normbin. """
		optmax, normmax = worstcase(optbin), worstcase(normbin)
		if optmax > normmax:
			print(label, tree)
			print(optmax, optbin)
			print(normmax, normbin, '\n')
			return True
		return False

	corpus = NegraCorpusReader('.', 'alpinosample.export', punct='move')
	total = violations = violationshd = 0
	# NOTE(review): the slice drops the last 2000 trees; for a corpus of
	# 2000 sentences or fewer this leaves nothing to test -- confirm that
	# this is intended for the sample corpus.
	selected = list(corpus.parsed_sents().values())[:-2000]
	for n, (tree, sent) in enumerate(zip(selected, corpus.sents().values())):
		# trees in which every subtree has fan-out 1 are skipped
		if all(fanout(node) == 1 for node in addbitsets(tree).subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		if exceeds('non-hd\n', tree, optbin, normbin):
			violations += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		if exceeds('hd\n', tree, optbin, normbin):
			violationshd += 1
	print('violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Пример #3
0
def testtransforms():
	""" Test whether the Tiger transformations (transform / reversetransform)
	are reversible.

	All trees of 'alpinosample.export' are transformed; for the first 100
	sentences the transformation is reversed on a copy and the bracketings
	of the canonicalized result are compared with those of the
	canonicalized gold tree. Sentences whose bracketings differ are printed
	together with precision / recall and the differing bracketings. A
	summary of matching and exactly matching sentences is printed at the
	end. Nothing is asserted; the function only reports. """
	from discodop.treetransforms import canonicalize
	from discodop.treebank import NegraCorpusReader, handlefunctions

	def ratio(part, whole):
		""" Return len(part) / len(whole). part is always a subset of
		whole here, so an empty denominator counts as a perfect score
		instead of raising ZeroDivisionError. """
		return len(part) / len(whole) if whole else 1.0

	headrules = None  # 'alpino.headrules'
	# The corpus is read twice so that the gold trees used for comparison
	# are separate objects from the trees handed to transform().
	goldcorpus = NegraCorpusReader('.', 'alpinosample.export',
			headrules=headrules)
	transformedcorpus = NegraCorpusReader('.', 'alpinosample.export',
			headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(transformedcorpus.parsed_sents().values(),
				transformedcorpus.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	# Debugging aid: set to a sentence index (e.g., 825) to print that
	# sentence even when its bracketings match.
	z = -1
	for gold, transformed, sent in islice(zip(
			goldcorpus.parsed_sents().values(), trees,
			goldcorpus.sents().values()), 100):
		restored = reversetransform(transformed.copy(True), transformations)
		b1 = bracketings(canonicalize(gold))
		b2 = bracketings(canonicalize(restored))
		if b1 != b2 or d == z:
			goldset, restoredset = set(b1), set(b2)
			common = goldset & restoredset
			# precision is relative to the candidate (restored) bracketings,
			# recall relative to the gold bracketings.
			precision = ratio(common, restoredset)
			recall = ratio(common, goldset)
			if precision != 1.0 or recall != 1.0 or d == z:
				# encode() yields bytes; decode back to str so that the
				# token can be joined with its index.
				print(d, ' '.join(':'.join((str(idx),
					word.encode('unicode-escape').decode()))
					for idx, word in enumerate(sent)))
				print('no match', precision, recall)
				print(len(b1), len(b2),
						'gold-transformed', goldset - restoredset,
						'transformed-gold', restoredset - goldset)
				print(gold)
				print(restored)
				handlefunctions('add', gold)
				print(gold)
				print(transformed)
				print()
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	# guard against an empty corpus
	percentage = 100 * correct / d if d else 0.0
	print('matches', correct, '/', d, percentage, '%')
	print('exact', exact)
Пример #4
0
def testsplit():
	""" Check that splitting discontinuous nodes and merging them again
	gives back the original tree, for every tree in the sample treebank.

	Prints the number and percentage of trees that survive the round trip
	unchanged ("correct") and of those that do not ("wrong"). Percentages
	are relative to the number of sentences in the corpus. """
	from discodop.treebank import NegraCorpusReader
	corpus = NegraCorpusReader('.', 'alpinosample.export')
	# one flag per tree: did split + merge reproduce the original?
	roundtrips = [bool(mergediscnodes(splitdiscnodes(tree)) == tree)
			for tree in corpus.parsed_sents().values()]
	correct = sum(1 for unchanged in roundtrips if unchanged)
	wrong = len(roundtrips) - correct
	total = len(corpus.sents())
	print("correct", correct, "=", 100 * correct / total, "%")
	print("wrong", wrong, "=", 100 * wrong / total, "%")
Пример #5
0
def test():
	""" Run some tests.

	Builds three grammars from the first ten trees of 'alpinosample.export'
	(a treebank grammar, a DOP reduction of the first two trees, and a
	double-DOP grammar), then parses every sentence of the corpus with the
	double-DOP grammar and prints the resulting parse trees along with
	whether they match the gold tree. Passing '--debug' on the command line
	additionally prints the individual derivations. """
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import binarize, unbinarize, \
			addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from discodop.fragments import getfragments
	# log everything from DEBUG upwards, showing only the bare message
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	filename = "alpinosample.export"
	corpus = NegraCorpusReader('.', filename, punct='move')
	sents = list(corpus.sents().values())
	# training trees: copies of the first ten trees, binarized with
	# horzmarkov=1 and annotated with fan-out markers
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.parsed_sents().values())[:10]]

	print('plcfrs')
	# plain treebank grammar; the start symbol is the label of the first tree
	lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
	print(lcfrs)

	print('dop reduction')
	# only the first element of dopreduction()'s result is used, on the
	# first two trees and sentences
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	print(grammar)
	grammar.testgrammar()

	# NOTE(review): the meaning of the third argument (1) is not visible
	# here -- confirm against getfragments().
	fragments = getfragments(trees, sents, 1)
	debug = '--debug' in sys.argv
	# the third return value of doubledop() is not needed here
	grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
	print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	# NOTE(review): the grammar is passed to its own getmapping(); the effect
	# of this call is not visible from this function.
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	print(grammar)
	assert grammar.testgrammar(), "DOP1 should sum to 1."
	# Parse every sentence of the corpus (not only the ten the grammar was
	# built from) and compare against the gold tree.
	for tree, sent in zip(corpus.parsed_sents().values(), sents):
		# escape non-ASCII characters so the sentence prints on any terminal
		print("sentence:", ' '.join(a.encode('unicode-escape').decode()
				for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		print('\n', msg, end='')
		print("\ngold ", tree)
		print("double dop", end='')
		if chart:
			# mpp: parse tree (as string) -> sum of exp(-p) over its derivations
			mpp = {}
			# parsetrees: parse tree (as string) -> list of (derivation, p)
			parsetrees = {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
				# turn the derivation into a tree using the backtransform
				# table, then undo binarization and fan-out markers
				r = Tree(recoverfragments(d.getkey(), chart,
					grammar, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			print(len(mpp), 'parsetrees', end='')
			print(sum(map(len, parsetrees.values())), 'derivations')
			# list parse trees in ascending order of their accumulated score
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				print(tp, '\n', t, end='')
				print("match:", t == str(tree))
				# every derivation of a parse tree should be listed only once
				assert len(set(parsetrees[t])) == len(parsetrees[t])
				if not debug:
					continue
				for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
					print(' <= %6g %s' % (exp(-p), deriv))
		else:
			print("no parse")
			print(chart)
		print()
	# Finally, build a grammar from a single chain of unary nodes over a
	# ten-token sentence; the result is discarded, so this only checks that
	# construction does not raise.
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))