Example #1
def loadtraincorpus(corpusfmt, traincorpus, binarization, punct, functions,
		morphology, removeempty, ensureroot, transformations,
		relationalrealizational):
	"""Load the training corpus."""
	train = treebank.READERS[corpusfmt](traincorpus.path,
			encoding=traincorpus.encoding, headrules=binarization.headrules,
			headfinal=True, headreverse=False, removeempty=removeempty,
			ensureroot=ensureroot, punct=punct,
			functions=functions, morphology=morphology)
	if isinstance(traincorpus.numsents, float):
		traincorpus.numsents = int(traincorpus.numsents * len(train.sents()))
	traintrees = train.itertrees(None, traincorpus.numsents)
	# filter first and test for emptiness before zip(*...), which would
	# otherwise raise a confusing unpacking error on an empty selection
	treesents = [treesent for _, treesent in traintrees
			if 1 <= len(treesent[1]) <= traincorpus.maxwords]
	if not treesents:
		raise ValueError('training corpus (selection) should be non-empty.')
	trees, sents = zip(*treesents)
	logging.info('%d training sentences after length restriction <= %d',
			len(trees), traincorpus.maxwords)
	if transformations:
		trees = [treebanktransforms.transform(tree, sent, transformations)
				for tree, sent in zip(trees, sents)]
	if relationalrealizational:
		trees = [treebanktransforms.rrtransform(
				tree, **relationalrealizational)[0] for tree in trees]
	train_tagged_sents = [[(word, tag) for word, (_, tag)
			in zip(sent, sorted(tree.pos()))]
				for tree, sent in zip(trees, sents)]
	return trees, sents, train_tagged_sents
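
A minimal sketch of how loadtraincorpus might be invoked. The attribute
names on traincorpus (path, encoding, numsents, maxwords) and on
binarization (headrules) are inferred from the usage above; SimpleNamespace
stands in for whatever configuration object the real caller passes, and the
module-level imports (treebank, treebanktransforms, logging) are assumed to
be in place.

from types import SimpleNamespace

traincorpus = SimpleNamespace(
		path='alpinosample.export', encoding='utf-8',
		numsents=1.0,  # a float is interpreted as a fraction of the corpus
		maxwords=40)
binarization = SimpleNamespace(headrules=None)
trees, sents, train_tagged_sents = loadtraincorpus(
		'export', traincorpus, binarization, punct=None, functions=None,
		morphology=None, removeempty=False, ensureroot=None,
		transformations=None, relationalrealizational=None)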
Example #2
def test_transforms():
    """Test reversibility of Tiger transformations."""
    from discodop.treebanktransforms import transform, reversetransform, \
      bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(),
                              nn.sents().values())
    ]
    print('\ntransformed')
    correct = exact = e = 0
    for a, b, c, d in islice(
            zip(n.trees().values(),
                n.sents().values(), trees, count()), 100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        z = -1  # 825
        if c1 != c2 or e == z:
            precision = len(set(c1) & set(c2)) / len(set(c1))
            recall = len(set(c1) & set(c2)) / len(set(c2))
            if precision != 1.0 or recall != 1.0 or d == z:
                # decode: str.join() rejects bytes on Python 3
                print(
                    d, ' '.join(':'.join((str(n),
                        a.encode('unicode-escape').decode('ascii')))
                                for n, a in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2), 'gold-transformed',
                      set(c2) - set(c1), 'transformed-gold',
                      set(c1) - set(c2))
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
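
A toy illustration of the numbers printed in the debug branch above: with
gold bracketings c1 and round-tripped bracketings c2, the code reports
|c1 & c2| / |c1| and |c1 & c2| / |c2|. The spans below are made up.

c1 = {('S', (0, 4)), ('NP', (0, 2)), ('VP', (2, 4))}  # gold bracketings
c2 = {('S', (0, 4)), ('NP', (0, 2)), ('VP', (3, 4))}  # after round-trip
print(len(c1 & c2) / len(c1), len(c1 & c2) / len(c2))  # 0.666... 0.666...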
Example #3
def test_transforms():
	"""Test reversibility of Tiger transformations."""
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(),
				nn.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	for a, b, c in islice(zip(n.trees().values(),
			trees, n.sents().values()), 100):
		transformb = reversetransform(b.copy(True), transformations)
		b1 = bracketings(canonicalize(a))
		b2 = bracketings(canonicalize(transformb))
		z = -1  # 825
		if b1 != b2 or d == z:
			precision = len(set(b1) & set(b2)) / len(set(b1))
			recall = len(set(b1) & set(b2)) / len(set(b2))
			if precision != 1.0 or recall != 1.0 or d == z:
				# decode: str.join() rejects bytes on Python 3
				print(d, ' '.join(':'.join((str(n),
					a.encode('unicode-escape').decode('ascii')))
					for n, a in enumerate(c)))
				print('no match', precision, recall)
				print(len(b1), len(b2), 'gold-transformed', set(b2) - set(b1),
						'transformed-gold', set(b1) - set(b2))
				print(a)
				print(transformb)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	print('matches', correct, '/', d, 100 * correct / d, '%')
	print('exact', exact)
Example #4
    def test_transform(self):
        from discodop.treebanktransforms import transform, reversetransform, \
            bracketings
        from discodop.treebank import NegraCorpusReader
        n = NegraCorpusReader('alpinosample.export')
        for transformations in (
                ('FUNC-NODE', ),
                ('MORPH-NODE', ),
                ('LEMMA-NODE', ),
                ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')):
            nn = NegraCorpusReader('alpinosample.export')
            trees = [transform(tree, sent, transformations)
                     for tree, sent in zip(nn.trees().values(),
                                           nn.sents().values())]
            for a, b in islice(zip(n.trees().values(), trees), 100):
                before = bracketings(canonicalize(a))
                transformb = reversetransform(b.copy(True), transformations)
                after = bracketings(canonicalize(transformb))
                assert before == after, (
                    'mismatch with %r\nbefore: %r\nafter: %r' %
                    (transformations, before, after))
Example #5
	def test_transform(self):
		from discodop.treebanktransforms import transform, reversetransform, \
				bracketings
		from discodop.treebank import NegraCorpusReader
		n = NegraCorpusReader('alpinosample.export')
		for transformations in (
				('FUNC-NODE', ),
				('MORPH-NODE', ),
				('LEMMA-NODE', ),
				('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')):
			nn = NegraCorpusReader('alpinosample.export')
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(nn.trees().values(),
						nn.sents().values())]
			for a, b in islice(zip(n.trees().values(), trees), 100):
				before = bracketings(canonicalize(a))
				transformb = reversetransform(b.copy(True), transformations)
				after = bracketings(canonicalize(transformb))
				assert before == after, (
						'mismatch with %r\nbefore: %r\nafter: %r' % (
						transformations, before, after))
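
The same round-trip property, checked for a single tree and a single
transformation; a hedged sketch that assumes discodop and the bundled
alpinosample.export are available, and that canonicalize comes from
discodop.treetransforms as in the test modules these excerpts belong to.

from discodop.treebank import NegraCorpusReader
from discodop.treebanktransforms import transform, reversetransform, \
		bracketings
from discodop.treetransforms import canonicalize

reader = NegraCorpusReader('alpinosample.export')
tree = next(iter(reader.trees().values()))
sent = next(iter(reader.sents().values()))
transformed = transform(tree.copy(True), sent, ('FUNC-NODE', ))
restored = reversetransform(transformed, ('FUNC-NODE', ))
assert bracketings(canonicalize(tree)) == bracketings(canonicalize(restored))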
Example #6
def main():
	"""Command line interface for applying tree(bank) transforms."""
	import io
	from getopt import gnu_getopt, GetoptError
	from discodop import treebanktransforms
	actions = {'none': None, 'introducepreterminals': introducepreterminals,
			'splitdisc': None, 'mergedisc': mergediscnodes, 'transform': None,
			'unbinarize': unbinarize, 'binarize': None, 'optimalbinarize': None}
	flags = ('markorigin markheads leftunary rightunary tailmarker '
			'renumber reverse'.split())
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= '
			'markorigin= maxlen= fmt= enc= transforms=').split()
	try:
		opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
		if not 1 <= len(args) <= 3:
			raise GetoptError('error: expected 1, 2, or 3 positional arguments')
	except GetoptError as err:
		print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
		sys.exit(2)
	opts, action = dict(opts), args[0]
	if action not in actions:
		print('unrecognized action: %r\navailable actions: %s' % (
				action, ', '.join(actions)), file=sys.stderr)
		sys.exit(2)
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
		print('unrecognized output format: %r\navailable formats: %s' % (
				opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
		sys.exit(2)
	infilename = args[1] if len(args) >= 2 and args[1] != '-' else '/dev/stdin'
	outfilename = args[2] if len(args) == 3 and args[2] != '-' else '/dev/stdout'

	# open corpus
	corpus = READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf-8'),
			headrules=opts.get('--headrules'), markheads='--markheads' in opts,
			ensureroot=opts.get('--ensureroot'), punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, (tree, sent)) for key, (tree, sent) in trees
				if len(sent) <= maxlen)
	if '--renumber' in opts:
		trees = (('%8d' % n, treesent)
				for n, (_, treesent) in enumerate(trees, 1))

	# select transformation
	transform = actions[action]
	if action in ('binarize', 'optimalbinarize'):
		h = int(opts.get('-h', 999))
		v = int(opts.get('-v', 1))
		if action == 'binarize':
			factor = opts.get('--factor', 'right')
			transform = lambda t, _: binarize(t, factor, h, v,
					leftmostunary='--leftunary' in opts,
					rightmostunary='--rightunary' in opts,
					tailmarker='$' if '--tailmarker' in opts else '')
		elif action == 'optimalbinarize':
			headdriven = '--headrules' in opts
			transform = lambda t, _: optimalbinarize(t, '|', headdriven, h, v)
	elif action == 'splitdisc':
		transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
	elif action == 'unbinarize':
		transform = lambda t, _: unbinarize(Tree.convert(t))
	elif action == 'transform':
		tfs = opts['--transforms'].split(',')
		transform = lambda t, s: (treebanktransforms.reversetransform(t, tfs)
				if '--reverse' in opts
				else treebanktransforms.transform(t, s, tfs))
	if transform is not None:  # NB: transform cannot affect (no. of) terminals
		trees = ((key, (transform(tree, sent), sent)) for key, (tree, sent) in trees)

	# read, transform, & write trees
	headrules = None
	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
		headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
	cnt = 0
	if opts.get('--outputfmt') == 'dact':
		import alpinocorpus
		outfile = alpinocorpus.CorpusWriter(outfilename)
		if (action == 'none' and opts.get('--inputfmt') in ('alpino', 'dact')
				and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
				'--renumber'}):
			for n, (key, block) in islice(enumerate(
					corpus.blocks().items(), 1), start, end):
				outfile.write('%8d' % n if '--renumber' in opts else key, block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
				cnt += 1
	else:
		encoding = opts.get('--outputenc', 'utf-8')
		outfile = io.open(outfilename, 'w', encoding=encoding)
		# copy trees verbatim when only taking slice or converting encoding
		if (action == 'none' and opts.get('--inputfmt') == opts.get(
				'--outputfmt') and set(opts) <= {'--slice', '--inputenc',
				'--outputenc', '--inputfmt', '--outputfmt'}):
			for block in islice(corpus.blocks().values(), start, end):
				outfile.write(block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(writetree(tree, sent, key,
						opts.get('--outputfmt', 'export'), headrules))
				cnt += 1
	print('%sed %d trees with action %r' % ('convert' if action == 'none'
			else 'transform', cnt, action), file=sys.stderr)
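
A sketch of driving this entry point from Python by setting sys.argv,
assuming the module exposes main() as its command-line interface; the
option names mirror those parsed above, while 'input.export' and
'output.export' are hypothetical file names.

import sys

sys.argv = ['treetransforms', 'binarize', 'input.export', 'output.export',
		'--fmt=export', '--factor=right', '-h', '1', '-v', '2']
main()  # binarizes input.export and writes the trees to output.export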
Example #7
def parsetepacoc(
		stages=(dict(mode='pcfg', split=True, markorigin=True),
				dict(mode='plcfrs', prune=True, k=10000, splitprune=True),
				dict(mode='plcfrs', prune=True, k=5000, dop=True,
					usedoubledop=True, estimator='dop1', objective='mpp',
					sample=False, kbest=True)),
		trainmaxwords=999, trainnumsents=25005, testmaxwords=999,
		bintype='binarize', h=1, v=1, factor='right', tailmarker='',
		markhead=False, revmarkov=False, pospa=False,
		leftmostunary=True, rightmostunary=True,
		fanout_marks_before_bin=False, transformations=None,
		usetagger='stanford', resultdir='tepacoc', numproc=1):
	""" Parse the tepacoc test set. """
	for stage in stages:
		for key in stage:
			assert key in DEFAULTSTAGE, 'unrecognized option: %r' % key
	stages = [DictObj({k: stage.get(k, v) for k, v in DEFAULTSTAGE.items()})
			for stage in stages]
	os.mkdir(resultdir)
	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	logging.basicConfig(level=logging.DEBUG, format=formatstr)
	# log up to INFO to a results log file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.INFO)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)
	tepacocids, tepacocsents = readtepacoc()
	try:
		(corpus_sents, corpus_taggedsents,
				corpus_trees, corpus_blocks) = pickle.load(
					gzip.open('tiger.pickle.gz', 'rb'))
	except IOError:  # file not found
		corpus = getreader('export')('../tiger/corpus',
				'tiger_release_aug07.export',
				headrules='negra.headrules' if bintype == 'binarize' else None,
				headfinal=True, headreverse=False, punct='move',
				encoding='iso-8859-1')
		corpus_sents = list(corpus.sents().values())
		corpus_taggedsents = list(corpus.tagged_sents().values())
		corpus_trees = list(corpus.parsed_sents().values())
		if transformations:
			corpus_trees = [transform(tree, sent, transformations)
					for tree, sent in zip(corpus_trees, corpus_sents)]
		corpus_blocks = list(corpus.blocks().values())
		pickle.dump((corpus_sents, corpus_taggedsents, corpus_trees,
			corpus_blocks), gzip.open('tiger.pickle.gz', 'wb'), protocol=-1)

	# test sets (one for each category)
	testsets = {}
	allsents = []
	for cat, catsents in tepacocsents.items():
		testset = sents, trees, goldsents, blocks = [], [], [], []
		for n, sent in catsents:
			if sent != corpus_sents[n]:
				logging.error(
						'mismatch. sent %d:\n%r\n%r\n'
						'not in corpus %r\nnot in tepacoc %r',
						n + 1, sent, corpus_sents[n],
						[a for a, b in zip_longest(sent, corpus_sents[n])
							if a and a != b],
						[b for a, b in zip_longest(sent, corpus_sents[n])
							if b and a != b])
			elif len(corpus_sents[n]) <= testmaxwords:
				sents.append(corpus_taggedsents[n])
				trees.append(corpus_trees[n])
				goldsents.append(corpus_taggedsents[n])
				blocks.append(corpus_blocks[n])
		allsents.extend(sents)
		logging.info('category: %s, %d of %d sentences',
				cat, len(testset[0]), len(catsents))
		testsets[cat] = testset
	# list(): zip() returns a non-subscriptable iterator on Python 3
	testsets['baseline'] = list(zip(*[sent for n, sent in
				enumerate(zip(corpus_taggedsents, corpus_trees,
						corpus_taggedsents, corpus_blocks))
				if len(sent[1]) <= trainmaxwords
				and n not in tepacocids][trainnumsents:trainnumsents + 2000]))
	allsents.extend(testsets['baseline'][0])

	if usetagger:
		overridetags = ('PTKANT', 'VAIMP')
		taglex = defaultdict(set)
		for sent in corpus_taggedsents[:trainnumsents]:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
			{word for word, tags in taglex.items()
			if tags == {tag}} for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV', 'PIDAT': 'PIAT'}
		# the sentences in the list allsents are modified in-place so that
		# the relevant copy in testsets[cat][0] is updated as well.
		externaltagging(usetagger, '', allsents, overridetagdict, tagmap)

	# training set
	trees, sents, blocks = zip(*[sent for n, sent in
				enumerate(zip(corpus_trees, corpus_sents,
							corpus_blocks)) if len(sent[1]) <= trainmaxwords
							and n not in tepacocids][:trainnumsents])
	getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
			revmarkov, leftmostunary, rightmostunary, pospa, markhead,
			fanout_marks_before_bin, testmaxwords, resultdir,
			numproc, None, False, trees[0].label, None)
	del corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks
	results = {}
	cnt = 0
	parser = Parser(stages, tailmarker=tailmarker,
			transformations=transformations)
	for cat, testset in sorted(testsets.items()):
		if cat == 'baseline':
			continue
		logging.info('category: %s', cat)
		begin = time.process_time()  # time.clock() was removed in Python 3.8
		results[cat] = doparsing(parser=parser, testset=testset,
				resultdir=resultdir, usetags=True, numproc=numproc,
				category=cat)
		cnt += len(testset[0])
		if numproc == 1:
			logging.info('time elapsed during parsing: %g',
					time.process_time() - begin)
		#else:  # wall clock time here
	goldbrackets = multiset()
	totresults = [DictObj(name=stage.name) for stage in stages]
	for result in totresults:
		result.elapsedtime = [None] * cnt
		result.parsetrees = [None] * cnt
		result.brackets = multiset()
		result.exact = result.noparse = 0
	goldblocks = []
	goldsents = []
	for cat, res in results.items():
		logging.info('category: %s', cat)
		goldbrackets |= res[2]
		goldblocks.extend(res[3])
		goldsents.extend(res[4])
		for result, totresult in zip(res[0], totresults):
			totresult.exact += result.exact
			totresult.noparse += result.noparse
			totresult.brackets |= result.brackets
			totresult.elapsedtime.extend(result.elapsedtime)
		oldeval(*res)
	logging.info('TOTAL')
	oldeval(totresults, goldbrackets)
	# write TOTAL results file with all tepacoc sentences (not the baseline)
	for stage in stages:
		open('TOTAL.%s.export' % stage.name, 'w').writelines(
				open('%s.%s.export' % (cat, stage.name)).read()
				for cat in list(results) + ['gold'])
	# do baseline separately because it shouldn't count towards the total score
	cat = 'baseline'
	logging.info('category: %s', cat)
	oldeval(*doparsing(parser=parser, testset=testsets[cat],
			resultdir=resultdir, usetags=True, numproc=numproc, category=cat))
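
The stage-merging idiom at the top of this function, in isolation:
per-stage options override the defaults key by key. The DEFAULTSTAGE values
here are illustrative, not the library's actual defaults, and plain dicts
stand in for DictObj.

DEFAULTSTAGE = dict(mode='plcfrs', prune=False, k=50, dop=False)
stage = dict(mode='pcfg', prune=True)  # user-specified overrides
merged = {k: stage.get(k, v) for k, v in DEFAULTSTAGE.items()}
print(merged)  # {'mode': 'pcfg', 'prune': True, 'k': 50, 'dop': False}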
Example #8
def startexp(
		stages=(DEFAULTSTAGE, ),  # see above
		corpusfmt='export',  # choices: export, discbracket, bracket
		corpusdir='.',
		# filenames may include globbing characters '*' and '?'.
		traincorpus='alpinosample.export', trainencoding='utf-8',
		testcorpus='alpinosample.export', testencoding='utf-8',
		testmaxwords=40,
		trainmaxwords=40,
		trainnumsents=2,
		testnumsents=1,  # number of sentences to parse
		skiptrain=True,  # test set starts after training set
		# (useful when they are in the same file)
		skip=0,  # number of sentences to skip from test corpus
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		# postagging: pass None to use tags from treebank.
		postagging=None,
		relationalrealizational=None,  # do not apply RR-transform
		headrules=None,  # rules for finding heads of constituents
		bintype='binarize',  # choices: binarize, optimal, optimalhead
		factor='right',
		revmarkov=True,
		v=1,
		h=2,
		pospa=False,  # when v > 1, add parent annotation to POS tags?
		markhead=False,  # prepend head to siblings
		leftmostunary=True,  # start binarization with unary node
		rightmostunary=True,  # end binarization with unary node
		tailmarker='',  # with headrules, head is last node and can be marked
		fanout_marks_before_bin=False,
		evalparam='proper.prm',  # EVALB-style parameter file
		quiet=False, reallyquiet=False,  # quiet=no per sentence results
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	""" Execute an experiment. """
	assert bintype in ('optimal', 'optimalhead', 'binarize')
	if postagging is not None:
		assert set(postagging).issubset({'method', 'model',
				'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
		if postagging['method'] == 'unknownword':
			assert postagging['model'] in ('4', '6', 'base')
			assert postagging['unknownthreshold'] >= 1
			assert postagging['openclassthreshold'] >= 0
		else:
			assert postagging['method'] in ('treetagger', 'stanford')

	if rerun:
		assert os.path.exists(resultdir), (
				'Directory %r does not exist.\n'
				'--rerun requires a directory '
				'with the grammar(s) of a previous experiment.'
				% resultdir)
	else:
		assert not os.path.exists(resultdir), (
			'Directory %r exists.\n'
			'Use --rerun to parse with existing grammar '
			'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if reallyquiet:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif quiet:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	else:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)

	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	#fileobj.setLevel(logging.INFO)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	corpusreader = getreader(corpusfmt)
	if not rerun:
		corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
				headrules=headrules, headfinal=True, headreverse=False,
				punct=punct, functions=functions, morphology=morphology)
		logging.info('%d sentences in training corpus %s/%s',
				len(corpus.parsed_sents()), corpusdir, traincorpus)
		if isinstance(trainnumsents, float):
			trainnumsents = int(trainnumsents * len(corpus.sents()))
		trees = list(corpus.parsed_sents().values())[:trainnumsents]
		sents = list(corpus.sents().values())[:trainnumsents]
		if transformations:
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(trees, sents)]
		if relationalrealizational:
			trees = [rrtransform(tree, **relationalrealizational)[0]
					for tree in trees]
		train_tagged_sents = [[(word, tag) for word, (_, tag)
				in zip(sent, sorted(tree.pos()))]
					for tree, sent in zip(trees, sents)]
		blocks = list(corpus.blocks().values())[:trainnumsents]
		assert trees, 'training corpus should be non-empty'
		logging.info('%d training sentences before length restriction',
				len(trees))
		trees, sents, blocks = zip(*[sent for sent in zip(trees, sents, blocks)
			if len(sent[1]) <= trainmaxwords])
		logging.info('%d training sentences after length restriction <= %d',
			len(trees), trainmaxwords)

	testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
			punct=punct, morphology=morphology, functions=functions)
	gold_sents = testset.tagged_sents()
	test_parsed_sents = testset.parsed_sents()
	if skiptrain:
		skip += trainnumsents
	logging.info('%d sentences in test corpus %s/%s',
			len(testset.parsed_sents()), corpusdir, testcorpus)
	logging.info('%d test sentences before length restriction',
			len(list(gold_sents)[skip:skip + testnumsents]))
	lexmodel = None
	test_tagged_sents = gold_sents
	if postagging and postagging['method'] in ('treetagger', 'stanford'):
		if postagging['method'] == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overridden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging['method'] == 'stanford':
			overridetags = ('PTKANT', )
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
			{word for word, tags in taglex.items() if tags == {tag}}
			for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		sents_to_tag = OrderedDict((a, b) for a, b
				in islice(gold_sents.items(), skip, skip + testnumsents)
				if len(b) <= testmaxwords)
		test_tagged_sents = externaltagging(postagging['method'],
				postagging['model'], sents_to_tag, overridetagdict, tagmap)
		# give these tags to parser
		usetags = True
	elif postagging and postagging['method'] == 'unknownword' and not rerun:
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		# get smoothed probabilities for lexical productions
		lexresults, msg = getunknownwordmodel(
				train_tagged_sents, postagging['unknownwordfun'],
				postagging['unknownthreshold'],
				postagging['openclassthreshold'])
		logging.info(msg)
		simplelexsmooth = postagging['simplelexsmooth']
		if simplelexsmooth:
			lexmodel = lexresults[2:8]
		else:
			lexmodel, msg = getlexmodel(*lexresults)
			logging.info(msg)
		# NB: knownwords are all words in training set, lexicon is the subset
		# of words that are above the frequency threshold.
		# for training purposes we work with the subset, at test time we exploit
		# the full set of known words from the training set.
		sigs, knownwords, lexicon = lexresults[:3]
		postagging['sigs'], postagging['lexicon'] = sigs, knownwords
		# replace rare train words with signatures
		sents = replaceraretrainwords(train_tagged_sents,
				postagging['unknownwordfun'], lexicon)
		# make sure gold POS tags are not given to parser
		usetags = False
	elif postagging and postagging['method'] == 'unknownword' and rerun:
		usetags = False
	else:
		simplelexsmooth = False
		# give gold POS tags to parser
		usetags = True

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sentence because test sentences may be mangled by unknown word
	#   model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#   original treebank verbatim.
	testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
			gold_sents[a], block)) for a, block
			in islice(testset.blocks().items(), skip, skip + testnumsents)
			if len(test_tagged_sents[a]) <= testmaxwords)
	assert test_tagged_sents, 'test corpus should be non-empty'
	logging.info('%d test sentences after length restriction <= %d',
			len(testset), testmaxwords)

	if rerun:
		trees = []
		sents = []
	toplabels = {tree.label for tree in trees} | {
			test_parsed_sents[n].label for n in testset}
	assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
	top = toplabels.pop()

	if rerun:
		readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
				revmarkov, leftmostunary, rightmostunary, pospa, markhead,
				fanout_marks_before_bin, testmaxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top, relationalrealizational)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	begin = time.process_time()  # time.clock() was removed in Python 3.8
	parser = Parser(stages, transformations=transformations,
			tailmarker=tailmarker, postagging=postagging if postagging
			and postagging['method'] == 'unknownword' else None,
			relationalrealizational=relationalrealizational)
	results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt, morphology=morphology)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs',
				time.process_time() - begin)
	for result in results[0]:
		nsent = len(result.parsetrees)
		header = (' ' + result.name.upper() + ' ').center(35, '=')
		evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
				for a, b in test_parsed_sents.items()), gold_sents,
				result.parsetrees, test_tagged_sents if usetags else gold_sents,
				evalparam)
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if any(len(a) > evalparam['CUTOFF_LEN']
				for a in gold_sents.values()) else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
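
Finally, a minimal invocation sketch: most defaults above already point at
the bundled sample corpus, so an experiment might be started as below;
'sample-results' is a hypothetical output directory that must not exist yet.

top = startexp(
		traincorpus='alpinosample.export', testcorpus='alpinosample.export',
		trainnumsents=2, testnumsents=1, resultdir='sample-results')
print('ROOT label:', top)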