示例#1
0
def build_param(path='../util/proper.prm'):
    """Load an EVALB-style evaluation parameter file.

    :param path: location of the parameter file; any falsy value (None, '')
        makes ``readparam`` fall back to its built-in defaults.
    :returns: the parameter dictionary produced by ``readparam``.
    """
    # NOTE(review): this function previously carried commented-out overrides
    # that read from an undefined ``opts`` mapping (command-line options);
    # they were dead code and have been removed.
    # ``path or None`` normalizes any falsy path to None for readparam.
    return readparam(path or None)
示例#2
0
def test_eval():
	"""Simple sanity check; should give 100% score on all metrics."""
	from discodop.treebank import READERS
	from discodop.eval import Evaluator, readparam
	# Read the same sample corpus twice: once as gold, once as "candidate",
	# so every metric should come out at 100%.
	read_export = READERS['export']
	gold = read_export('alpinosample.export')
	parses = read_export('alpinosample.export')
	goldtrees = gold.trees()
	goldsents = gold.sents()
	candsents = parses.sents()
	evaluator = Evaluator(readparam(None))
	for key, candtree in parses.trees().items():
		evaluator.add(key, goldtrees[key], goldsents[key],
				candtree, candsents[key])
	evaluator.breakdowns()
	print(evaluator.summary())
示例#3
0
def test_eval():
	"""Simple sanity check; should give 100% score on all metrics."""
	from discodop.treebank import READERS
	from discodop.eval import Evaluator, readparam
	# Gold and candidate trees come from the identical file, so the
	# evaluator must report a perfect score on every metric.
	gold = READERS['export']('alpinosample.export')
	parses = READERS['export']('alpinosample.export')
	evaluator = Evaluator(readparam(None))
	goldtrees = gold.trees()
	goldsents = gold.sents()
	candsents = parses.sents()
	for n, ctree in parses.trees().items():
		evaluator.add(
				n, goldtrees[n], goldsents[n], ctree, candsents[n])
	evaluator.breakdowns()
	print(evaluator.summary())
示例#4
0
# Script: load a supertag corpus and a pickled grammar, then parse each
# sentence with its predicted supertags, falling back to a flat NOPARSE
# tree when the grammar yields no analysis.
# NOTE(review): the original used ``argv`` and ``load`` without importing
# them; assuming sys.argv and pickle.load -- confirm against the project.
from sys import argv
from pickle import load
from configparser import ConfigParser
from supertagging.data import SupertagParseDataset

config = ConfigParser()
config.read(argv[1])  # first CLI argument: path to the config file

data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

# Use a context manager so the grammar file is closed deterministically
# (the original left the handle open).
with open(f"{config['Corpus']['filename']}.grammar", "rb") as grammarfile:
    grammar = load(grammarfile)
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one (supertag, weight) candidate per token
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no parse found: build a flat (NOPARSE (POS idx) ...) tree
        leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words))))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    # strip binarization and fan-out markers from both trees before
    # comparison; NOTE(review): the loop body appears to continue beyond
    # this excerpt (gold/parse are not yet consumed here).
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
示例#5
0
 def set_eval_param(self, config: EvalParameters):
     """Install evaluation settings from *config*.

     Reads the eval parameter file, stores the number of supertags to
     consider, and propagates the fallback probability to the grammar.
     """
     self.__evalparam__ = readparam(config.evalfilename)
     self.__ktags__ = config.ktags
     self.__grammar__.fallback_prob = config.fallbackprob
示例#6
0
def startexp(
		stages=(DEFAULTSTAGE, ),  # see above
		corpusfmt='export',  # choices: export, discbracket, bracket
		corpusdir='.',
		# filenames may include globbing characters '*' and '?'.
		traincorpus='alpinosample.export', trainencoding='utf-8',
		testcorpus='alpinosample.export', testencoding='utf-8',
		testmaxwords=40,
		trainmaxwords=40,
		trainnumsents=2,
		testnumsents=1,  # number of sentences to parse
		skiptrain=True,  # test set starts after training set
		# (useful when they are in the same file)
		skip=0,  # number of sentences to skip from test corpus
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		# postagging: pass None to use tags from treebank.
		postagging=None,
		relationalrealizational=None,  # do not apply RR-transform
		headrules=None,  # rules for finding heads of constituents
		bintype='binarize',  # choices: binarize, optimal, optimalhead
		factor='right',
		revmarkov=True,
		v=1,
		h=2,
		pospa=False,  # when v > 1, add parent annotation to POS tags?
		markhead=False,  # prepend head to siblings
		leftmostunary=True,  # start binarization with unary node
		rightmostunary=True,  # end binarization with unary node
		tailmarker='',  # with headrules, head is last node and can be marked
		fanout_marks_before_bin=False,
		evalparam='proper.prm',  # EVALB-style parameter file
		quiet=False, reallyquiet=False,  # quiet=no per sentence results
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	"""Execute an experiment: read a treebank, extract grammars for each
	stage, parse the test set and log EVALB-style evaluation results.

	:returns: the unique ROOT label shared by training and test trees.
	:raises AssertionError: on invalid options or an unexpected state of
		``resultdir`` (exists without ``--rerun``, or missing with it).
	"""
	# --- validate options -------------------------------------------------
	assert bintype in ('optimal', 'optimalhead', 'binarize')
	if postagging is not None:
		assert set(postagging).issubset({'method', 'model',
				'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
		if postagging['method'] == 'unknownword':
			assert postagging['model'] in ('4', '6', 'base')
			assert postagging['unknownthreshold'] >= 1
			assert postagging['openclassthreshold'] >= 0
		else:
			assert postagging['method'] in ('treetagger', 'stanford')

	if rerun:
		# FIX: the original message was missing a newline here, so two
		# sentences ran together ("...does not exist.--rerun requires...").
		assert os.path.exists(resultdir), (
				'Directory %r does not exist.\n'
				'--rerun requires a directory '
				'with the grammar(s) of a previous experiment.'
				% resultdir)
	else:
		assert not os.path.exists(resultdir), (
			'Directory %r exists.\n'
			'Use --rerun to parse with existing grammar '
			'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if reallyquiet:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif quiet:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	else:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)

	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	# --- read training corpus --------------------------------------------
	corpusreader = getreader(corpusfmt)
	if not rerun:
		corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
				headrules=headrules, headfinal=True, headreverse=False,
				punct=punct, functions=functions, morphology=morphology)
		logging.info('%d sentences in training corpus %s/%s',
				len(corpus.parsed_sents()), corpusdir, traincorpus)
		# a float trainnumsents is interpreted as a fraction of the corpus
		if isinstance(trainnumsents, float):
			trainnumsents = int(trainnumsents * len(corpus.sents()))
		trees = list(corpus.parsed_sents().values())[:trainnumsents]
		sents = list(corpus.sents().values())[:trainnumsents]
		if transformations:
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(trees, sents)]
		if relationalrealizational:
			trees = [rrtransform(tree, **relationalrealizational)[0]
					for tree in trees]
		# pair each word with its gold POS tag (tree.pos() sorted by index)
		train_tagged_sents = [[(word, tag) for word, (_, tag)
				in zip(sent, sorted(tree.pos()))]
					for tree, sent in zip(trees, sents)]
		blocks = list(corpus.blocks().values())[:trainnumsents]
		assert trees, 'training corpus should be non-empty'
		logging.info('%d training sentences before length restriction',
				len(trees))
		trees, sents, blocks = zip(*[sent for sent in zip(trees, sents, blocks)
			if len(sent[1]) <= trainmaxwords])
		logging.info('%d training sentences after length restriction <= %d',
			len(trees), trainmaxwords)

	# --- read test corpus -------------------------------------------------
	testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
			punct=punct, morphology=morphology, functions=functions)
	gold_sents = testset.tagged_sents()
	test_parsed_sents = testset.parsed_sents()
	if skiptrain:
		skip += trainnumsents
	logging.info('%d sentences in test corpus %s/%s',
			len(testset.parsed_sents()), corpusdir, testcorpus)
	logging.info('%d test sentences before length restriction',
			len(list(gold_sents)[skip:skip + testnumsents]))
	lexmodel = None
	test_tagged_sents = gold_sents
	# --- POS tagging strategy --------------------------------------------
	if postagging and postagging['method'] in ('treetagger', 'stanford'):
		if postagging['method'] == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overriden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging['method'] == 'stanford':
			overridetags = ('PTKANT', )
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		# words that unambiguously carry an override tag in the train data
		overridetagdict = {tag:
			{word for word, tags in taglex.items() if tags == {tag}}
			for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		# FIX: a stray trailing comma here previously wrapped the
		# OrderedDict in a 1-tuple, which broke externaltagging().
		sents_to_tag = OrderedDict((a, b) for a, b
				in islice(gold_sents.items(), skip, skip + testnumsents)
				if len(b) <= testmaxwords)
		test_tagged_sents = externaltagging(postagging['method'],
				postagging['model'], sents_to_tag, overridetagdict, tagmap)
		# give these tags to parser
		usetags = True
	elif postagging and postagging['method'] == 'unknownword' and not rerun:
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		# get smoothed probalities for lexical productions
		lexresults, msg = getunknownwordmodel(
				train_tagged_sents, postagging['unknownwordfun'],
				postagging['unknownthreshold'],
				postagging['openclassthreshold'])
		logging.info(msg)
		simplelexsmooth = postagging['simplelexsmooth']
		if simplelexsmooth:
			lexmodel = lexresults[2:8]
		else:
			lexmodel, msg = getlexmodel(*lexresults)
			logging.info(msg)
		# NB: knownwords are all words in training set, lexicon is the subset
		# of words that are above the frequency threshold.
		# for training purposes we work with the subset, at test time we exploit
		# the full set of known words from the training set.
		sigs, knownwords, lexicon = lexresults[:3]
		postagging['sigs'], postagging['lexicon'] = sigs, knownwords
		# replace rare train words with signatures
		sents = replaceraretrainwords(train_tagged_sents,
				postagging['unknownwordfun'], lexicon)
		# make sure gold POS tags are not given to parser
		usetags = False
	elif postagging and postagging['method'] == 'unknownword' and rerun:
		usetags = False
	else:
		simplelexsmooth = False
		# give gold POS tags to parser
		usetags = True

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sentence because test sentences may be mangled by unknown word
	#   model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#   original treebank verbatim.
	testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
			gold_sents[a], block)) for a, block
			in islice(testset.blocks().items(), skip, skip + testnumsents)
			if len(test_tagged_sents[a]) <= testmaxwords)
	assert test_tagged_sents, 'test corpus should be non-empty'
	logging.info('%d test sentences after length restriction <= %d',
			len(testset), testmaxwords)

	if rerun:
		trees = []
		sents = []
	# train and test trees must agree on a single ROOT label
	toplabels = {tree.label for tree in trees} | {
			test_parsed_sents[n].label for n in testset}
	assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
	top = toplabels.pop()

	# --- grammar extraction / loading ------------------------------------
	if rerun:
		readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
				revmarkov, leftmostunary, rightmostunary, pospa, markhead,
				fanout_marks_before_bin, testmaxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top, relationalrealizational)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	# --- parsing & evaluation --------------------------------------------
	# NOTE(review): time.clock() was deprecated in 3.3 and removed in
	# Python 3.8; use time.perf_counter() when upgrading.
	begin = time.clock()
	parser = Parser(stages, transformations=transformations,
			tailmarker=tailmarker, postagging=postagging if postagging
			and postagging['method'] == 'unknownword' else None,
			relationalrealizational=relationalrealizational)
	results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt, morphology=morphology)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs', time.clock() - begin)
	for result in results[0]:
		nsent = len(result.parsetrees)
		header = (' ' + result.name.upper() + ' ').center(35, '=')
		evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
				for a, b in test_parsed_sents.items()), gold_sents,
				result.parsetrees, test_tagged_sents if usetags else gold_sents,
				evalparam)
		# widen the coverage column when any sentence exceeds CUTOFF_LEN
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if any(len(a) > evalparam['CUTOFF_LEN']
				for a in gold_sents.values()) else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
示例#7
0
def startexp(
		stages=(parser.DictObj(parser.DEFAULTSTAGE), ),  # see parser module
		corpusfmt='export',  # choices: export, (disc)bracket, alpino, tiger
		traincorpus=parser.DictObj(DEFAULTS['traincorpus']),
		testcorpus=parser.DictObj(DEFAULTS['testcorpus']),
		binarization=parser.DictObj(DEFAULTS['binarization']),
		removeempty=False,  # whether to remove empty terminals
		ensureroot=None,  # ensure every tree has a root node with this label
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		postagging=None,  # postagging: pass None to use tags from treebank.
		relationalrealizational=None,  # do not apply RR-transform
		evalparam='proper.prm',  # EVALB-style parameter file
		verbosity=2,
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	"""Execute an experiment.

	Reads the training and test corpora, extracts (or, with ``rerun``,
	reloads) grammars for each stage, parses the test set and logs
	EVALB-style evaluation results to stderr and ``resultdir/output.log``.

	:returns: the unique ROOT label shared by training and test trees.
	:raises ValueError: on an unexpected state of ``resultdir``, an invalid
		``verbosity``, an empty test selection, or a non-unique ROOT label.
	"""
	if rerun:
		if not os.path.exists(resultdir):
			raise ValueError('Directory %r does not exist.\n--rerun requires a'
					' directory with the grammar(s) of a previous experiment.'
					% resultdir)
	else:
		if os.path.exists(resultdir):
			raise ValueError('Directory %r exists.\n'
					'Use --rerun to parse with existing grammar '
					'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if verbosity == 0:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif verbosity == 1:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	elif verbosity == 2:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)
	elif 3 <= verbosity <= 4:
		# level 5 is below DEBUG: enables extra tracing output
		logging.basicConfig(level=5, format=formatstr)
	else:
		raise ValueError('verbosity should be >= 0 and <= 4. ')

	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	if not rerun:
		trees, sents, train_tagged_sents = loadtraincorpus(
				corpusfmt, traincorpus, binarization, punct, functions,
				morphology, removeempty, ensureroot, transformations,
				relationalrealizational)
	elif isinstance(traincorpus.numsents, float):
		raise ValueError('need to specify number of training set sentences, '
				'not fraction, in rerun mode.')

	# read the test corpus; a float numsents is a fraction of the corpus
	testsettb = treebank.READERS[corpusfmt](
			testcorpus.path, encoding=testcorpus.encoding,
			removeempty=removeempty, morphology=morphology,
			functions=functions, ensureroot=ensureroot)
	if isinstance(testcorpus.numsents, float):
		testcorpus.numsents = int(testcorpus.numsents
				* len(testsettb.blocks()))
	if testcorpus.skiptrain:
		# test set starts right after the training set in the same file
		testcorpus.skip += (  # pylint: disable=maybe-no-member
				traincorpus.numsents)  # pylint: disable=maybe-no-member

	# collect test items within the length restriction;
	# tagged sents pair each word with its gold POS (tree.pos() sorted).
	test_blocks = OrderedDict()
	test_trees = OrderedDict()
	test_tagged_sents = OrderedDict()
	for n, a in islice(testsettb._read_blocks(),
			testcorpus.skip, testcorpus.skip  # pylint: disable=maybe-no-member
				+ testcorpus.numsents):
		tree, sent = testsettb._parsetree(a)
		if 1 <= len(sent) <= testcorpus.maxwords:
			test_blocks[n] = testsettb._strblock(n, a)
			test_trees[n] = tree
			test_tagged_sents[n] = [(word, tag) for word, (_, tag)
					in zip(sent, sorted(tree.pos()))]
	logging.info('%d test sentences after length restriction <= %d',
			len(test_trees), testcorpus.maxwords)
	lexmodel = None
	simplelexsmooth = False
	# "mangled": test sentences as the parser will see them (possibly
	# re-tagged by an external tagger), as opposed to the gold tagging.
	test_tagged_sents_mangled = test_tagged_sents
	if postagging and postagging.method in ('treetagger', 'stanford', 'frog'):
		if postagging.method == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overriden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging.method == 'stanford':
			overridetags = ('PTKANT', )
		elif postagging.method == 'frog':
			overridetags = ()
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		# words that unambiguously carry an override tag in the train data
		overridetagdict = {tag:
			{word for word, tags in taglex.items() if tags == {tag}}
			for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		test_tagged_sents_mangled = lexicon.externaltagging(postagging.method,
				postagging.model, test_tagged_sents, overridetagdict, tagmap)
		if postagging.retag and not rerun:
			logging.info('re-tagging training corpus')
			sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
			train_tagged_sents = lexicon.externaltagging(postagging.method,
					postagging.model, sents_to_tag, overridetagdict,
					tagmap).values()
			# overwrite the POS labels in the training trees with the
			# externally assigned tags
			for tree, tagged in zip(trees, train_tagged_sents):
				for node in tree.subtrees(
						lambda n: len(n) == 1 and isinstance(n[0], int)):
					node.label = tagged[node[0]][1]
		usetags = True  # give these tags to parser
	elif postagging and postagging.method == 'unknownword':
		if not rerun:
			sents, lexmodel = getposmodel(postagging, train_tagged_sents)
			simplelexsmooth = postagging.simplelexsmooth
		usetags = False  # make sure gold POS tags are not given to parser
	else:
		usetags = True  # give gold POS tags to parser

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sents because test sentences may be mangled by unknown word model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#   original treebank verbatim.
	testset = OrderedDict((n, (
				test_tagged_sents_mangled[n],
				test_trees[n],
				test_tagged_sents[n],
				block))
			for n, block in test_blocks.items())
	if not test_tagged_sents:
		raise ValueError('test corpus (selection) should be non-empty.')

	if rerun:
		trees, sents = [], []
	# train and test trees must agree on a single ROOT label
	roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
	if len(roots) != 1:
		raise ValueError('expected unique ROOT label: %r' % roots)
	top = roots.pop()

	if rerun:
		parser.readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(dobinarization(trees, sents, binarization,
					relationalrealizational),
				sents, stages, testcorpus.maxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	# NOTE(review): time.clock() was deprecated in 3.3 and removed in
	# Python 3.8; use time.perf_counter() when upgrading.
	begin = time.clock()
	theparser = parser.Parser(stages, transformations=transformations,
			binarization=binarization, postagging=postagging if postagging
				and postagging.method == 'unknownword' else None,
			relationalrealizational=relationalrealizational,
			verbosity=verbosity)
	results = doparsing(parser=theparser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt, morphology=morphology,
			evalparam=evalparam)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs', time.clock() - begin)
	for result in results:
		nsent = len(result.parsetrees)
		# widen the coverage column when any sentence exceeds CUTOFF_LEN
		overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
				for a in test_tagged_sents.values())
		header = (' ' + result.name.upper() + ' ').center(
				44 if overcutoff else 35, '=')
		evalsummary = result.evaluator.summary()
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if overcutoff else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top