Exemplo n.º 1
0
	def postprocess(self, treestr, stage=-1):
		"""Parse a bracketed tree string and undo the parser's tree encoding.

		:param treestr: tree in bracket notation; leaves are parsed with ``int``.
		:param stage: index into ``self.stages``; the ``split`` attribute of
			the selected stage decides whether the ':'-marked binarization is
			undone and the result handed to ``mergediscnodes`` first.
		:returns: a tuple ``(tree, False)``; the second element is always
			``False`` in this method.
		"""
		tree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			# first pass: only the nodes introduced with ':' as child marker
			partial = unbinarize(tree, childchar=':', expandunary=False)
			mergediscnodes(partial)
		tailmarker = self.binarization.tailmarker
		saveheads(tree, tailmarker)
		unbinarize(tree, expandunary=False)
		removefanoutmarkers(tree)
		rrsettings = self.relationalrealizational
		if rrsettings:
			# rrbacktransform returns the tree to continue with
			tree = rrbacktransform(tree, rrsettings['adjunctionlabel'])
		transformations = self.transformations
		if transformations:
			reversetransform(tree, transformations)
		return tree, False
Exemplo n.º 2
0
	def postprocess(self, treestr, stage=-1, derivs=None):
		"""Parse a bracketed tree string and undo the parser's tree encoding.

		:param treestr: tree in bracket notation; leaves are parsed with ``int``.
		:param stage: index into ``self.stages``; the ``split`` attribute of
			the selected stage decides whether the ':'-marked binarization is
			undone and the result handed to ``mergediscnodes`` first.
		:param derivs: optional mapping that is queried with ``treestr``.
		:returns: a tuple ``(tree, fragments, False)`` where ``fragments`` is
			``derivs.get(treestr)``, or ``None`` when ``derivs`` is empty or
			not given.
		"""
		tree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			# first pass: only the nodes introduced with ':' as child marker
			partial = unbinarize(tree, childchar=':')
			mergediscnodes(partial)
		saveheads(tree, self.tailmarker)
		unbinarize(tree)
		removefanoutmarkers(tree)
		rrsettings = self.relationalrealizational
		if rrsettings:
			# rrbacktransform returns the tree to continue with
			tree = rrbacktransform(tree, rrsettings['adjunctionlabel'])
		transformations = self.transformations
		if transformations:
			reversetransform(tree, transformations)
		if derivs:
			fragments = derivs.get(treestr)
		else:
			fragments = None
		return tree, fragments, False
Exemplo n.º 3
0
def accept():
    """Save the accepted parse of a sentence and go on to the next one.

    Reads ``sentno`` (1-indexed) and optionally ``dec``, ``tree``, ``n``,
    ``require`` and ``block`` from the request arguments. When ``tree`` is
    given it is read in discbracket format; otherwise the sentence is parsed
    again by the user's worker and entry ``n - 1`` of the result is taken.
    The tree is written in 'export' format, logged, stored with
    ``addentry`` and submitted to the worker's ``augment``.

    :returns: a redirect to the next sentence, or a closing message when
        this was the last sentence.
    """
    # TODO (from the original author): should include n referring to which
    # n-best tree is to be accepted, or the tree in discbracket format if
    # the tree was manually edited.
    args = request.args
    sentno = int(args.get('sentno'))  # 1-indexed
    lineno = QUEUE[sentno - 1][0]
    sent = SENTENCES[lineno]
    username = session['username']
    actions = session['actions']
    # replace the stored start time by the elapsed number of seconds
    actions[TIME] = int(round(time() - actions[TIME]))
    if 'dec' in args:
        actions[DECTREE] += int(args.get('dec', 0))
    if 'tree' not in args:
        n = int(args.get('n', 0))
        require, block = parseconstraints(
            args.get('require', ''), args.get('block', ''))
        future = WORKERS[username].submit(
            worker.getparses, sent, require, block)
        senttok, parsetrees, _messages, _elapsed = future.result()
        tree = parsetrees[n - 1][1]
        # keep only the part of each label captured by LABELRE
        for node in tree.subtrees():
            node.label = LABELRE.match(node.label).group(1)
    else:
        n = 0
        tree, senttok = discbrackettree(args.get('tree'))
        reversetransform(tree, senttok, ('APPEND-FUNC', 'addCase'))
    actions[NBEST] = n
    # the actions object was changed in place; flag the session explicitly
    session.modified = True
    comment = '%s %r' % (username, actions)
    exported = writetree(tree, senttok, str(lineno + 1), 'export',
                         comment=comment)
    app.logger.info(exported)
    addentry(lineno, exported, actions)
    WORKERS[username].submit(worker.augment, [tree], [senttok])
    flash('Your annotation for sentence %d was stored %r' % (sentno, actions))
    if sentno < len(SENTENCES):
        return redirect(url_for('annotate', sentno=sentno + 1))
    return 'THANK YOU. THAT WAS THE LAST SENTENCE.'
Exemplo n.º 4
0
def test_transforms():
    """Test reversibility of Tiger transformations.

    Reads 'alpinosample.export' twice, applies the transformations
    ('S-RC', 'VP-GF', 'NP') to one copy, reverses them again and compares
    the bracketings of the result with those of the untouched copy for the
    first 100 trees. Mismatches are printed together with the sentence;
    a summary of matches is printed at the end. Nothing is asserted.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
      bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(),
                              nn.sents().values())
    ]
    print('\ntransformed')
    correct = exact = e = 0
    # a: original tree, b: its sentence, c: transformed tree,
    # d: running index (same value as the counter e).
    for a, b, c, d in islice(
            zip(n.trees().values(),
                n.sents().values(), trees, count()), 100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        z = -1  # 825; set to a sentence index to force printing that one
        if c1 != c2 or e == z:
            # NOTE(review): c1 stems from the original tree, c2 from the
            # round-tripped one; confirm the precision/recall naming.
            precision = len(set(c1) & set(c2)) / len(set(c1))
            recall = len(set(c1) & set(c2)) / len(set(c2))
            if precision != 1.0 or recall != 1.0 or d == z:
                # encode() yields bytes on Python 3, which str.join rejects
                # with a TypeError; decode back so the escaped token can be
                # joined with the index.
                print(
                    d, ' '.join(
                        ':'.join((str(idx),
                                  word.encode('unicode-escape').decode(
                                      'ascii')))
                        for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2), 'gold-transformed',
                      set(c2) - set(c1), 'transformed-gold',
                      set(c1) - set(c2))
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
Exemplo n.º 5
0
def test_transforms():
	"""Test reversibility of Tiger transformations.

	Reads 'alpinosample.export' twice, applies the transformations
	('S-RC', 'VP-GF', 'NP') to one copy, reverses them again and compares
	the bracketings of the result with those of the untouched copy for the
	first 100 trees. Mismatches are printed together with the sentence;
	a summary of matches is printed at the end. Nothing is asserted.
	"""
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(),
				nn.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	# a: original tree, b: transformed tree, c: sentence; d counts trees.
	for a, b, c in islice(zip(n.trees().values(),
			trees, n.sents().values()), 100):
		transformb = reversetransform(b.copy(True), transformations)
		b1 = bracketings(canonicalize(a))
		b2 = bracketings(canonicalize(transformb))
		z = -1  # 825; set to a sentence index to force printing that one
		if b1 != b2 or d == z:
			# NOTE(review): b1 stems from the original tree, b2 from the
			# round-tripped one; confirm the precision/recall naming.
			precision = len(set(b1) & set(b2)) / len(set(b1))
			recall = len(set(b1) & set(b2)) / len(set(b2))
			if precision != 1.0 or recall != 1.0 or d == z:
				# encode() yields bytes on Python 3, which str.join rejects
				# with a TypeError; decode back so the escaped token can be
				# joined with the index.
				print(d, ' '.join(':'.join((str(idx),
					word.encode('unicode-escape').decode('ascii')))
					for idx, word in enumerate(c)))
				print('no match', precision, recall)
				print(len(b1), len(b2), 'gold-transformed', set(b2) - set(b1),
						'transformed-gold', set(b1) - set(b2))
				print(a)
				print(transformb)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	print('matches', correct, '/', d, 100 * correct / d, '%')
	print('exact', exact)
Exemplo n.º 6
0
 def test_transform(self):
     """Check that several transformations can be reversed.

     For each set of transformations, the trees of 'alpinosample.export'
     are transformed and reversed again; the bracketings of the first
     100 results must equal those of the untransformed trees.
     """
     from discodop.treebanktransforms import transform, reversetransform, \
       bracketings
     from discodop.treebank import NegraCorpusReader
     n = NegraCorpusReader('alpinosample.export')
     candidates = (
         ('FUNC-NODE', ),
         ('MORPH-NODE', ),
         ('LEMMA-NODE', ),
         ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'),
     )
     for transformations in candidates:
         # a fresh reader per iteration supplies the trees to transform
         fresh = NegraCorpusReader('alpinosample.export')
         pairs = zip(fresh.trees().values(), fresh.sents().values())
         transformed = []
         for tree, sent in pairs:
             transformed.append(transform(tree, sent, transformations))
         originals = n.trees().values()
         for a, b in islice(zip(originals, transformed), 100):
             before = bracketings(canonicalize(a))
             # reverse on a copy of the transformed tree
             restored = reversetransform(b.copy(True), transformations)
             after = bracketings(canonicalize(restored))
             assert before == after, (
                 'mismatch with %r\nbefore: %r\nafter: %r' %
                 (transformations, before, after))
Exemplo n.º 7
0
	def test_transform(self):
		"""Check that several transformations can be reversed.

		For each set of transformations, the trees of 'alpinosample.export'
		are transformed and reversed again; the bracketings of the first
		100 results must equal those of the untransformed trees.
		"""
		from discodop.treebanktransforms import transform, reversetransform, \
				bracketings
		from discodop.treebank import NegraCorpusReader
		n = NegraCorpusReader('alpinosample.export')
		candidates = (
				('FUNC-NODE', ),
				('MORPH-NODE', ),
				('LEMMA-NODE', ),
				('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'))
		for transformations in candidates:
			# a fresh reader per iteration supplies the trees to transform
			fresh = NegraCorpusReader('alpinosample.export')
			pairs = zip(fresh.trees().values(), fresh.sents().values())
			transformed = []
			for tree, sent in pairs:
				transformed.append(transform(tree, sent, transformations))
			originals = n.trees().values()
			for a, b in islice(zip(originals, transformed), 100):
				before = bracketings(canonicalize(a))
				# reverse on a copy of the transformed tree
				restored = reversetransform(b.copy(True), transformations)
				after = bracketings(canonicalize(restored))
				assert before == after, (
						'mismatch with %r\nbefore: %r\nafter: %r' % (
						transformations, before, after))
Exemplo n.º 8
0
def main():
	"""Command line interface for applying tree(bank) transforms.

	Expects one to three positional arguments: the action, an input file
	and an output file; a missing file or '-' selects /dev/stdin or
	/dev/stdout. Reads the corpus with the reader selected by --inputfmt,
	optionally slices, filters by length and renumbers the trees, applies
	the transformation belonging to the action, and writes the result in
	the format given by --outputfmt. Exits with status 2 on usage errors.

	:raises ValueError: when a dependency output format ('mst', 'conll')
		is requested without --headrules.
	"""
	import io
	from getopt import gnu_getopt, GetoptError
	from discodop import treebanktransforms
	# actions mapped to None are either no-ops or are set up further below
	# because they need command line options.
	actions = {'none': None, 'introducepreterminals': introducepreterminals,
			'splitdisc': None, 'mergedisc': mergediscnodes, 'transform': None,
			'unbinarize': unbinarize, 'binarize': None, 'optimalbinarize': None}
	# NOTE(review): 'markorigin' is listed both as a flag here and as an
	# option taking a value ('markorigin=') below.
	flags = ('markorigin markheads leftunary rightunary tailmarker '
			'renumber reverse'.split())
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= '
			'markorigin= maxlen= fmt= enc= transforms=').split()
	try:
		opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
		if not 1 <= len(args) <= 3:
			raise GetoptError('error: expected 1, 2, or 3 positional arguments')
	except GetoptError as err:
		print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
		sys.exit(2)
	opts, action = dict(opts), args[0]
	if action not in actions:
		print('unrecognized action: %r\navailable actions: %s' % (
				action, ', '.join(actions)), file=sys.stderr)
		sys.exit(2)
	# --fmt and --enc are shorthands that set both the input and output side
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
		print('unrecognized output format: %r\navailable formats: %s' % (
				opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
		sys.exit(2)
	infilename = args[1] if len(args) >= 2 and args[1] != '-' else '/dev/stdin'
	outfilename = args[2] if len(args) == 3 and args[2] != '-' else '/dev/stdout'

	# open corpus
	corpus = READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf-8'),
			headrules=opts.get('--headrules'), markheads='--markheads' in opts,
			ensureroot=opts.get('--ensureroot'), punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	# --slice has the form 'start:end'; either side may be left empty
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, (tree, sent)) for key, (tree, sent) in trees
				if len(sent) <= maxlen)
	if '--renumber' in opts:
		# replace the original keys by consecutive numbers starting at 1
		trees = (('%8d' % n, treesent)
				for n, (_, treesent) in enumerate(trees, 1))

	# select transformation; every transform is called as f(tree, sent)
	transform = actions[action]
	if action in ('binarize', 'optimalbinarize'):
		h = int(opts.get('-h', 999))
		v = int(opts.get('-v', 1))
		if action == 'binarize':
			factor = opts.get('--factor', 'right')
			transform = lambda t, _: binarize(t, factor, h, v,
					leftmostunary='--leftunary' in opts,
					rightmostunary='--rightunary' in opts,
					tailmarker='$' if '--tailmarker' in opts else '')
		elif action == 'optimalbinarize':
			headdriven = '--headrules' in opts
			transform = lambda t, _: optimalbinarize(t, '|', headdriven, h, v)
	elif action == 'splitdisc':
		transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
	elif action == 'unbinarize':
		transform = lambda t, _: unbinarize(Tree.convert(t))
	elif action == 'transform':
		tfs = opts['--transforms'].split(',')
		transform = lambda t, s: (treebanktransforms.reversetransform(t, tfs)
				if '--reverse' in opts
				else treebanktransforms.transform(t, s, tfs))
	if transform is not None:  # NB: transform cannot affect (no. of) terminals
		trees = ((key, (transform(tree, sent), sent)) for key, (tree, sent) in trees)

	# read, transform, & write trees
	headrules = None
	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
		headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
	cnt = 0
	if opts.get('--outputfmt') == 'dact':
		import alpinocorpus
		outfile = alpinocorpus.CorpusWriter(outfilename)
		# copy blocks verbatim when nothing but slicing/renumbering is asked
		if (action == 'none' and opts.get('--inputfmt') in ('alpino', 'dact')
				and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
				'--renumber'}):
			for n, (key, block) in islice(enumerate(
					corpus.blocks().items(), 1), start, end):
				outfile.write('%8d' % n if '--renumber' in opts else key, block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
				cnt += 1
	else:
		# The option is stored under '--outputenc' (getopt keeps the dashes);
		# looking up 'outputenc' always fell back to utf-8.
		encoding = opts.get('--outputenc', 'utf-8')
		# the context manager ensures the output is flushed and closed
		with io.open(outfilename, 'w', encoding=encoding) as outfile:
			# copy trees verbatim when only taking slice or converting encoding
			if (action == 'none' and opts.get('--inputfmt') == opts.get(
					'--outputfmt') and set(opts) <= {'--slice', '--inputenc',
					'--outputenc', '--inputfmt', '--outputfmt'}):
				for block in islice(corpus.blocks().values(), start, end):
					outfile.write(block)
					cnt += 1
			else:
				for key, (tree, sent) in trees:
					outfile.write(writetree(tree, sent, key,
							opts.get('--outputfmt', 'export'), headrules))
					cnt += 1
	print('%sed %d trees with action %r' % ('convert' if action == 'none'
			else 'transform', cnt, action), file=sys.stderr)