Exemplo n.º 1
0
def annotate(sentno):
    """Serve the main annotation page for a sentence.

    :param sentno: 1-based index into ``QUEUE``. The special value ``-1``
        redirects to the sentence returned by ``firstunannotated()`` for
        the logged-in user.
    :returns: a redirect response (for ``sentno == -1``, or to the edit
        view when ``getannotation()`` yields an existing annotation), and
        otherwise the rendered ``annotate.html`` template.
    """
    username = session['username']
    if sentno == -1:
        sentno = firstunannotated(username)
        # The response must be returned; merely constructing it does not
        # redirect the client.
        return redirect(url_for('annotate', sentno=sentno))
    # Reset the per-sentence action counters; the last slot holds the
    # time at which the page was served.
    session['actions'] = [0, 0, 0, 0, 0, 0, 0, time()]
    lineno = QUEUE[sentno - 1][0]
    sent = SENTENCES[lineno]
    senttok, _ = worker.postokenize(sent)
    annotation, n = getannotation(username, lineno)
    if annotation is not None:
        # An annotation already exists: hand it over to the edit view.
        item = exporttree(annotation.splitlines(), functions='add')
        canonicalize(item.tree)
        worker.domorph(item.tree)
        tree = writediscbrackettree(item.tree, item.sent)
        return redirect(
            url_for('edit', sentno=sentno, annotated=1, tree=tree, n=n))
    return render_template('annotate.html',
                           prevlink=str(sentno - 1) if sentno > 1 else '#',
                           nextlink=str(sentno +
                                        1) if sentno < len(SENTENCES) else '#',
                           sentno=sentno,
                           lineno=lineno + 1,
                           totalsents=len(SENTENCES),
                           numannotated=numannotated(username),
                           annotationhelp=ANNOTATIONHELP,
                           sent=' '.join(senttok))
Exemplo n.º 2
0
def test_optimalbinarize():
	"""Compare optimal binarizations with regular binarizations.

	For every examined tree that has a node whose fan-out is not 1, the
	maximum ``complexityfanout`` over the subtrees of the optimal
	binarization must not exceed that of the regular binarization; this
	is checked both without and with head-driven markovization."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader

	def worstcase(binarized):
		"""Return the largest complexityfanout value of any subtree."""
		return max(map(complexityfanout, binarized.subtrees()))

	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	# NOTE(review): the slice drops the last 2000 trees; confirm that this
	# leaves anything to test for a small sample corpus.
	candidates = zip(
			list(corpus.trees().values())[:-2000], corpus.sents().values())
	for n, (tree, sent) in enumerate(candidates):
		if all(fanout(node) == 1 for node in addbitsets(tree).subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1

		# comparison without head-driven binarization
		optbin = optimalbinarize(tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		optcost, normcost = worstcase(optbin), worstcase(normbin)
		if optcost > normcost:
			print('non-hd\n', tree)
			print(optcost, optbin)
			print(normcost, normbin, '\n')
			violations += 1

		# comparison with head-driven binarization
		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		optcost, normcost = worstcase(optbin), worstcase(normbin)
		if optcost > normcost:
			print('hd\n', tree)
			print(optcost, optbin)
			print(normcost, normbin, '\n')
			violationshd += 1
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Exemplo n.º 3
0
def test_optimalbinarize():
	"""Compare optimal binarizations with regular binarizations.

	For every examined tree that has a node whose fan-out is not 1, the
	maximum ``complexityfanout`` over the subtrees of the optimal
	binarization must not exceed that of the regular binarization; this
	is checked both without and with head-driven markovization."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader

	def highest(binarized):
		"""Return the largest complexityfanout value of any subtree."""
		return max(map(complexityfanout, binarized.subtrees()))

	def report(header, tree, optcost, optbin, normcost, normbin):
		"""Print the details of a single violation."""
		print(header, tree)
		print(optcost, optbin)
		print(normcost, normbin, '\n')

	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	# NOTE(review): the slice drops the last 2000 trees; confirm that this
	# leaves anything to test for a small sample corpus.
	selected = list(corpus.trees().values())[:-2000]
	for n, (tree, sent) in enumerate(zip(selected, corpus.sents().values())):
		if all(fanout(node) == 1 for node in addbitsets(tree).subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=False, h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		optcost, normcost = highest(optbin), highest(normbin)
		if optcost > normcost:
			report('non-hd\n', tree, optcost, optbin, normcost, normbin)
			violations += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		optcost, normcost = highest(optbin), highest(normbin)
		if optcost > normcost:
			report('hd\n', tree, optcost, optbin, normcost, normbin)
			violationshd += 1
	print('opt. bin. violations normal: %d / %d;  hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
Exemplo n.º 4
0
 def generate(url):
     """Stream the post-login page while a parser worker is prepared.

     Yields HTML fragments. If a process pool is already registered in
     ``WORKERS`` for ``username`` (a name taken from the enclosing scope)
     and it still responds, the client is redirected to ``url`` right
     away. Otherwise a new single-process pool is created, the grammar is
     loaded in it, the worker is augmented with previously annotated
     sentences (if any), the pool is registered and the redirect is sent.

     :param url: the location the client is sent to once a worker is ready.
     """
     redirectscript = "<script>window.location.replace('%s');</script>" % url
     yield ('<!doctype html>'
            '<title>redirect</title>'
            'You were logged in successfully. ')
     if username in WORKERS:
         try:
             # Any round trip will do; it only probes whether the pool
             # is still alive.
             WORKERS[username].submit(worker.getprop, 'headrules').result()
         except BrokenProcessPool:
             pass  # fall through and start a fresh pool
         else:
             yield redirectscript
             return
     yield 'Loading grammar; this will take a few seconds ...'
     lang = os.path.basename(app.config['GRAMMAR'])
     app.logger.info('Loading grammar %r', lang)
     pool = ProcessPoolExecutor(max_workers=1)
     try:
         pool.submit(worker.loadgrammar, app.config['GRAMMAR'],
                     app.config['LIMIT']).result()
         app.logger.info('Grammar %r loaded.', lang)
         # train on annotated sentences
         annotations = readannotations()
         if annotations:
             app.logger.info('training on %d previously annotated sentences',
                             len(annotations))
             trees, sents = [], []
             headrules = pool.submit(worker.getprop, 'headrules').result()
             for block in annotations.values():
                 item = exporttree(block.splitlines())
                 canonicalize(item.tree)
                 applyheadrules(item.tree, headrules)
                 trees.append(item.tree)
                 sents.append(item.sent)
             pool.submit(worker.augment, trees, sents).result()
     except Exception:
         # The pool is not registered yet, so nobody else could clean up
         # its worker process if setup fails.
         pool.shutdown(wait=False)
         raise
     WORKERS[username] = pool
     yield redirectscript
Exemplo n.º 5
0
def test_transforms():
    """Test reversibility of Tiger transformations.

    Transforms every tree of the sample corpus, reverses the
    transformation for the first 100 trees and compares the bracketings
    with those of the untouched trees. Mismatches are reported on stdout
    together with summary counts; nothing is asserted.
    """
    from discodop.treebanktransforms import transform, reversetransform, \
      bracketings
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two independent readers: one supplies the reference trees, the
    # other the trees handed to transform().
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(),
                              nn.sents().values())
    ]
    print('\ntransformed')
    correct = exact = e = 0
    for a, b, c, d in islice(
            zip(n.trees().values(),
                n.sents().values(), trees, count()), 100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        z = -1  # 825
        if c1 != c2 or e == z:
            gold, restored = set(c1), set(c2)
            common = len(gold & restored)
            precision = common / len(gold)
            recall = common / len(restored)
            if precision != 1.0 or recall != 1.0 or d == z:
                # encode() returns bytes on Python 3; decode back so the
                # escaped word can be joined with the str index.
                print(
                    d, ' '.join(
                        ':'.join((
                            str(idx),
                            word.encode('unicode-escape').decode('ascii')))
                        for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2), 'gold-transformed',
                      restored - gold, 'transformed-gold',
                      gold - restored)
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
Exemplo n.º 6
0
def testtransforms():
	""" Test whether the Tiger transformations (transform / reversetransform)
	are reversible.

	Transforms every tree of the sample corpus, reverses the transformation
	for the first 100 trees and compares the bracketings with those of the
	untouched trees. Mismatches are reported on stdout together with summary
	counts; nothing is asserted. """
	from discodop.treetransforms import canonicalize
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	# Two independent readers: one supplies the reference trees, the other
	# the trees handed to transform().
	n = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.parsed_sents().values(),
				nn.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	for a, b, c in islice(zip(n.parsed_sents().values(),
			trees, n.sents().values()), 100):
		transformb = reversetransform(b.copy(True), transformations)
		b1 = bracketings(canonicalize(a))
		b2 = bracketings(canonicalize(transformb))
		z = -1  # 825
		if b1 != b2 or d == z:
			gold, restored = set(b1), set(b2)
			common = len(gold & restored)
			precision = common / len(gold)
			recall = common / len(restored)
			if precision != 1.0 or recall != 1.0 or d == z:
				# encode() returns bytes on Python 3; decode back so the
				# escaped word can be joined with the str index.
				print(d, ' '.join(':'.join((str(idx),
					word.encode('unicode-escape').decode('ascii')))
					for idx, word in enumerate(c)))
				print('no match', precision, recall)
				print(len(b1), len(b2), 'gold-transformed', restored - gold,
						'transformed-gold', gold - restored)
				print(a)
				print(transformb)
				handlefunctions('add', a)
				print(a)
				print(b)
				print()
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	print('matches', correct, '/', d, 100 * correct / d, '%')
	print('exact', exact)
Exemplo n.º 7
0
def test_transforms():
	"""Test reversibility of Tiger transformations.

	Transforms every tree of the sample corpus, reverses the transformation
	for the first 100 trees and compares the bracketings with those of the
	untouched trees. Mismatches are reported on stdout together with summary
	counts; nothing is asserted."""
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	# Two independent readers: one supplies the reference trees, the other
	# the trees handed to transform().
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(),
				nn.sents().values())]
	print('\ntransformed')
	correct = exact = e = 0
	for a, b, c, d in islice(zip(n.trees().values(),
			n.sents().values(), trees, count()), 100):
		transformc = reversetransform(c.copy(True), transformations)
		c1 = bracketings(canonicalize(a))
		c2 = bracketings(canonicalize(transformc))
		z = -1  # 825
		if c1 != c2 or e == z:
			gold, restored = set(c1), set(c2)
			common = len(gold & restored)
			precision = common / len(gold)
			recall = common / len(restored)
			if precision != 1.0 or recall != 1.0 or d == z:
				# encode() returns bytes on Python 3; decode back so the
				# escaped word can be joined with the str index.
				print(d, ' '.join(':'.join((str(idx),
					word.encode('unicode-escape').decode('ascii')))
					for idx, word in enumerate(b)))
				print('no match', precision, recall)
				print(len(c1), len(c2), 'gold-transformed', restored - gold,
						'transformed-gold', gold - restored)
				print(a)
				print(transformc)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		e += 1
	print('matches', correct, '/', e, 100 * correct / e, '%')
	print('exact', exact)
Exemplo n.º 8
0
	def test_transform(self):
		"""Check that each set of transformations below can be reversed.

		For the first 100 trees of the sample corpus, the bracketings of the
		original tree must equal those obtained after applying ``transform``
		followed by ``reversetransform``."""
		from discodop.treebanktransforms import transform, reversetransform, \
				bracketings
		from discodop.treebank import NegraCorpusReader
		reference = NegraCorpusReader('alpinosample.export')
		transformationsets = (
				('FUNC-NODE', ),
				('MORPH-NODE', ),
				('LEMMA-NODE', ),
				('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'))
		for transformations in transformationsets:
			# a fresh reader supplies the trees handed to transform()
			fresh = NegraCorpusReader('alpinosample.export')
			transformed = []
			for tree, sent in zip(fresh.trees().values(),
					fresh.sents().values()):
				transformed.append(transform(tree, sent, transformations))
			pairs = zip(reference.trees().values(), transformed)
			for original, changed in islice(pairs, 100):
				before = bracketings(canonicalize(original))
				restored = reversetransform(changed.copy(True), transformations)
				after = bracketings(canonicalize(restored))
				assert before == after, (
						'mismatch with %r\nbefore: %r\nafter: %r' % (
						transformations, before, after))
Exemplo n.º 9
0
 def test_transform(self):
     """Check that each set of transformations below can be reversed.

     For the first 100 trees of the sample corpus, the bracketings of the
     original tree must equal those obtained after applying ``transform``
     followed by ``reversetransform``.
     """
     from discodop.treebanktransforms import transform, reversetransform, \
       bracketings
     from discodop.treebank import NegraCorpusReader
     reference = NegraCorpusReader('alpinosample.export')
     single = [('FUNC-NODE', ), ('MORPH-NODE', ), ('LEMMA-NODE', )]
     combined = ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')
     for transformations in single + [combined]:
         # a fresh reader supplies the trees handed to transform()
         fresh = NegraCorpusReader('alpinosample.export')
         transformed = []
         for tree, sent in zip(fresh.trees().values(),
                               fresh.sents().values()):
             transformed.append(transform(tree, sent, transformations))
         pairs = zip(reference.trees().values(), transformed)
         for original, changed in islice(pairs, 100):
             before = bracketings(canonicalize(original))
             restored = reversetransform(changed.copy(True), transformations)
             after = bracketings(canonicalize(restored))
             assert before == after, (
                 'mismatch with %r\nbefore: %r\nafter: %r' %
                 (transformations, before, after))
Exemplo n.º 10
0
def parse(args, stdinput):
    """Induce a grammar from a corpus and parse a single sentence with it.

    The derivation is printed; when the corpus contains a tree whose
    space-joined sentence equals the input, that gold tree and the
    accuracy of the derivation against it are printed as well.

    Parameters
    ----------
    args : list(str)
        ``args[0]`` is the corpus file (read with ``TigerXMLCorpusReader``),
        ``args[1]`` the sentence to parse.
    stdinput : list(str)
        Serialized pruning policy. When empty, a default ``PruningPolicy()``
        is used instead.

    """
    corpus = TigerXMLCorpusReader(args[0], encoding='utf8')
    sent = args[1]

    # grammar induction; gold trees are those of identical sentences
    trees = []
    for corpustree in corpus.trees().values():
        trees.append(ImmutableTree.convert(canonicalize(corpustree)))
    sentences = list(corpus.sents().values())
    grammar = Grammar(trees, sentences)
    goldtrees = [
        goldtree for words, goldtree in zip(sentences, trees)
        if ' '.join(words) == sent
    ]

    # pruning policy: deserialized from input if given, default otherwise
    if stdinput:
        pp = deserialize(stdinput, FEATURES, grammar)
    else:
        pp = PruningPolicy()

    # derive a tree for the sentence
    derivationtree = Parser(grammar).parse(sent, pp).get_tree()
    stdout.flush()

    # anything other than a Tree is treated as an error message
    parsed = isinstance(derivationtree, Tree)
    if parsed:
        print(derivationtree.pprint())
        drawtree = DrawTree(derivationtree, sent.split())
        print("\n derivation tree: \n" + drawtree.text())
    else:
        print(derivationtree)

    if not goldtrees:
        return
    drawgold = DrawTree(goldtrees[0], sent.split())
    print("\n gold tree: \n" + drawgold.text())
    if parsed:
        # both trees are available, so they can be compared
        print("\n recall: %f" % accuracy(derivationtree, goldtrees[0]))
Exemplo n.º 11
0
def dobinarization(trees, sents, binarization, relationalrealizational):
	"""Apply binarization.

	:param trees: list of trees; with method ``'default'`` they are
		binarized in place, the other methods build new trees.
	:param sents: sentences parallel to ``trees``; only used for logging.
	:param binarization: object with the attributes read below (``method``,
		``factor``, ``h``, ``v``, ``tailmarker``, ...). ``method`` is one of
		``None``, ``'default'``, ``'optimal'``, ``'optimalhead'``.
	:param relationalrealizational: falsy, or a mapping with the keys
		``'ignorefunctions'`` and ``'adjunctionlabel'`` from which the
		``filterfuncs`` argument of ``binarize`` is built.
	:returns: list of canonicalized, frozen trees with fan-out markers."""
	# time.clock() was removed in Python 3.8; process_time() also reports
	# CPU time. Fall back to clock() on interpreters lacking process_time().
	clock = getattr(time, 'process_time', None) or time.clock
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = clock()
	msg = 'binarization: %s' % binarization.method
	if binarization.fanout_marks_before_bin:
		trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	if binarization.method is None:
		pass
	elif binarization.method == 'default':
		msg += ' %s h=%d v=%d %s' % (
				binarization.factor, binarization.h, binarization.v,
				'tailmarker' if binarization.tailmarker else '')
		# identical for every tree, so build it once
		filterfuncs = ((relationalrealizational['ignorefunctions']
				+ (relationalrealizational['adjunctionlabel'], ))
				if relationalrealizational else ())
		for a in trees:
			treetransforms.binarize(a, factor=binarization.factor,
					tailmarker=binarization.tailmarker,
					horzmarkov=binarization.h, vertmarkov=binarization.v,
					leftmostunary=binarization.leftmostunary,
					rightmostunary=binarization.rightmostunary,
					reverse=binarization.revmarkov,
					headidx=-1 if binarization.markhead else None,
					filterfuncs=filterfuncs,
					labelfun=binarization.labelfun)
	elif binarization.method == 'optimal':
		trees = [Tree.convert(treetransforms.optimalbinarize(tree))
				for tree in trees]
	elif binarization.method == 'optimalhead':
		msg += ' h=%d v=%d' % (
				binarization.h, binarization.v)
		trees = [Tree.convert(treetransforms.optimalbinarize(
				tree, headdriven=True, h=binarization.h, v=binarization.v))
				for tree in trees]
	trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	logging.info('%s; cpu time elapsed: %gs',
			msg, clock() - begin)
	trees = [treetransforms.canonicalize(a).freeze() for a in trees]
	return trees
Exemplo n.º 12
0
def replacesubtree():
    """Re-draw tree after re-parsing the span of one node.

    Reads ``n``, ``sentno``, ``tree`` and ``nodeid`` from the request
    arguments. The words below the selected node are parsed again by the
    user's worker, with the node's label passed as ``root``; parse number
    ``n`` replaces the node's children. Returns the ``ValueError`` message
    as plain text when ``validate()`` rejects the submitted tree; otherwise
    markup consisting of an accept link, the drawn tree and the tree string.
    """
    choice = int(request.args.get('n', 0))
    sentno = int(request.args.get('sentno'))  # 1-indexed
    lineno = QUEUE[sentno - 1][0]
    senttok, _ = worker.postokenize(SENTENCES[lineno])
    username = session['username']
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    error = ''
    dt = DrawTree(tree, senttok)
    rawid = request.args.get('nodeid', '').lstrip('t')
    _treeid, nodeid = rawid.split('_')
    nodeid = int(nodeid)
    target = dt.nodes[nodeid]
    subseq = sorted(target.leaves())
    subsent = ' '.join(senttok[idx] for idx in subseq)
    future = WORKERS[username].submit(
        worker.getparses, subsent, (), (), root=target.label)
    _senttok, parsetrees, _messages, _elapsed = future.result()
    newsubtree = parsetrees[choice - 1][1]

    def isposnode(node):
        """Tell whether the first child of node is a terminal index."""
        return isinstance(node[0], int)

    # Map the terminal indices of the new subtree, in ascending order, back
    # onto the positions of the original span.
    pos = sorted(newsubtree.subtrees(isposnode), key=lambda node: node[0])
    for rank, posnode in enumerate(pos):
        posnode[0] = subseq[rank]
    dt.nodes[nodeid][:] = newsubtree[:]
    tree = canonicalize(dt.nodes[0])
    dt = DrawTree(tree, senttok)  # kludge..
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    session['actions'][REPARSE] += 1
    session.modified = True
    query = urlencode(dict(sentno=sentno, tree=treestr))
    link = '<a href="/annotate/accept?%s">accept this tree</a>' % query
    drawing = dt.text(unicodelines=True,
                      html=True,
                      funcsep='-',
                      morphsep='/',
                      nodeprops='t0')
    return Markup('%s\n\n%s%s\t%s' % (link, error, drawing, treestr))
Exemplo n.º 13
0
def train(args, stdinput):
    """Train a given pruning policy and write the result to stdout.

    Parameters
    ----------
    args : list(str)
        Positional arguments; all but the first are optional, and an empty
        string selects the default for arguments 1 to 4:
        0. corpus file (read with ``TigerXMLCorpusReader``);
        1. number of iterations (default 1);
        2. weight (default 1);
        3. maximum sentence length to train on (default ``maxsize``);
        4. maximum number of sentences to train on (default ``maxsize``);
        5. ``-``-separated list passed to ``lols`` as its last argument
           (default None).
    stdinput : list(str)
        The initial pruning policy as std input.

    """

    def optional(index, convert, default):
        """Convert args[index] if present and non-empty, else default."""
        if len(args) > index and args[index]:
            return convert(args[index])
        return default

    # read the parameters
    pp = PruningPolicy(stdinput)
    corpus = TigerXMLCorpusReader(args[0], encoding='utf8')
    iterations = optional(1, int, 1)
    weight = optional(2, float, 1)
    slength = optional(3, int, maxsize)
    nsents = optional(4, int, maxsize)
    feats = None
    if len(args) >= 6:
        feats = str(args[5]).split('-')

    # grammar and the (sentence, tree) pairs to train on
    trees = []
    for corpustree in corpus.trees().values():
        trees.append(ImmutableTree.convert(canonicalize(corpustree)))
    sentences = list(corpus.sents().values())
    grammar = Grammar(trees, sentences)
    simplecorpus = [
        (words, goldtree) for words, goldtree in zip(sentences, trees)
        if len(words) <= slength
    ]
    if nsents:
        simplecorpus = simplecorpus[:nsents]

    # emit the trained pruning policy on stdout
    newpp = lols(grammar, simplecorpus, pp, iterations, weight, feats)
    stdout.write(newpp.serialize())
Exemplo n.º 14
0
def main():
	"""Command line interface to create grammars from treebanks.

	Reads ``sys.argv``. The first positional argument selects the model:
	``info`` prints statistics for a grammar, ``merge`` combines rules,
	lexicon or fragment files, ``param`` extracts the grammars described
	by a parameter file into a new directory, and the remaining models
	(``pcfg``, ``plcfrs``, ``dopreduction``, ``doubledop``, ``ptsg``) read
	an input file and write ``<grammarfile>.rules`` and
	``<grammarfile>.lex`` (plus a backtransform file for ``doubledop`` and
	``ptsg``).

	Exits with status 2 on malformed command lines; raises ValueError for
	unrecognized models or estimators and for inconsistent options."""
	from getopt import gnu_getopt, GetoptError
	from discodop.treetransforms import addfanoutmarkers, canonicalize
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	shortoptions = 's:'
	options = 'gzip packed bitpar inputfmt= inputenc= dopestimator= numproc='
	try:
		opts, args = gnu_getopt(sys.argv[1:], shortoptions, options.split())
		model = args[0]
		if model not in ('info', 'merge'):
			treebankfile, grammarfile = args[1:]
	except (GetoptError, IndexError, ValueError) as err:
		print('error: %r\n%s' % (err, USAGE))
		sys.exit(2)
	opts = dict(opts)
	if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'ptsg',
			'param', 'info', 'merge'):
		raise ValueError('unrecognized model: %r' % model)
	# getopt reports long options including their leading dashes, so the
	# key to look up is '--dopestimator'.
	if opts.get('--dopestimator', 'rfe') not in ('rfe', 'ewe', 'shortest'):
		raise ValueError('unrecognized estimator: %r' % opts['--dopestimator'])

	if model == 'info':
		grammarstats(args[1])
		return
	elif model == 'merge':
		if len(args) < 5:
			raise ValueError('need at least 2 input and 1 output arguments.')
		if args[1] == 'rules':
			merge(args[2:-1], args[-1], sumrules, stripweight)
		elif args[1] == 'lexicon':
			merge(args[2:-1], args[-1], sumlex, lambda x: x.split(None, 1)[0])
		elif args[1] == 'fragments':
			merge(args[2:-1], args[-1], sumfrags, lambda x: x.rsplit('\t', 1)[0])
		return
	elif model == 'param':
		import os
		from discodop.runexp import readparam, loadtraincorpus, getposmodel
		from discodop.parser import DictObj
		if opts:
			raise ValueError('all options should be set in parameter file.')
		prm = DictObj(readparam(args[1]))
		resultdir = args[2]
		if os.path.exists(resultdir):
			raise ValueError('Directory %r already exists.\n' % resultdir)
		os.mkdir(resultdir)
		trees, sents, train_tagged_sents = loadtraincorpus(
				prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct,
				prm.functions, prm.morphology, prm.removeempty, prm.ensureroot,
				prm.transformations, prm.relationalrealizational)
		# Always bind lexmodel; it is passed to getgrammars() below even
		# when no unknown-word model is configured.
		# NOTE(review): assumes getgrammars() accepts None here -- confirm.
		lexmodel = None
		simplelexsmooth = False
		if prm.postagging and prm.postagging.method == 'unknownword':
			sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents)
			simplelexsmooth = prm.postagging.simplelexsmooth
	elif model == 'ptsg':  # read fragments
		with io.open(treebankfile,
				encoding=opts.get('--inputenc', 'utf8')) as fragmentfile:
			splittedlines = (line.split('\t') for line in fragmentfile)
			fragments = {(fields[0] if len(fields) == 2 else
					(fields[0], [a or None for a in fields[1].split(' ')])):
						convertweight(fields[-1]) for fields in splittedlines}
	else:  # read treebank
		corpus = READERS[opts.get('--inputfmt', 'export')](
				treebankfile,
				encoding=opts.get('--inputenc', 'utf8'))
		trees = list(corpus.trees().values())
		sents = list(corpus.sents().values())
		if not trees:
			raise ValueError('no trees; is --inputfmt correct?')
		for a in trees:
			canonicalize(a)
			addfanoutmarkers(a)

	# read off grammar
	if model in ('pcfg', 'plcfrs'):
		grammar = treebankgrammar(trees, sents)
	elif model == 'dopreduction':
		grammar, altweights = dopreduction(trees, sents,
				packedgraph='--packed' in opts)
	elif model == 'doubledop':
		grammar, backtransform, altweights, _ = doubledop(trees, sents,
				numproc=int(opts.get('--numproc', 1)),
				binarized='--bitpar' not in opts)
	elif model == 'ptsg':
		grammar, backtransform, altweights = compiletsg(fragments,
				binarized='--bitpar' not in opts)
	elif model == 'param':
		from discodop.runexp import dobinarization, getgrammars
		getgrammars(dobinarization(trees, sents, prm.binarization,
				prm.relationalrealizational),
				sents, prm.stages, prm.testcorpus.maxwords, resultdir,
				prm.numproc, lexmodel, simplelexsmooth, trees[0].label)
		# store a copy of the parameter file, prefixed with the root label
		with open(args[1]) as paramfile:
			paramtext = paramfile.read()
		with open(os.path.join(resultdir, 'params.prm'), "w") as prmcopy:
			prmcopy.write("top='%s',\n%s" % (trees[0].label, paramtext))
		return  # grammars have already been written
	if opts.get('--dopestimator', 'rfe') != 'rfe':
		# swap in the weights of the requested alternative estimator
		grammar = [(rule, w) for (rule, _), w in
				zip(grammar, altweights[opts['--dopestimator']])]

	rulesname = grammarfile + '.rules'
	lexiconname = grammarfile + '.lex'
	myopen = open
	if '--gzip' in opts:
		myopen = gzip.open
		rulesname += '.gz'
		lexiconname += '.gz'
	bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
	if model == 'ptsg':
		bitpar = not isinstance(next(iter(fragments)), tuple)
	if '--bitpar' in opts and not bitpar:
		raise ValueError('parsing with an unbinarized grammar requires '
				'a grammar in bitpar format.')

	rules, lexicon = write_lcfrs_grammar(grammar, bitpar=bitpar)
	# write output
	with myopen(rulesname, 'w') as rulesfile:
		rulesfile.write(rules)
	with codecs.getwriter('utf-8')(myopen(lexiconname, 'w')) as lexiconfile:
		lexiconfile.write(lexicon)
	if model in ('doubledop', 'ptsg'):
		backtransformfile = '%s.backtransform%s' % (grammarfile,
			'.gz' if '--gzip' in opts else '')
		with myopen(backtransformfile, 'w') as backtransformout:
			backtransformout.writelines(
					'%s\n' % a for a in backtransform)
		print('wrote backtransform to', backtransformfile)
	print('wrote grammar to %s and %s.' % (rulesname, lexiconname))
	if len(grammar) < 10000:  # this is very slow so skip with large grammars
		print(grammarinfo(grammar))
	try:
		from discodop.containers import Grammar
		print(Grammar(rules, lexicon, bitpar=bitpar,
				binarized='--bitpar' not in opts, start=opts.get('-s',
					next(iter(grammar))[0][0][0] if model == 'ptsg'
					else trees[0].label)).testgrammar()[1])
	except (ImportError, AssertionError) as err:
		print(err)
Exemplo n.º 15
0
def reattach():
    """Re-draw tree after re-attaching node under new parent.

    Reads ``sentno``, ``tree``, ``nodeid`` and ``newparent`` from the
    request arguments and performs one of three edits on the tree:

    - ``newparent == 'deletenode'``: remove the node ``nodeid`` by
      replacing it with its children;
    - ``nodeid`` starts with ``'newlabel_'``: insert a new node, labelled
      with the remainder of ``nodeid``, between ``newparent`` and its
      children;
    - otherwise: move the node ``nodeid`` to become the last child of
      ``newparent``.

    Returns the ``ValueError`` message as plain text when ``validate()``
    rejects the submitted tree. Otherwise returns markup consisting of an
    accept link, an error message (empty on success), the drawn tree and
    the tree string. The REATTACH action counter in the session is only
    incremented when no error occurred.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    dt = DrawTree(tree, senttok)
    error = ''
    if request.args.get('newparent') == 'deletenode':
        # remove nodeid by replacing it with its children
        # presumably ids look like 't<treeid>_<nodeid>' -- verify against
        # the client code
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        x = dt.nodes[nodeid]
        if nodeid == 0 or isinstance(x[0], int):
            error = 'ERROR: cannot remove ROOT or POS node'
        else:
            children = list(x)
            # detach the children from x before splicing them into the
            # parent of x
            x[:] = []
            # locate the parent of x by identity
            for y in dt.nodes[0].subtrees():
                if any(child is x for child in y):
                    i = y.index(x)
                    y[i:i + 1] = children
                    tree = canonicalize(dt.nodes[0])
                    dt = DrawTree(tree, senttok)  # kludge..
                    break
    elif request.args.get('nodeid', '').startswith('newlabel_'):
        # splice in a new node under parentid
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        # everything after the first underscore is the label of the new node
        label = request.args.get('nodeid').split('_', 1)[1]
        y = dt.nodes[newparent]
        if isinstance(y[0], int):
            error = 'ERROR: cannot add node under POS tag'
        else:
            children = list(y)
            # empty y first, then make the new node its only child
            y[:] = []
            y[:] = [Tree(label, children)]
            tree = canonicalize(dt.nodes[0])
            dt = DrawTree(tree, senttok)  # kludge..
    else:  # re-attach existing node at existing new parent
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        # remove node from old parent
        # dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
        x = dt.nodes[nodeid]
        y = dt.nodes[newparent]
        # refuse to attach x below itself or below one of its descendants
        for node in x.subtrees():
            if node is y:
                error = ('ERROR: cannot re-attach subtree'
                         ' under (descendant of) itself\n')
                break
        else:
            # only reached when the loop above found no conflict;
            # locate the current parent of x by identity
            for node in dt.nodes[0].subtrees():
                if any(child is x for child in node):
                    if len(node) > 1:
                        node.remove(x)
                        dt.nodes[newparent].append(x)
                        tree = canonicalize(dt.nodes[0])
                        dt = DrawTree(tree, senttok)  # kludge..
                    else:
                        error = ('ERROR: re-attaching only child creates'
                                 ' empty node %s; remove manually\n' % node)
                    break
    # on error, tree and dt still refer to the tree as it was submitted
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    link = ('<a href="/annotate/accept?%s">accept this tree</a>' %
            urlencode(dict(sentno=sentno, tree=treestr)))
    if error == '':
        session['actions'][REATTACH] += 1
        # the list is mutated in place, so the session has to be marked
        # as modified explicitly
        session.modified = True
    return Markup('%s\n\n%s%s\t%s' % (link, error,
                                      dt.text(unicodelines=True,
                                              html=True,
                                              funcsep='-',
                                              morphsep='/',
                                              nodeprops='t0'), treestr))
Exemplo n.º 16
0
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor,
		tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead,
		fanout_marks_before_bin, testmaxwords, resultdir, numproc,
		lexmodel, simplelexsmooth, top, relationalrealizational):
	""" Apply binarization and read off the requested grammars.

	The treebank is binarized once; then, for every stage, a grammar is
	induced, written to ``resultdir``, and attached to the stage object.

	:param trees: the training trees; rebound to binarized, frozen trees.
	:param sents: the sentences belonging to ``trees``.
	:param stages: sequence of stage objects; each one is updated in place
		through ``stage.update(grammar=..., backtransform=..., outside=...)``.
	:param bintype: ``'binarize'``, ``'optimal'`` or ``'optimalhead'``; for
		any other value no binarization function is applied.
	:param horzmarkov, vertmarkov, factor, tailmarker, revmarkov,
		leftmostunary, rightmostunary, pospa, markhead: forwarded to
		``binarize()`` (``horzmarkov`` / ``vertmarkov`` also to
		``optimalbinarize()`` when ``bintype == 'optimalhead'``).
	:param fanout_marks_before_bin: when true, fan-out markers are added
		before binarization as well as after it.
	:param testmaxwords: forwarded to the outside-estimate functions.
	:param resultdir: directory in which grammar files are written.
	:param numproc: forwarded to ``getfragments()``.
	:param lexmodel, simplelexsmooth: when ``lexmodel`` is set, the lexicon
		is smoothed with ``simplesmoothlexicon()`` (if ``simplelexsmooth``)
		or ``smoothlexicon()`` (otherwise).
	:param top: start symbol handed to ``Grammar``.
	:param relationalrealizational: when truthy, its ``'ignorefunctions'``
		and ``'adjunctionlabel'`` entries are passed to ``binarize()`` as
		``filterfuncs``.
	:returns: None; the results are stored on the stage objects and
		written to disk. """
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	# NOTE(review): time.clock() was removed in Python 3.8; confirm which
	# interpreter versions this module is expected to support.
	begin = time.clock()
	if fanout_marks_before_bin:
		trees = [addfanoutmarkers(t) for t in trees]
	if bintype == 'binarize':
		# extend bintype with the settings; it is only used for logging below.
		bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
			'tailmarker' if tailmarker else '')
		# the return value of binarize() is not used here
		# (presumably it modifies the tree in place -- TODO confirm).
		for a in trees:
			binarize(a, factor=factor, tailmarker=tailmarker,
					horzmarkov=horzmarkov, vertmarkov=vertmarkov,
					leftmostunary=leftmostunary, rightmostunary=rightmostunary,
					reverse=revmarkov, pospa=pospa,
					headidx=-1 if markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else ())
	elif bintype == 'optimal':
		trees = [Tree.convert(optimalbinarize(tree))
						for n, tree in enumerate(trees)]
	elif bintype == 'optimalhead':
		trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
				h=horzmarkov, v=vertmarkov)) for n, tree in enumerate(trees)]
	# fan-out markers are (re-)applied after binarization in every case.
	trees = [addfanoutmarkers(t) for t in trees]
	logging.info('binarized %s cpu time elapsed: %gs',
						bintype, time.clock() - begin)
	logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
	trees = [canonicalize(a).freeze() for a in trees]

	# induce one grammar per stage; n is the stage index, used below to
	# refer to the previous stage (stages[n - 1]) when pruning.
	for n, stage in enumerate(stages):
		if stage.split:
			# split discontinuous nodes and binarize the result again.
			traintrees = [binarize(splitdiscnodes(Tree.convert(a),
					stage.markorigin), childchar=':').freeze() for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			# a 'pcfg' mode needs fan-out 1, either natively or by splitting.
			assert tbfanout == 1 or stage.split
		backtransform = None
		if stage.dop:
			if stage.usedoubledop:
				# find recurring fragments in treebank,
				# as well as depth 1 'cover' fragments
				fragments = getfragments(traintrees, sents, numproc,
						iterate=stage.iterate, complement=stage.complement)
				xgrammar, backtransform, altweights = doubledop(
						traintrees, fragments)
			else:  # DOP reduction
				xgrammar, altweights = dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph)
			# total number of subtrees; only reported in the log message.
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
				# keep each alternative weight vector aligned with the
				# extended rule list.
				for weights in altweights.values():
					weights.extend(w for _, w in newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			msg = grammarinfo(xgrammar)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			# register the alternative weights under their names.
			for name in altweights:
				grammar.register(u'%s' % name, altweights[name])
			# write <resultdir>/<stage.name>.rules.gz and .lex.gz
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(grammar.toid))
			logging.info(msg)
			if stage.estimator != 'dop1':
				grammar.switch(u'%s' % stage.estimator)
			# result is assigned but not used any further in this function.
			_sumsto1 = grammar.testgrammar()
			if stage.usedoubledop:
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					# map labels onto the grammar of the previous stage.
					msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = grammar.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and not stages[n - 1].usedoubledop
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
				logging.info(msg)
			# write prob models
			np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(grammar.modelnames, grammar.models)})
		else:  # not stage.dop
			xgrammar = treebankgrammar(traintrees, sents)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			# grammarinfo() is only asked to dump to pcdist.txt when this
			# stage is not split and the file does not exist yet.
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammarinfo(xgrammar))
			else:
				logging.info(grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			# write <resultdir>/<stage.name>.rules.gz and .lex.gz
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			# result is assigned but not used any further in this function.
			_sumsto1 = grammar.testgrammar()
			if n and stage.prune:
				# map labels onto the grammar of the previous stage.
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir,
				stage.name, ',backtransform' if stage.usedoubledop else '')

		# outside estimates: computed and saved, or loaded from an earlier
		# run. The .npz paths are relative, i.e. not placed under resultdir.
		outside = None
		if stage.getestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			logging.info('computing PCFG estimates')
			begin = time.clock()
			outside = getpcfgestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('pcfgoutside.npz', outside=outside)
			logging.info('saved PCFG estimates')
		elif stage.useestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			assert stage.mode != 'pcfg', (
					'estimates require agenda-based parser.')
			outside = np.load('pcfgoutside.npz')['outside']
			logging.info('loaded PCFG estimates')
		# this is a separate if-chain: an 'SXlrgaps' setting replaces any
		# value of outside assigned by the 'SX' chain above.
		if stage.getestimates == 'SXlrgaps':
			logging.info('computing PLCFRS estimates')
			begin = time.clock()
			outside = getestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
						time.clock() - begin)
			np.savez('outside.npz', outside=outside)
			logging.info('saved estimates')
		elif stage.useestimates == 'SXlrgaps':
			outside = np.load('outside.npz')['outside']
			logging.info('loaded PLCFRS estimates')
		# store the results of this stage on the stage object itself.
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
Exemplo n.º 17
0
def main():
	""" Command line interface to create grammars from treebanks.

	Expects three positional arguments on the command line (model, treebank
	file, grammar file) plus the options ``--inputfmt``, ``--inputenc``,
	``--dopestimator``, ``--numproc`` and the flags ``--gzip``, ``--packed``.
	The model is one of 'pcfg', 'plcfrs', 'dopreduction', 'doubledop'.

	Writes ``<grammarfile>.rules`` and ``<grammarfile>.lex`` (with a ``.gz``
	suffix when ``--gzip`` is given), and for the 'doubledop' model also
	``<grammarfile>.backtransform``.

	Exits with status 2 when the command line cannot be parsed; raises
	AssertionError for an unrecognized model or estimator, or when an
	estimator other than 'dop1' is requested for a non-DOP model. """
	import gzip
	from getopt import gnu_getopt, GetoptError
	from discodop.treetransforms import addfanoutmarkers, canonicalize
	from discodop.treebank import getreader, splitpath
	from discodop.fragments import getfragments
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	shortoptions = ''
	flags = ('gzip', 'packed')
	options = ('inputfmt=', 'inputenc=', 'dopestimator=', 'numproc=')
	try:
		opts, args = gnu_getopt(sys.argv[1:], shortoptions, flags + options)
		model, treebankfile, grammarfile = args
	except (GetoptError, ValueError) as err:
		print('error: %r\n%s' % (err, USAGE))
		sys.exit(2)
	opts = dict(opts)
	# getopt reports long options including the leading dashes, so the
	# estimator has to be looked up as '--dopestimator'.
	estimator = opts.get('--dopestimator', 'dop1')
	assert model in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop'), (
		'unrecognized model: %r' % model)
	assert estimator in ('dop1', 'ewe', 'shortest'), (
		'unrecognized estimator: %r' % estimator)

	# read treebank
	reader = getreader(opts.get('--inputfmt', 'export'))
	corpus = reader(*splitpath(treebankfile),
			encoding=opts.get('--inputenc', 'utf8'))
	trees = list(corpus.parsed_sents().values())
	sents = list(corpus.sents().values())
	for a in trees:
		canonicalize(a)
		addfanoutmarkers(a)

	# read off grammar
	# only the DOP models produce alternative weights and a backtransform.
	altweights = backtransform = None
	if model in ('pcfg', 'plcfrs'):
		grammar = treebankgrammar(trees, sents)
	elif model == 'dopreduction':
		grammar, altweights = dopreduction(trees, sents,
				packedgraph='--packed' in opts)
	elif model == 'doubledop':
		numproc = int(opts.get('--numproc', 1))
		fragments = getfragments(trees, sents, numproc)
		grammar, backtransform, altweights = doubledop(trees, fragments)
	if estimator != 'dop1':
		# replace the default weights with those of the requested estimator.
		assert altweights is not None, (
			'estimator %r requires a DOP model, not %r' % (estimator, model))
		grammar = [(rule, w) for (rule, _), w in
				zip(grammar, altweights[estimator])]

	print(grammarinfo(grammar))
	# output filenames get their own names, so that the grammar contents
	# returned by write_lcfrs_grammar() below do not overwrite them.
	rulesfilename = grammarfile + '.rules'
	lexiconfilename = grammarfile + '.lex'
	if '--gzip' in opts:
		myopen = gzip.open
		rulesfilename += '.gz'
		lexiconfilename += '.gz'
	else:
		myopen = open
	bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket'
	rules, lexicon = write_lcfrs_grammar(grammar, bitpar=bitpar)
	# sanity check of the grammar; skipped when the compiled Grammar
	# container cannot be imported.
	try:
		from discodop.containers import Grammar
	except ImportError:
		pass
	else:
		cgrammar = Grammar(rules, lexicon)
		cgrammar.testgrammar()
	# write output
	# files are opened in binary mode (text goes through a codecs writer),
	# the same way getgrammars() writes its grammar files.
	with myopen(rulesfilename, 'wb') as rulesfile:
		rulesfile.write(rules)
	with codecs.getwriter('utf-8')(
			myopen(lexiconfilename, 'wb')) as lexiconfile:
		lexiconfile.write(lexicon)
	if model == 'doubledop':
		backtransformfile = '%s.backtransform%s' % (grammarfile,
			'.gz' if '--gzip' in opts else '')
		# the with-statement makes sure the file is flushed and closed.
		with codecs.getwriter('ascii')(
				myopen(backtransformfile, 'wb')) as out:
			out.writelines('%s\n' % a for a in backtransform)
		print('wrote backtransform to', backtransformfile)
	print('wrote grammar to %s and %s.' % (rulesfilename, lexiconfilename))