Example #1
    def test_mergediscnodes(self):
        tree = Tree.parse(
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
            '(VVPP 5)) (VAINF 6)) (VMFIN 3))',
            parse_leaf=int)
        assert str(mergediscnodes(splitdiscnodes(tree))) == (
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
            '(VAINF 6)) (VMFIN 3))')

        assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
            '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
            '(VAINF 6)) (VMFIN 3))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(mergediscnodes(splitdiscnodes(
            tree, markorigin=True))) == ('(S (X (A 0) (A 2)) (X (A 1) (A 3)))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(splitdiscnodes(tree, markorigin=True)) == (
            '(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')

        tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))',
                          parse_leaf=int)
        assert str(mergediscnodes(
            splitdiscnodes(tree))) == ('(S (X (A 0) (A 1) (A 2) (A 3)))')
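
The round trip exercised above uses splitdiscnodes and mergediscnodes (presumably from discodop.treetransforms, as in the imports of the other excerpts; the import is not shown here). With markorigin=True each split part carries an index recording which block of the original discontinuous constituent it came from, so the merge is exact; without it, the parts of the two X constituents collapse into a single X, as the last assertion shows. A minimal sketch of the same round trip outside the test harness, assuming Tree comes from discodop.tree:

from discodop.tree import Tree
from discodop.treetransforms import splitdiscnodes, mergediscnodes

# Two discontinuous X constituents: one covers leaves {0, 2}, the other {1, 3}.
tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
split = splitdiscnodes(tree.copy(True), markorigin=True)
print(split)                  # (S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))
print(mergediscnodes(split))  # the original discontinuous constituents are restored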
Example #2
def brackettree(treestr, sent, brackets, strtermre):
	""" Parse a single tree presented in bracket format, whether with indices
	or not; sent may be None / empty. """
	if strtermre.search(treestr):  # terminals are not all indices
		treestr = FRONTIERNTRE.sub(' ...)', treestr)
		sent = TERMINALSRE.findall(treestr)
		cnt = count()
		tree = Tree.parse(treestr, brackets=brackets,
				parse_leaf=lambda x: next(cnt))
	else:  # disc. trees with integer indices as terminals
		tree = Tree.parse(treestr, parse_leaf=int,
			brackets=brackets)
		sent = (sent.split() if sent and sent.strip()
				else map(str, range(max(tree.leaves()) + 1)))
	return tree, sent
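
brackettree relies on module-level helpers that are not part of this excerpt: count from itertools plus the FRONTIERNTRE and TERMINALSRE regexes (frontier non-terminals first receive a dummy '...' terminal, then the word terminals are collected as the sentence). The two branches can nevertheless be mimicked directly with the calls they make; a hedged sketch, assuming Tree comes from discodop.tree:

from itertools import count
from discodop.tree import Tree

# Branch 1: terminals are words; each word is replaced by a running index,
# and the words themselves become the sentence.
cnt = count()
tree = Tree.parse('(S (NP (DT the) (NN cat)) (VP (VBD sat)))',
		parse_leaf=lambda x: next(cnt))
sent = ['the', 'cat', 'sat']

# Branch 2: terminals are already indices; the sentence is passed separately,
# or dummy tokens are generated when it is empty.
tree = Tree.parse('(S (NP 0 1) (VP 2))', parse_leaf=int)
sent = 'the cat sat'.split()
# without a sentence: sent = [str(a) for a in range(max(tree.leaves()) + 1)]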
Example #3
	def __init__(self, tree, sent=None, highlight=(), abbr=False):
		self.tree = tree
		self.sent = sent
		if isinstance(tree, basestring):
			self.tree = Tree.parse(tree,
					parse_leaf=None if sent is None else int)
		if sent is None:
			leaves = self.tree.leaves()
			if (leaves and not any(len(a) == 0 for a in self.tree.subtrees())
					and all(isinstance(a, int) for a in leaves)):
				self.sent = [str(a) for a in leaves]
			else:
				# this deals with empty nodes (frontier non-terminals)
				# and multiple/mixed terminals under non-terminals.
				self.tree = self.tree.copy(True)
				self.sent = []
				for a in self.tree.subtrees():
					if len(a) == 0:
						a.append(len(self.sent))
						self.sent.append(None)
					elif any(not isinstance(b, Tree) for b in a):
						for n, b in enumerate(a):
							if not isinstance(b, Tree):
								a[n] = len(self.sent)
								self.sent.append('%s' % b)
		if abbr:
			if self.tree is tree:
				self.tree = self.tree.copy(True)
			for n in self.tree.subtrees(lambda x: len(x.label) > 5):
				n.label = n.label[:4] + u'\u2026'  # unicode '...' ellipsis
		self.highlight = set()
		self.nodes, self.coords, self.edges = self.nodecoords(
				self.tree, self.sent, highlight)
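
This constructor accepts either a Tree object or a bracketed string, reconstructs a sentence from integer leaves when none is given, and patches frontier non-terminals and word terminals so that drawing cannot fail. A hedged usage sketch; the class is used as DrawTree elsewhere in these excerpts, and the discodop.treedraw module path is an assumption:

from discodop.tree import Tree
from discodop.treedraw import DrawTree   # module path assumed

tree = Tree.parse('(S (NP (DT 0) (NN 1)) (VP (VBD 2)))', parse_leaf=int)
# Integer leaves plus an explicit sentence:
print(DrawTree(tree, 'the cat sat'.split()).text(unicodelines=True))
# A bracketed string with word terminals and no sentence; abbr=True
# truncates labels longer than five characters.
print(DrawTree('(NP-SUBJECT (DT the) (NN cat))', abbr=True).text(unicodelines=True))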
Example #4
def test():
	""" Simple demonstration. """
	a = Tree.parse("(f (d (a 0) (c (b 1))) (e 2))", parse_leaf=int)
	b = Tree.parse("(f (c (d (a 0) (b 1)) (e 2)))", parse_leaf=int)
	result1 = treedist(a, b, debug=True)
	assert result1 == 2
	print('%s\n%s\ndistance: %d' % (a, b, result1))
	result2 = newtreedist(a, b, debug=True)
	assert result2 == 2
	print('%s\n%s\ndistance: %d' % (a, b, result2))
	a = Tree.parse("(f (d (x (a 0)) (b 1) (c 2)) (z 3))", parse_leaf=int)
	b = Tree.parse("(f (c (d (a 0) (x (b 1)) (c 2)) (z 3)))", parse_leaf=int)
	result1 = treedist(a, b, debug=True)
	assert result1 == 3
	print('%s\n%s\ndistance: %d' % (a, b, result1))
	result2 = newtreedist(a, b, debug=True)
	assert result2 == 3
	print('%s\n%s\ndistance: %d' % (a, b, result2))
Example #5
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, _backtransform, _, _ = doubledop(trees,
                                               sents,
                                               debug=False,
                                               numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(None,
                       striplabelre=None,
                       neverblockre=re.compile('^#[0-9]+|.+}<'),
                       splitprune=False,
                       markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print('sentence:',
                  ' '.join(a.encode('unicode-escape').decode() for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
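
The extraction step itself can be exercised on a single toy tree with the calls used above; a minimal sketch (discodop.tree as the home of Tree is an assumption, the other import paths are those in the function):

from discodop.tree import Tree
from discodop.grammar import treebankgrammar
from discodop.containers import Grammar

tree = Tree.parse('(S (NP 0) (VP 1))', parse_leaf=int)
sent = ['Mary', 'walks']
# treebankgrammar() reads one weighted rule off each constituent;
# Grammar() compiles the rules for the parser.
grammar = Grammar(treebankgrammar([tree], [sent]), start=tree.label)
print(grammar)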
Example #6
	def test_mergediscnodes(self):
		tree = Tree.parse('(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
				'(VVPP 5)) (VAINF 6)) (VMFIN 3))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree))) == (
				'(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
				'(VAINF 6)) (VMFIN 3))')

		assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
				'(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
				'(VAINF 6)) (VMFIN 3))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
				'(S (X (A 0) (A 2)) (X (A 1) (A 3)))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(splitdiscnodes(tree, markorigin=True)) == (
				'(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')

		tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
		assert str(mergediscnodes(splitdiscnodes(tree))) == (
				'(S (X (A 0) (A 1) (A 2) (A 3)))')
Example #7
	def noparse(self, stage, sent, tags, lastsuccessfulparse):
		"""Return parse from previous stage or a dummy parse."""
		# use successful parse from earlier stage if available
		if lastsuccessfulparse is not None:
			parsetree = lastsuccessfulparse.copy(True)
		else:  # Produce a dummy parse for evaluation purposes.
			default = defaultparse([(n, t) for n, t
					in enumerate(tags or (len(sent) * ['NONE']))])
			parsetree = Tree.parse('(%s %s)' % (stage.grammar.start,
					default), parse_leaf=int)
		noparse = True
		prob = 1.0
		return parsetree, prob, noparse
Example #8
	def postprocess(self, treestr, stage=-1, derivs=None):
		""" Take parse tree and apply postprocessing. """
		parsetree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			mergediscnodes(unbinarize(parsetree, childchar=':'))
		saveheads(parsetree, self.tailmarker)
		unbinarize(parsetree)
		removefanoutmarkers(parsetree)
		if self.relationalrealizational:
			parsetree = rrbacktransform(parsetree,
					self.relationalrealizational['adjunctionlabel'])
		if self.transformations:
			reversetransform(parsetree, self.transformations)
		fragments = derivs.get(treestr) if derivs else None
		return parsetree, fragments, False
Example #9
	def postprocess(self, treestr, stage=-1):
		"""Take parse tree and apply postprocessing."""
		parsetree = Tree.parse(treestr, parse_leaf=int)
		if self.stages[stage].split:
			mergediscnodes(unbinarize(parsetree, childchar=':',
					expandunary=False))
		saveheads(parsetree, self.binarization.tailmarker)
		unbinarize(parsetree, expandunary=False)
		removefanoutmarkers(parsetree)
		if self.relationalrealizational:
			parsetree = rrbacktransform(parsetree,
					self.relationalrealizational['adjunctionlabel'])
		if self.transformations:
			reversetransform(parsetree, self.transformations)
		return parsetree, False
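
Both postprocess variants undo the training-time transforms in reverse order: merge the split discontinuous nodes, restore head marking, unbinarize, and strip the fan-out markers. A hedged sketch of the binarize/unbinarize round trip this depends on, with the treetransforms imports used in the final excerpt below (discodop.tree for Tree is an assumption):

from discodop.tree import Tree
from discodop.treetransforms import (binarize, unbinarize,
		addfanoutmarkers, removefanoutmarkers)

tree = Tree.parse('(S (NP 0) (VP (VB 1) (NP 2) (PP 3)))', parse_leaf=int)
binarized = addfanoutmarkers(binarize(tree.copy(True), horzmarkov=1))
restored = removefanoutmarkers(unbinarize(binarized))
print(restored)    # expected to match the original tree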
Example #10
	def trees(self, query, subset=None, maxresults=10,
			nofunc=False, nomorph=False):
		subset = subset or self.files
		# %s the sentence number
		# %w complete tree in bracket notation
		# %h the matched subtree in bracket notation
		fmt = r'%s:::%w:::%h\n'
		result = []
		jobs = {}
		for filename in subset:
			try:
				x, maxresults2 = self.cache['trees', query, filename,
						nofunc, nomorph]
			except KeyError:
				maxresults2 = 0
			if not maxresults or maxresults > maxresults2:
				jobs[self._submit(lambda x: list(self._query(
						query, x, fmt, maxresults)), filename)] = filename
			else:
				result.extend(x[:maxresults])
		for future in self._as_completed(jobs):
			filename = jobs[future]
			x = []
			for sentno, line in future.result():
				treestr, match = line.split(':::')
				treestr = filterlabels(treestr, nofunc, nomorph)
				treestr = treestr.replace(" )", " -NONE-)")
				cnt = count()
				if match.startswith('('):
					treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
							match.split(None, 1)), 1)
				else:
					match = ' %s)' % match
					treestr = treestr.replace(match, '_HIGH%s' % match)
				tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
				sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
				high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
				if high:
					high = high.pop()
					high.label = high.label.rsplit("_", 1)[0]
					high = list(high.subtrees()) + high.leaves()
				x.append((filename, sentno, tree, sent, high))
			self.cache['trees', query, filename,
					nofunc, nomorph] = x, maxresults
			result.extend(x)
		return result
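
The _HIGH suffix is a plain string trick: the label of a matched constituent (or the POS tag above a matched terminal) is suffixed with _HIGH before re-parsing, so the node to highlight can be found again by label. A self-contained illustration of the two rewrite branches:

treestr = '(S (NP (DT the) (NN cat)) (VP (VBD sat)))'

# A matched constituent: '(NP ...' becomes '(NP_HIGH ...'.
match = '(NP (DT the) (NN cat))'
label, rest = match.split(None, 1)
print(treestr.replace(match, '%s_HIGH %s' % (label, rest), 1))
# (S (NP_HIGH (DT the) (NN cat)) (VP (VBD sat)))

# A matched terminal: ' sat)' becomes '_HIGH sat)', marking its POS tag.
match = ' %s)' % 'sat'
print(treestr.replace(match, '_HIGH%s' % match))
# (S (NP (DT the) (NN cat)) (VP (VBD_HIGH sat)))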
Example #11
	def test_balancedpunctraise(self):
		tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
				' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
				' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
				' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
				' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
				' ($. 25))', parse_leaf=int)
		sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
				". . . ' , die Hayko Siemens musikalisch leitet , bietet "
				"wieder ungewoehnliche Kombinationen .".split())
		punctraise(tree, sent)
		balancedpunctraise(tree, sent)
		assert max(map(fanout, addbitsets(tree).subtrees())) == 1

		nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
				'(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
				'(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
				'(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
		assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
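
The fan-out of a constituent is the number of contiguous blocks its leaves form, so the assertions above check that once punctuation has been raised into the tree, every constituent is continuous. A hedged sketch of the same measurement on a discontinuous tree; the import path of fanout and addbitsets is not shown in this excerpt and is an assumption:

from discodop.tree import Tree
from discodop.treetransforms import addbitsets, fanout   # import path assumed

disc = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
# Each X covers two separate blocks ({0} and {2}, resp. {1} and {3}),
# so the maximum fan-out over all constituents is 2.
print(max(map(fanout, addbitsets(disc).subtrees())))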
Example #12
def parse():
	"""Parse sentence and return a textual representation of a parse tree.

	Output is either in an HTML fragment or in plain text. To be invoked by an
	AJAX call."""
	sent = request.args.get('sent', None)
	objfun = request.args.get('objfun', 'mpp')
	est = request.args.get('est', 'rfe')
	marg = request.args.get('marg', 'nbest')
	coarse = request.args.get('coarse', 'pcfg')
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	require = request.args.get('require', None)
	block = request.args.get('block', None)
	if not sent:
		return ''
	nbest = None
	if POSTAGS.match(sent):
		senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
	else:
		senttok, tags = tuple(tokenize(sent)), None
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	if require:
		require = tuple((label, tuple(indices))
				for label, indices in sorted(json.loads(require)))
	if block:
		block = tuple((label, tuple(indices))
				for label, indices in sorted(json.loads(block)))
	key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
	resp = CACHE.get(key)
	if resp is None:
		urlparams = dict(sent=sent, est=est, marg=marg, objfun=objfun,
				coarse=coarse, html=html)
		if require:
			urlparams['require'] = json.dumps(require)
		if block:
			urlparams['block'] = json.dumps(block)
		link = 'parse?' + url_encode(urlparams)
		PARSERS[lang].stages[-1].estimator = est
		PARSERS[lang].stages[-1].objective = objfun
		PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
		PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
		if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
			PARSERS[lang].stages[0].mode = (
					'pcfg' if coarse == 'pcfg-posterior' else coarse)
			if len(PARSERS[lang].stages) > 1:
				PARSERS[lang].stages[1].k = (1e-5
						if coarse == 'pcfg-posterior' else 50)
		results = list(PARSERS[lang].parse(
				senttok, tags=tags, require=require, block=block))
		if results[-1].noparse:
			parsetrees = []
			result = 'no parse!'
			nbest = dep = depsvg = ''
		else:
			if SHOWMORPH:
				replacemorph(results[-1].parsetree)
			if SHOWFUNC:
				treebank.handlefunctions('add', results[-1].parsetree, pos=True)
			tree = str(results[-1].parsetree)
			prob = results[-1].prob
			parsetrees = results[-1].parsetrees or []
			parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
			parsetrees_ = []
			APP.logger.info('[%s] %s', probstr(prob), tree)
			tree = Tree.parse(tree, parse_leaf=int)
			result = Markup(DrawTree(tree, senttok).text(
					unicodelines=True, html=html, funcsep='-'))
			for tree, prob, x in parsetrees:
				tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
				if SHOWMORPH:
					replacemorph(tree)
				if SHOWFUNC:
					treebank.handlefunctions('add', tree, pos=True)
				parsetrees_.append((tree, prob, x))
			if PARSERS[lang].headrules:
				xtree = PARSERS[lang].postprocess(
						parsetrees[0][0], senttok, -1)[0]
				dep = treebank.writedependencies(xtree, senttok, 'conll')
				depsvg = Markup(DrawDependencies.fromconll(dep).svg())
			else:
				dep = depsvg = ''
			rid = randid()
			nbest = Markup('\n\n'.join('%d. [%s] '
					'<a href=\'javascript: toggle("f%s%d"); \'>'
					'derivation</a>\n'
					'<span id=f%s%d style="display: none; margin-left: 3em; ">'
					'Fragments used in the highest ranked derivation'
					' of this parse tree:\n%s</span>\n%s' % (
						n + 1,
						probstr(prob),
						rid, n + 1,
						rid, n + 1,
						'\n\n'.join('%s\n%s' % (w,
							DrawTree(frag).text(unicodelines=True, html=html))
							for frag, w in fragments or ()  # if frag.count('(') > 1
						),
						DrawTree(tree, senttok).text(
							unicodelines=True, html=html, funcsep='-'))
					for n, (tree, prob, fragments) in enumerate(parsetrees_)))
		msg = '\n'.join(stage.msg for stage in results)
		elapsed = [stage.elapsedtime for stage in results]
		elapsed = 'CPU time elapsed: %s => %gs' % (
				' '.join('%gs' % a for a in elapsed), sum(elapsed))
		info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
				len(senttok), lang, est, objfun, marg), msg, elapsed,
				'10 most probable parse trees:',
				''.join('%d. [%s] %s' % (n + 1, probstr(prob),
						writediscbrackettree(tree, senttok))
						for n, (tree, prob, _) in enumerate(parsetrees))
				+ '\n'))
		CACHE.set(key, (sent, result, nbest, info, link, dep, depsvg),
				timeout=5000)
	else:
		(sent, result, nbest, info, link, dep, depsvg) = resp
	if html:
		return render_template('parsetree.html', sent=sent, result=result,
				nbest=nbest, info=info, link=link, dep=dep,
				depsvg=depsvg, randid=randid())
	else:
		return Response('\n'.join((nbest, info, result)),
				mimetype='text/plain')
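
This is a Flask view: its options arrive as query-string parameters (sent, est, objfun, marg, coarse, lang, require, block, html), and sent may also contain pre-tagged word/TAG tokens. A hedged sketch of driving it through the test client of the APP object referenced above; the '/parse' route name is an assumption based on the link built inside the function:

client = APP.test_client()
resp = client.get('/parse', query_string=dict(
		sent='Mary walks', est='rfe', objfun='mpp', marg='nbest',
		coarse='pcfg', lang='detect'))
print(resp.get_data(as_text=True))

# Pre-tagged input, one word/TAG token per word:
resp = client.get('/parse', query_string=dict(sent='Mary/NNP walks/VBZ'))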
Example #13
def getfragments(trees, sents, numproc=1, iterate=False, complement=False):
	""" Get recurring fragments with exact counts in a single treebank.

	:returns: a dictionary whose keys are fragments as strings, and
		frequencies / indices as values.
	:param trees:  a sequence of binarized Tree objects. """
	if numproc == 0:
		numproc = cpu_count()
	numtrees = len(trees)
	assert numtrees
	mult = 1  # 3 if numproc > 1 else 1
	fragments = {}
	trees = trees[:]
	work = workload(numtrees, mult, numproc)
	PARAMS.update(disc=True, indices=True, approx=False, complete=False,
			quadratic=False, complement=complement)
	if numproc == 1:
		initworkersimple(trees, list(sents))
		mymap = map
		myapply = APPLY
	else:
		logging.info("work division:\n%s", "\n".join("    %s: %r" % kv
			for kv in sorted(dict(numchunks=len(work),
				numproc=numproc).items())))
		# start worker processes
		pool = Pool(processes=numproc, initializer=initworkersimple,
			initargs=(trees, list(sents)))
		mymap = pool.map
		myapply = pool.apply
	# collect recurring fragments
	logging.info("extracting recurring fragments")
	for a in mymap(worker, work):
		fragments.update(a)
	# add 'cover' fragments corresponding to single productions
	cover = myapply(coverfragworker, ())
	before = len(fragments)
	fragments.update(cover)
	logging.info("merged %d unseen cover fragments", len(fragments) - before)
	fragmentkeys = list(fragments)
	bitsets = [fragments[a] for a in fragmentkeys]
	countchunk = len(bitsets) // numproc + 1
	work = list(range(0, len(bitsets), countchunk))
	work = [(n, len(work), bitsets[a:a + countchunk])
			for n, a in enumerate(work)]
	logging.info("getting exact counts for %d fragments", len(bitsets))
	counts = []
	for a in mymap(exactcountworker, work):
		counts.extend(a)
	if numproc != 1:
		pool.close()
		pool.join()
		del pool
	if iterate:  # optionally collect fragments of fragments
		logging.info("extracting fragments of recurring fragments")
		PARAMS['complement'] = False  # needs to be turned off if it was on
		newfrags = fragments
		trees, sents = None, None
		ids = count()
		for _ in range(10):  # up to 10 iterations
			newtrees = [binarize(
					introducepreterminals(Tree.parse(tree, parse_leaf=int),
					ids=ids), childchar="}") for tree, _ in newfrags]
			newsents = [["#%d" % next(ids) if word is None else word
					for word in sent] for _, sent in newfrags]
			newfrags, newcounts = iteratefragments(
					fragments, newtrees, newsents, trees, sents, numproc)
			if len(newfrags) == 0:
				break
			if trees is None:
				trees = []
				sents = []
			trees.extend(newtrees)
			sents.extend(newsents)
			fragmentkeys.extend(newfrags)
			counts.extend(newcounts)
			fragments.update(zip(newfrags, newcounts))
	logging.info("found %d fragments", len(fragmentkeys))
	return dict(zip(fragmentkeys, counts))
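
A minimal, hedged call of getfragments on two tiny binarized trees (the final excerpt below imports it from discodop.fragments; discodop.tree for Tree is an assumption). The result maps fragment strings to frequencies / indices as described in the docstring:

from discodop.tree import Tree
from discodop.treetransforms import binarize
from discodop.fragments import getfragments

trees = [Tree.parse('(S (NP 0) (VP (VB 1) (NP 2)))', parse_leaf=int)
		for _ in range(2)]
sents = [['Mary', 'sees', 'John'], ['John', 'sees', 'Mary']]
trees = [binarize(tree) for tree in trees]
fragments = getfragments(trees, sents, numproc=1)
for frag, indices in fragments.items():
	print(frag, indices)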
Example #14
def parse():
	""" Parse sentence and return a textual representation of a parse tree,
	in an HTML fragment or plain text. To be invoked by an AJAX call."""
	sent = request.args.get('sent', None)
	est = request.args.get('est', 'dop1')
	marg = request.args.get('marg', 'nbest')
	objfun = request.args.get('objfun', 'mpp')
	coarse = request.args.get('coarse', None)
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	if not sent:
		return ''
	frags = nbest = None
	senttok = tokenize(sent)
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	key = (senttok, est, marg, objfun, coarse, lang, html)
	if CACHE.get(key) is not None:
		return CACHE.get(key)
	link = url_encode(dict(sent=sent, est=est, marg=marg, objfun=objfun,
			coarse=coarse, html=html))
	PARSERS[lang].stages[-1].estimator = est
	PARSERS[lang].stages[-1].objective = objfun
	PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
	PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
	if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
		PARSERS[lang].stages[0].mode = coarse
		PARSERS[lang].stages[1].k = 1e-5 if coarse == 'pcfg-posterior' else 50

	results = list(PARSERS[lang].parse(senttok))
	if results[-1].noparse:
		parsetrees = {}
		result = 'no parse!'
		frags = nbest = ''
	else:
		if PARSERS[lang].relationalrealizational:
			treebank.handlefunctions('add', results[-1].parsetree, pos=True)
		tree = str(results[-1].parsetree)
		prob = results[-1].prob
		parsetrees = results[-1].parsetrees or {}
		parsetrees = heapq.nlargest(10, parsetrees.items(), key=itemgetter(1))
		fragments = results[-1].fragments or ()
		APP.logger.info('[%s] %s' % (probstr(prob), tree))
		tree = Tree.parse(tree, parse_leaf=int)
		result = Markup(DrawTree(tree, senttok, abbr=True).text(
				unicodelines=True, html=html))
		frags = Markup('Phrasal fragments used in the most probable derivation'
				' of the highest ranked parse tree:\n'
				+ '\n\n'.join(
				DrawTree(Tree.parse(frag, parse_leaf=int), terminals).text(
						unicodelines=True, html=html)
				for frag, terminals in fragments))
		nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob),
					DrawTree(PARSERS[lang].postprocess(tree)[0], senttok,
						abbr=True).text(unicodelines=True, html=html))
				for n, (tree, prob) in enumerate(parsetrees)))
	msg = '\n'.join(stage.msg for stage in results)
	elapsed = [stage.elapsedtime for stage in results]
	elapsed = 'CPU time elapsed: %s => %gs' % (
			' '.join('%gs' % a for a in elapsed), sum(elapsed))
	info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
			len(senttok), lang, est, objfun, marg), msg, elapsed,
			'10 most probable parse trees:',
			'\n'.join('%d. [%s] %s' % (n + 1, probstr(prob), tree)
					for n, (tree, prob) in enumerate(parsetrees)) + '\n'))
	if html:
		CACHE.set(key, render_template('parsetree.html', sent=sent,
				result=result, frags=frags, nbest=nbest, info=info, link=link,
				randid=randid()), timeout=5000)
	else:
		CACHE.set(key, Response('\n'.join((nbest, frags, info, result)),
				mimetype='text/plain'), timeout=5000)
	return CACHE.get(key)
Example #15
def test_grammar(debug=False):
	"""Demonstrate grammar extraction."""
	from discodop.grammar import treebankgrammar, dopreduction, doubledop
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from math import exp
	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	sents = list(corpus.sents().values())
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.trees().values())[:10]]
	if debug:
		print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
		print('dop reduction')
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	if debug:
		print(grammar)
	_ = grammar.testgrammar()

	grammarx, backtransform, _, _ = doubledop(trees, sents,
			debug=debug, numproc=1)
	if debug:
		print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	if debug:
		print(grammar)
	assert grammar.testgrammar()[0], "RFE should sum to 1."
	for tree, sent in zip(corpus.trees().values(), sents):
		if debug:
			print("sentence:", ' '.join(a.encode('unicode-escape').decode()
					for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		if debug:
			print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
		if chart:
			mpp, parsetrees = {}, {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
				r = Tree(recoverfragments(d.key, chart, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			if debug:
				print(len(mpp), 'parsetrees',
						sum(map(len, parsetrees.values())), 'derivations')
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				if debug:
					print(tp, t, '\nmatch:', t == str(tree))
				if len(set(parsetrees[t])) != len(parsetrees[t]):
					print('chart:\n', chart)
					assert len(set(parsetrees[t])) == len(parsetrees[t])
				if debug:
					for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
						print(' <= %6g %s' % (exp(-p), deriv))
		elif debug:
			print('no parse\n', chart)
		if debug:
			print()
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Example #16
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, backtransform, _, _ = doubledop(trees,
                                              sents,
                                              debug=False,
                                              numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(grammar,
                       striplabelre=None,
                       neverblockre=re.compile('^#[0-9]+|.+}<'),
                       splitprune=False,
                       markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print("sentence:",
                  ' '.join(a.encode('unicode-escape').decode() for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                    assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
Example #17
def parse():
    """Parse sentence and return a textual representation of a parse tree.

	Output is either in an HTML fragment or in plain text. To be invoked by an
	AJAX call."""
    sent = request.args.get('sent', None)
    objfun = request.args.get('objfun', 'mpp')
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    coarse = request.args.get('coarse', 'pcfg')
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    require = request.args.get('require', None)
    block = request.args.get('block', None)
    if not sent:
        return ''
    nbest = None
    if POSTAGS.match(sent):
        senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
    else:
        senttok, tags = tuple(tokenize(sent)), None
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    if require:
        require = tuple((label, tuple(indices))
                        for label, indices in sorted(json.loads(require)))
    if block:
        block = tuple((label, tuple(indices))
                      for label, indices in sorted(json.loads(block)))
    key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
    resp = CACHE.get(key)
    if resp is None:
        urlparams = dict(sent=sent,
                         lang=lang,
                         est=est,
                         marg=marg,
                         objfun=objfun,
                         coarse=coarse,
                         html=html)
        if require:
            urlparams['require'] = json.dumps(require)
        if block:
            urlparams['block'] = json.dumps(block)
        link = '?' + url_encode(urlparams)
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = ('pcfg' if coarse
                                            == 'pcfg-posterior' else coarse)
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior'
                                             else 50)
        results = list(PARSERS[lang].parse(senttok,
                                           tags=tags,
                                           require=require,
                                           block=block))
        if SHOWMORPH:
            replacemorph(results[-1].parsetree)
        if SHOWFUNC:
            treebank.handlefunctions('add', results[-1].parsetree, pos=True)
        tree = str(results[-1].parsetree)
        prob = results[-1].prob
        parsetrees = results[-1].parsetrees or []
        parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
        parsetrees_ = []
        LOG.info('[%s] %s', probstr(prob), tree)
        tree = Tree.parse(tree, parse_leaf=int)
        result = Markup(
            DrawTree(tree, senttok).text(unicodelines=True,
                                         html=html,
                                         funcsep='-'))
        for tree, prob, x in parsetrees:
            tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
            if SHOWMORPH:
                replacemorph(tree)
            if SHOWFUNC:
                treebank.handlefunctions('add', tree, pos=True)
            parsetrees_.append((tree, prob, x))
        if PARSERS[lang].headrules:
            xtree = PARSERS[lang].postprocess(parsetrees[0][0], senttok, -1)[0]
            dep = treebank.writedependencies(xtree, senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        else:
            dep = depsvg = ''
        rid = randid()
        nbest = Markup('\n\n'.join(
            '%d. [%s] '
            '<a href=\'javascript: toggle("f%s%d"); \'>'
            'derivation</a>\n'
            '<span id=f%s%d style="display: none; margin-left: 3em; ">'
            'Fragments used in the highest ranked derivation'
            ' of this parse tree:\n%s</span>\n%s' % (
                n + 1,
                probstr(prob),
                rid,
                n + 1,
                rid,
                n + 1,
                '\n\n'.join(
                    '%s\n%s' %
                    (w, DrawTree(frag).text(unicodelines=True, html=html))
                    for frag, w in fragments or ()  # if frag.count('(') > 1
                ),
                DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
            for n, (tree, prob, fragments) in enumerate(parsetrees_)))
        deriv = Markup(
            'Fragments used in the highest ranked derivation'
            ' of best parse tree:\n%s' % (
                '\n\n'.join(
                    '%s\n%s' %
                    (w, DrawTree(frag).text(unicodelines=True, html=html))
                    for frag, w in parsetrees_[0][2] or ()
                    # if frag.count('(') > 1
                ))) if parsetrees_ else ''
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join((
            'length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
            (len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            ''.join('%d. [%s] %s' %
                    (n + 1, probstr(prob), writediscbrackettree(tree, senttok))
                    for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, nbest, deriv, info, link, dep, depsvg),
                  timeout=5000)
    else:
        (sent, result, nbest, deriv, info, link, dep, depsvg) = resp
    if html:
        return render_template('parsetree.html',
                               sent=sent,
                               result=result,
                               nbest=nbest,
                               deriv=deriv,
                               info=info,
                               link=link,
                               dep=dep,
                               depsvg=depsvg,
                               randid=randid())
    else:
        return Response('\n'.join((nbest, info, result)),
                        mimetype='text/plain')
Example #18
def parse():
	"""Parse sentence and return a textual representation of a parse tree.

	Output is either in an HTML fragment or in plain text. To be invoked by an
	AJAX call."""
	sent = request.args.get('sent', None)
	est = request.args.get('est', 'rfe')
	marg = request.args.get('marg', 'nbest')
	objfun = request.args.get('objfun', 'mpp')
	coarse = request.args.get('coarse', None)
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	if not sent:
		return ''
	frags = nbest = None
	senttok = tokenize(sent)
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	key = (senttok, est, marg, objfun, coarse, lang)
	resp = CACHE.get(key)
	if resp is None:
		link = 'parse?' + url_encode(dict(sent=sent, est=est, marg=marg,
				objfun=objfun, coarse=coarse, html=html))
		PARSERS[lang].stages[-1].estimator = est
		PARSERS[lang].stages[-1].objective = objfun
		PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
		PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
		if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
			PARSERS[lang].stages[0].mode = coarse
			PARSERS[lang].stages[1].k = (1e-5
					if coarse == 'pcfg-posterior' else 50)

		results = list(PARSERS[lang].parse(senttok))
		if results[-1].noparse:
			parsetrees = []
			result = 'no parse!'
			frags = nbest = ''
		else:
			if SHOWMORPH:
				for node in results[-1].parsetree.subtrees(
						lambda n: n and not isinstance(n[0], Tree)):
					treebank.handlemorphology(
							'replace', None, node, node.source)
					node.label = node.label.replace('[]', '')
			if SHOWFUNC:
				treebank.handlefunctions('add', results[-1].parsetree, pos=True)
			tree = str(results[-1].parsetree)
			prob = results[-1].prob
			parsetrees = results[-1].parsetrees or []
			parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
			parsetrees_ = []
			fragments = results[-1].fragments or ()
			APP.logger.info('[%s] %s', probstr(prob), tree)
			tree = Tree.parse(tree, parse_leaf=int)
			result = Markup(DrawTree(tree, senttok).text(
					unicodelines=True, html=html, funcsep='-'))
			frags = Markup('Phrasal fragments used in the most probable '
					'derivation of the highest ranked parse tree:\n'
					+ '\n\n'.join(
					DrawTree(frag).text(unicodelines=True, html=html)
					for frag in fragments if frag.count('(') > 1))
			for tree, prob, x in parsetrees:
				tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
				if SHOWMORPH:
					for node in tree.subtrees(
							lambda n: n and not isinstance(n[0], Tree)):
						treebank.handlemorphology(
								'replace', None, node, node.source)
				if SHOWFUNC:
					treebank.handlefunctions('add', tree, pos=True)
				parsetrees_.append((tree, prob, x))
			nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob),
						DrawTree(tree, senttok).text(
							unicodelines=True, html=html, funcsep='-'))
					for n, (tree, prob, _) in enumerate(parsetrees_)))
		msg = '\n'.join(stage.msg for stage in results)
		elapsed = [stage.elapsedtime for stage in results]
		elapsed = 'CPU time elapsed: %s => %gs' % (
				' '.join('%gs' % a for a in elapsed), sum(elapsed))
		info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
				len(senttok), lang, est, objfun, marg), msg, elapsed,
				'10 most probable parse trees:',
				'\n'.join('%d. [%s] %s' % (n + 1, probstr(prob),
						writediscbrackettree(tree, senttok))
						for n, (tree, prob, _) in enumerate(parsetrees))
				+ '\n'))
		CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000)
	else:
		(sent, result, frags, nbest,  # pylint: disable=unpacking-non-sequence
				info, link) = resp  # pylint: disable=unpacking-non-sequence
	if html:
		return render_template('parsetree.html', sent=sent, result=result,
				frags=frags, nbest=nbest, info=info, link=link,
				randid=randid())
	else:
		return Response('\n'.join((nbest, frags, info, result)),
				mimetype='text/plain')
Example #19
def trees(form):
	""" Return visualization of parse trees in search results. """
	# TODO: show context of x sentences around result, offer pagination.
	gotresults = False
	for n, (_textno, results, stderr) in enumerate(
			doqueries(form, lines=True)):
		if n == 0:
			# NB: we do not hide function or morphology tags when exporting
			url = 'trees?query=%s&texts=%s&engine=%s&export=1' % (
					quote(form['query']), form['texts'],
					form.get('engine', 'tgrep2'))
			yield ('Query: %s\n'
					'Trees (showing up to %d per text; '
					'export: <a href="%s">plain</a>, '
					'<a href="%s">with line numbers</a>):\n' % (
						stderr, TREELIMIT, url, url + '&linenos=1'))
		for m, line in enumerate(islice(results, TREELIMIT)):
			lineno, text, treestr, match = line.split(":::")
			if m == 0:
				gotresults = True
				yield ("==&gt; %s: [<a href=\"javascript: toggle('n%d'); \">"
						"toggle</a>]\n<span id=n%d>" % (text, n + 1, n + 1))
			if form.get('engine', 'tgrep2') == 'tgrep2':
				cnt = count()
				treestr = treestr.replace(" )", " -NONE-)")
				match = match.strip()
				if match.startswith('('):
					treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
							match.split(None, 1)))
				else:
					match = ' %s)' % match
					treestr = treestr.replace(match, '_HIGH%s' % match)
				tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
				sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
				high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
				if high:
					high = high.pop()
					high.label = high.label.rsplit("_", 1)[0]
					high = list(high.subtrees()) + high.leaves()
			elif form.get('engine', 'tgrep2') == 'xpath':
				tree, sent = treebank.alpinotree(
						ElementTree.fromstring(treestr))
						# morphology='replace')
				highwords = re.findall('<node[^>]*begin="([0-9]+)"[^>]*/>',
						match)
				high = set(re.findall(r'\bid="(.+?)"', match))
				high = list(tree.subtrees(lambda n:
						n.source[treebank.PARENT] in high or
						n.source[treebank.WORD].lstrip('#') in high))
				high += [int(a) for a in highwords]
			try:
				treerepr = DrawTree(tree, sent, highlight=high).text(
						unicodelines=True, html=True)
			except ValueError as err:
				line = "#%s \nERROR: %s\n%s\n%s\n" % (
						lineno, err, treestr, tree)
			else:
				line = "#%s\n%s\n" % (lineno, treerepr)
			yield line
		yield "</span>"
	if not gotresults:
		yield "No matches."
Example #20
def test():
	"""Do some tests."""
	trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
				(PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
			(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
			(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
			(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
				(adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
				(pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
				(noun 14) (noun 15))))))))) (punct 16))
			(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
				(adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
			(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
				(punct 5))
			(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
				(noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
				(det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
				(sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
				(det 17)) (noun 18)) (verb 20)))))) (punct 21))
			(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
				(pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
				(noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
				(prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
				(adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
				(ppres (adj 19)) (ppres (adj 21)))) (punct 26))
			(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
				(noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
				(comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
				(smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
				(punct 18) (punct 19))
			(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
				(adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
				(noun 7))) (verb 10))))) (punct 11))
			(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
				(noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
				(adj 8) (part 9) (verb 11))))))) (punct 12))
			(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
				(cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
				(noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
				(adj 15) (noun 16))))))))) (punct 17))
			(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
				(np (det 5) (noun 6)) (part 7))) (punct 8))
			(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
				(noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
				(verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
				(smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
				(noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
				(punct 22))
			(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
				(noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
				(prep 9) (noun 10)))) (punct 11))
			(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
				(vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
				(noun 9))))) (punct 10))
			(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
				(verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
				(num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
				(prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
				(noun 19))))))))))) (punct 20))
			(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
				(inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
				(verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
				(punct 12))
			(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
				(noun 1)) (verb 5))) (punct 6))
			(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
				(prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
				(noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
				(pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
			(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
				(noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
				(inf (ppart (np (det 8) (noun 9)))))) (punct 10))
			(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10))  (B3 (t 1) \
				(t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
			(A (B1 6 13) (B2 3 7 10)  (B3 1 \
				9 11 14 16) (B4 0 5 8))
			(VP (VB 0) (PRT 2))
			(VP (VP 0 3) (NP (PRP 1) (NN 2)))
			(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
				(ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
				(VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
	sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
				augenblicklichen Diskussion .
			is Mary happy there
			das muss man jetzt machen
			Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
				buurt van Trafalgar Square .
			Het had een prachtige dag kunnen zijn in Londen .
			Cathy zag hen wild zwaaien .
			Het was een spel geworden , zij en haar vriendinnen kozen iemand \
				uit en probeerden zijn of haar nationaliteit te raden .
			Elk jaar in het hoogseizoen trokken daar massa's toeristen \
				voorbij , hun fototoestel in de aanslag , pratend , gillend \
				en lachend in de vreemdste talen .
			Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
				komt wel goed , joch " .
			Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
				liggen .
			Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
			De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
				voortslingerde tussen de hoge grijze boomstammen .
			Haar moeder kleefde bijna tegen het autoraampje aan .
			Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
				op en begaf zich in de richting van het landhuis .
			Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
			Haar neus werd platgedrukt en leek op een jonge champignon .
			Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
				een zilveren schijnsel tussen de bomen en struiken .
			Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
				terrassen .
			Dat werkwoord had ze zelf uitgevonden .
			De middagzon hing klein tussen de takken en de schaduwen van de \
				wolken drentelden over het gras .
			Zij zou mams rug ingewreven hebben en mam de hare .
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
			Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
				offiziell zu Ende gegangen .'''
	trees = [Tree.parse(a, parse_leaf=int) for a in trees.splitlines()]
	sents = [a.split() for a in sents.splitlines()]
	sents.extend([['Wake', None, 'up'],
		[None, 'your', 'friend', None]])
	for n, (tree, sent) in enumerate(zip(trees, sents)):
		drawtree = DrawTree(tree, sent)
		print('\ntree, sent', n, tree,
				' '.join('...' if a is None else a for a in sent),
				repr(drawtree),
				sep='\n')
		try:
			print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
		except (UnicodeDecodeError, UnicodeEncodeError):
			print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
Example #21
def bitext():
	""" Bitext parsing with a synchronous CFG.
	Translation would require a special decoder (instead of normal kbest
	derivations where the whole sentence is given). """
	print("bitext parsing with a synchronous CFG")
	trees = [Tree.parse(a, parse_leaf=int) for a in """\
	(ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\
		(PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3)))
	(ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\
		(NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))""".split('\n')]
	sents = [["0"] * len(a.leaves()) for a in trees]
	for a in trees:
		treetransforms.binarize(a)
	compiled_scfg = Grammar(treebankgrammar(trees, sents))
	print("sentences:")
	for tree in trees:
		print(' '.join(w for _, w in sorted(tree.pos())))
	print("treebank:")
	for tree in trees:
		print(tree)
	print(compiled_scfg, "\n")

	print("correct translations:")
	assert parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | John aimes Mary".split())
	assert parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | la pizza manque a` John".split())

	print("incorrect translations:")
	assert not parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | Mary aimes John".split())
	assert not parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | John manque a` la pizza".split())

	# the following SCFG is taken from:
	# http://cdec-decoder.org/index.php?title=SCFG_translation
	# the grammar has been binarized and some new non-terminals had to be
	# introduced because terminals cannot appear in binary rules.
	lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah", "fand",
		"small", "little", "big", "large", "house", "shell", "a", "I",
		"saw", "found")
	another_scfg = Grammar([
			((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5),
			((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1),
			((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9),
			((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8),
			((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345),
			((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1),
			((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1),
			((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6),
			((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5),
			((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1),
			((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3),
			((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1),
			((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2),
			((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1),
			((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4),
			((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)]
			+ [((('_%s' % word, 'Epsilon'), (word, )), 1)
					for word in lexicon])
	print(another_scfg)
	sents = [
		"ich sah ein kleines Haus | I saw a small house".split(),
		"ich sah ein kleines Haus | I saw a little house".split(),
		"ich sah ein kleines Haus | I saw a small shell".split(),
		"ich sah ein kleines Haus | I saw a little shell".split()]
	for sent in sents:
		assert parse(another_scfg, sent), sent
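
The rules handed to Grammar above are LCFRS tuples of the form ((labels, yieldfunction), weight); a short reading of two of them, as comments:

# ((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2)
#   labels: S -> NP VP; weight 0.2.
#   yield function ((0, 1), (0, 1)): S has two components (the source and the
#   target side of the bitext); in each, child 0 (NP) precedes child 1 (VP).
# ((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1)
#   ROOT has a single component: S's first block, then the separator '_|',
#   then S's second block; this is how the two halves end up joined by '|'.
# Terminals occur only in lexical rules such as
# ((('_ein', 'Epsilon'), ('ein', )), 1), which pair a preterminal with one word.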
Example #22
def parse():
    """Parse sentence and return a textual representation of a parse tree.

	Output is either in an HTML fragment or in plain text. To be invoked by an
	AJAX call."""
    sent = request.args.get('sent', None)
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    objfun = request.args.get('objfun', 'mpp')
    coarse = request.args.get('coarse', None)
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    if not sent:
        return ''
    frags = nbest = None
    senttok = tokenize(sent)
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    key = (senttok, est, marg, objfun, coarse, lang)
    resp = CACHE.get(key)
    if resp is None:
        link = 'parse?' + url_encode(
            dict(sent=sent,
                 est=est,
                 marg=marg,
                 objfun=objfun,
                 coarse=coarse,
                 html=html))
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = coarse
            PARSERS[lang].stages[1].k = (1e-5
                                         if coarse == 'pcfg-posterior' else 50)

        results = list(PARSERS[lang].parse(senttok))
        if results[-1].noparse:
            parsetrees = []
            result = 'no parse!'
            frags = nbest = ''
        else:
            if SHOWMORPH:
                replacemorph(results[-1].parsetree)
            if SHOWFUNC:
                treebank.handlefunctions('add',
                                         results[-1].parsetree,
                                         pos=True)
            tree = str(results[-1].parsetree)
            prob = results[-1].prob
            parsetrees = results[-1].parsetrees or []
            parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
            parsetrees_ = []
            fragments = results[-1].fragments or ()
            APP.logger.info('[%s] %s', probstr(prob), tree)
            tree = Tree.parse(tree, parse_leaf=int)
            result = Markup(
                DrawTree(tree, senttok).text(unicodelines=True,
                                             html=html,
                                             funcsep='-'))
            frags = Markup(
                'Phrasal fragments used in the most probable '
                'derivation of the highest ranked parse tree:\n' + '\n\n'.join(
                    DrawTree(frag).text(unicodelines=True, html=html)
                    for frag in fragments if frag.count('(') > 1))
            for tree, prob, x in parsetrees:
                tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
                if SHOWMORPH:
                    replacemorph(tree)
                if SHOWFUNC:
                    treebank.handlefunctions('add', tree, pos=True)
                parsetrees_.append((tree, prob, x))
            nbest = Markup('\n\n'.join(
                '%d. [%s]\n%s' %
                (n + 1, probstr(prob), DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
                for n, (tree, prob, _) in enumerate(parsetrees_)))
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join(
            ('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
             (len(senttok), lang, est, objfun, marg), msg, elapsed,
             '10 most probable parse trees:', '\n'.join(
                 '%d. [%s] %s' %
                 (n + 1, probstr(prob), writediscbrackettree(tree, senttok))
                 for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000)
    else:
        (
            sent,
            result,
            frags,
            nbest,  # pylint: disable=unpacking-non-sequence
            info,
            link) = resp  # pylint: disable=unpacking-non-sequence
    if html:
        return render_template('parsetree.html',
                               sent=sent,
                               result=result,
                               frags=frags,
                               nbest=nbest,
                               info=info,
                               link=link,
                               randid=randid())
    else:
        return Response('\n'.join((nbest, frags, info, result)),
                        mimetype='text/plain')
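The view above is meant to be called from client-side JavaScript, but any HTTP client can exercise it. Below is a minimal sketch using the ``requests`` library; the host, port, and ``/parse`` path are assumptions (the route decorator is not shown here), while the query parameters are exactly the ones read via ``request.args`` above, and omitting ``html`` selects the plain-text response:

import requests  # for illustration only; any HTTP client works

# assumed location of a local development server running this app
resp = requests.get('http://localhost:5000/parse',
                    params=dict(sent='ich sah ein kleines Haus',
                                lang='detect', est='rfe',
                                marg='nbest', objfun='mpp'))
print(resp.text)  # plain text: n-best trees, fragments, info, best tree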
Exemplo n.º 24
def test():
	""" Run some tests. """
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import binarize, unbinarize, \
			addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from discodop.fragments import getfragments
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	filename = "alpinosample.export"
	corpus = NegraCorpusReader('.', filename, punct='move')
	sents = list(corpus.sents().values())
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.parsed_sents().values())[:10]]

	print('plcfrs')
	lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
	print(lcfrs)

	print('dop reduction')
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	print(grammar)
	grammar.testgrammar()

	fragments = getfragments(trees, sents, 1)
	debug = '--debug' in sys.argv
	grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
	print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	print(grammar)
	assert grammar.testgrammar(), "DOP1 should sum to 1."
	for tree, sent in zip(corpus.parsed_sents().values(), sents):
		print("sentence:", ' '.join(a.encode('unicode-escape').decode()
				for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		print('\n', msg, end='')
		print("\ngold ", tree)
		print("double dop", end='')
		if chart:
			mpp = {}
			parsetrees = {}
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()], derivations):
				r = Tree(recoverfragments(d.getkey(), chart,
					grammar, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			print(len(mpp), 'parsetrees', end='')
			print(sum(map(len, parsetrees.values())), 'derivations')
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				print(tp, '\n', t, end='')
				print("match:", t == str(tree))
				assert len(set(parsetrees[t])) == len(parsetrees[t])
				if not debug:
					continue
				for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
					print(' <= %6g %s' % (exp(-p), deriv))
		else:
			print("no parse")
			print(chart)
		print()
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
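The inner loop of this test computes the most probable parse (MPP) by marginalizing over derivations: each derivation contributes probability ``exp(-p)`` (``p`` being a negative log probability) to the parse tree it yields. A minimal sketch of just that marginalization step, with made-up derivations instead of discodop's chart API:

from math import exp

# made-up (parse tree, -log probability) pairs standing in for a k-best
# list of derivations; the first two derivations yield the same tree
derivations = [
		('(S (A 0) (B 1))', 1.2),
		('(S (A 0) (B 1))', 2.3),
		('(S (C 0) (B 1))', 1.9)]
mpp = {}
for tree, logprob in derivations:
	# sum the probabilities of all derivations yielding the same tree
	mpp[tree] = mpp.get(tree, 0.0) + exp(-logprob)
best, prob = max(mpp.items(), key=lambda item: item[1])
print('most probable parse: %s (p=%g)' % (best, prob))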
Exemplo n.º 25
def getfragments(trees, sents, numproc=1, disc=True,
		iterate=False, complement=False, indices=True, cover=True):
	"""Get recurring fragments with exact counts in a single treebank.

	:returns: a dictionary with fragments as keys and indices as values.
		When ``disc`` is ``True``, keys are of the form ``(frag, sent)``,
		where ``frag`` is a unicode string and ``sent`` is a list of words
		as unicode strings; when ``disc`` is ``False``, keys are plain
		unicode strings ``frag``.
	:param trees: a sequence of binarized Tree objects.
	:param numproc: number of processes to use; pass 0 to use the number of
		detected CPUs.
	:param disc: when ``disc=True``, assume trees with discontinuous
		constituents.
	:param iterate, complement: see :func:`_fragments.extractfragments`"""
	if numproc == 0:
		numproc = cpu_count()
	numtrees = len(trees)
	if not numtrees:
		raise ValueError('no trees.')
	mult = 1  # 3 if numproc > 1 else 1
	fragments = {}
	trees = trees[:]
	work = workload(numtrees, mult, numproc)
	PARAMS.update(disc=disc, indices=indices, approx=False, complete=False,
			complement=complement, debug=False, adjacent=False, twoterms=False)
	initworkersimple(trees, list(sents), disc)
	if numproc == 1:
		mymap = map
		myapply = APPLY
	else:
		logging.info("work division:\n%s", "\n".join("    %s: %r" % kv
				for kv in sorted(dict(numchunks=len(work),
					numproc=numproc).items())))
		# start worker processes
		pool = Pool(processes=numproc, initializer=initworkersimple,
				initargs=(trees, list(sents), disc))
		mymap = pool.map
		myapply = pool.apply
	# collect recurring fragments
	logging.info("extracting recurring fragments")
	for a in mymap(worker, work):
		fragments.update(a)
	# add 'cover' fragments corresponding to single productions
	if cover:
		cover = myapply(coverfragworker, ())
		before = len(fragments)
		fragments.update(cover)
		logging.info("merged %d unseen cover fragments", len(fragments) - before)
	fragmentkeys = list(fragments)
	bitsets = [fragments[a] for a in fragmentkeys]
	countchunk = len(bitsets) // numproc + 1
	work = list(range(0, len(bitsets), countchunk))
	work = [(n, len(work), bitsets[a:a + countchunk])
			for n, a in enumerate(work)]
	logging.info("getting exact counts for %d fragments", len(bitsets))
	counts = []
	for a in mymap(exactcountworker, work):
		counts.extend(a)
	if numproc != 1:
		pool.close()
		pool.join()
		del pool
	if iterate:  # optionally collect fragments of fragments
		logging.info("extracting fragments of recurring fragments")
		PARAMS['complement'] = False  # needs to be turned off if it was on
		newfrags = fragments
		trees, sents = None, None
		ids = count()
		for _ in range(10):  # up to 10 iterations
			newtrees = [binarize(
					introducepreterminals(Tree.parse(tree, parse_leaf=int),
					ids=ids), childchar="}") for tree, _ in newfrags]
			newsents = [["#%d" % next(ids) if word is None else word
					for word in sent] for _, sent in newfrags]
			newfrags, newcounts = iteratefragments(
					fragments, newtrees, newsents, trees, sents, numproc)
			if len(newfrags) == 0:
				break
			if trees is None:
				trees = []
				sents = []
			trees.extend(newtrees)
			sents.extend(newsents)
			fragmentkeys.extend(newfrags)
			counts.extend(newcounts)
			fragments.update(zip(newfrags, newcounts))
	logging.info("found %d fragments", len(fragmentkeys))
	if not disc:
		return {a.decode('utf-8'): b for a, b in zip(fragmentkeys, counts)}
	return {(a.decode('utf-8'), b): c
			for (a, b), c in zip(fragmentkeys, counts)}
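A minimal usage sketch of ``getfragments``, mirroring the call ``getfragments(trees, sents, 1)`` in the test of the previous example; it assumes the ``alpinosample.export`` sample corpus used there is available in the working directory, and simply prints a few entries of the ``(frag, sent)``-to-indices mapping described in the docstring:

from discodop.treebank import NegraCorpusReader
from discodop.treetransforms import binarize, addfanoutmarkers
from discodop.fragments import getfragments

# read and binarize a small sample treebank, as in the previous example
corpus = NegraCorpusReader('.', 'alpinosample.export', punct='move')
sents = list(corpus.sents().values())
trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
		for a in corpus.parsed_sents().values()]
fragments = getfragments(trees, sents, numproc=1)
for (frag, sent), indices in list(fragments.items())[:5]:
	print(frag, sent, indices)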