예제 #1
0
파일: parser.py 프로젝트: arne-cl/disco-dop
def readgrammars(resultdir, stages, postagging=None, top='ROOT'):
	""" Read the grammars from a previous experiment.
	Expects a directory 'resultdir' which contains the relevant grammars and
	the parameter file 'params.prm', as produced by runexp. """
	for n, stage in enumerate(stages):
		logging.info('reading: %s', stage.name)
		rules = gzip.open('%s/%s.rules.gz' % (resultdir, stage.name))
		lexicon = codecs.getreader('utf-8')(gzip.open('%s/%s.lex.gz' % (
				resultdir, stage.name)))
		grammar = Grammar(rules.read(), lexicon.read(),
				start=top, bitpar=stage.mode.startswith('pcfg'))
		backtransform = None
		if stage.dop:
			assert stage.useestimates is None, 'not supported'
			if stage.usedoubledop:
				backtransform = gzip.open('%s/%s.backtransform.gz' % (
						resultdir, stage.name)).read().splitlines()
				if n and stage.prune:
					_ = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=re.compile(b'@.+$'),
						neverblockre=re.compile(b'^#[0-9]+|.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					_ = grammar.getmapping(None,
						neverblockre=re.compile(b'.+}<'))
			elif n and stage.prune:  # dop reduction
				_ = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
			probmodels = np.load('%s/%s.probs.npz' % (resultdir, stage.name))
			for name in probmodels.files:
				if name != 'default':
					grammar.register(unicode(name), probmodels[name])
		else:  # not stage.dop
			if n and stage.prune:
				_ = grammar.getmapping(stages[n - 1].grammar,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
		if stage.mode == 'pcfg-bitpar':
			assert grammar.maxfanout == 1
		grammar.testgrammar()
		stage.update(grammar=grammar, backtransform=backtransform, outside=None)
	if postagging and postagging['method'] == 'unknownword':
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		postagging['lexicon'] = {w for w in stages[0].grammar.lexicalbyword
				if not w.startswith(UNK)}
		postagging['sigs'] = {w for w in stages[0].grammar.lexicalbyword
				if w.startswith(UNK)}
예제 #2
0
파일: parser.py 프로젝트: tivaro/disco-dop
def readgrammars(resultdir, stages, postagging=None, top='ROOT'):
	"""Read the grammars from a previous experiment.

	Expects a directory ``resultdir`` which contains the relevant grammars and
	the parameter file ``params.prm``, as produced by ``runexp``."""
	for n, stage in enumerate(stages):
		logging.info('reading: %s', stage.name)
		rules = gzip.open('%s/%s.rules.gz' % (resultdir, stage.name)).read()
		lexicon = codecs.getreader('utf-8')(gzip.open('%s/%s.lex.gz' % (
				resultdir, stage.name)))
		grammar = Grammar(rules, lexicon.read(),
				start=top, bitpar=stage.mode.startswith('pcfg')
				or re.match(r'[-.e0-9]+\b', rules), binarized=stage.binarized)
		backtransform = outside = None
		if stage.dop:
			if stage.estimates is not None:
				raise ValueError('not supported')
			if stage.dop == 'doubledop':
				backtransform = gzip.open('%s/%s.backtransform.gz' % (
						resultdir, stage.name)).read().splitlines()
				if n and stage.prune:
					_ = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=re.compile(b'@.+$'),
						neverblockre=re.compile(b'^#[0-9]+|.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					_ = grammar.getmapping(None,
						neverblockre=re.compile(b'.+}<'))
			elif n and stage.prune:  # dop reduction
				_ = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(
							stages[n - 1].grammar, re.compile(br'@[-0-9]+\b'))
			probsfile = '%s/%s.probs.npz' % (resultdir, stage.name)
			if os.path.exists(probsfile):
				probmodels = np.load(probsfile)  # pylint: disable=no-member
				for name in probmodels.files:
					if name != 'default':
						grammar.register(unicode(name), probmodels[name])
		else:  # not stage.dop
			if n and stage.prune:
				_ = grammar.getmapping(stages[n - 1].grammar,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
			if stage.estimates in ('SX', 'SXlrgaps'):
				if stage.estimates == 'SX' and grammar.maxfanout != 1:
					raise ValueError('SX estimate requires PCFG.')
				if stage.mode != 'plcfrs':
					raise ValueError('estimates require parser w/agenda.')
				outside = np.load(  # pylint: disable=no-member
						'%s/%s.outside.npz' % (resultdir, stage.name))['outside']
				logging.info('loaded %s estimates', stage.estimates)
			elif stage.estimates:
				raise ValueError('unrecognized value; specify SX or SXlrgaps.')

		if stage.mode.startswith('pcfg-bitpar'):
			if grammar.maxfanout != 1:
				raise ValueError('bitpar requires a PCFG.')

		_sumsto1, msg = grammar.testgrammar()
		logging.info('%s: %s', stage.name, msg)
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
	if postagging and postagging.method == 'unknownword':
		postagging.unknownwordfun = UNKNOWNWORDFUNC[postagging.model]
		postagging.lexicon = {w for w in stages[0].grammar.lexicalbyword
				if not w.startswith(UNK)}
		postagging.sigs = {w for w in stages[0].grammar.lexicalbyword
				if w.startswith(UNK)}
예제 #3
0
파일: runexp.py 프로젝트: arne-cl/disco-dop
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor,
		tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead,
		fanout_marks_before_bin, testmaxwords, resultdir, numproc,
		lexmodel, simplelexsmooth, top, relationalrealizational):
	""" Apply binarization and read off the requested grammars. """
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	if fanout_marks_before_bin:
		trees = [addfanoutmarkers(t) for t in trees]
	if bintype == 'binarize':
		bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
			'tailmarker' if tailmarker else '')
		for a in trees:
			binarize(a, factor=factor, tailmarker=tailmarker,
					horzmarkov=horzmarkov, vertmarkov=vertmarkov,
					leftmostunary=leftmostunary, rightmostunary=rightmostunary,
					reverse=revmarkov, pospa=pospa,
					headidx=-1 if markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else ())
	elif bintype == 'optimal':
		trees = [Tree.convert(optimalbinarize(tree))
						for n, tree in enumerate(trees)]
	elif bintype == 'optimalhead':
		trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
				h=horzmarkov, v=vertmarkov)) for n, tree in enumerate(trees)]
	trees = [addfanoutmarkers(t) for t in trees]
	logging.info('binarized %s cpu time elapsed: %gs',
						bintype, time.clock() - begin)
	logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
	trees = [canonicalize(a).freeze() for a in trees]

	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [binarize(splitdiscnodes(Tree.convert(a),
					stage.markorigin), childchar=':').freeze() for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			assert tbfanout == 1 or stage.split
		backtransform = None
		if stage.dop:
			if stage.usedoubledop:
				# find recurring fragments in treebank,
				# as well as depth 1 'cover' fragments
				fragments = getfragments(traintrees, sents, numproc,
						iterate=stage.iterate, complement=stage.complement)
				xgrammar, backtransform, altweights = doubledop(
						traintrees, fragments)
			else:  # DOP reduction
				xgrammar, altweights = dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
				for weights in altweights.values():
					weights.extend(w for _, w in newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			msg = grammarinfo(xgrammar)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			for name in altweights:
				grammar.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(grammar.toid))
			logging.info(msg)
			if stage.estimator != 'dop1':
				grammar.switch(u'%s' % stage.estimator)
			_sumsto1 = grammar.testgrammar()
			if stage.usedoubledop:
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = grammar.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and not stages[n - 1].usedoubledop
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
				logging.info(msg)
			# write prob models
			np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(grammar.modelnames, grammar.models)})
		else:  # not stage.dop
			xgrammar = treebankgrammar(traintrees, sents)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammarinfo(xgrammar))
			else:
				logging.info(grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			_sumsto1 = grammar.testgrammar()
			if n and stage.prune:
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir,
				stage.name, ',backtransform' if stage.usedoubledop else '')

		outside = None
		if stage.getestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			logging.info('computing PCFG estimates')
			begin = time.clock()
			outside = getpcfgestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('pcfgoutside.npz', outside=outside)
			logging.info('saved PCFG estimates')
		elif stage.useestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			assert stage.mode != 'pcfg', (
					'estimates require agenda-based parser.')
			outside = np.load('pcfgoutside.npz')['outside']
			logging.info('loaded PCFG estimates')
		if stage.getestimates == 'SXlrgaps':
			logging.info('computing PLCFRS estimates')
			begin = time.clock()
			outside = getestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
						time.clock() - begin)
			np.savez('outside.npz', outside=outside)
			logging.info('saved estimates')
		elif stage.useestimates == 'SXlrgaps':
			outside = np.load('outside.npz')['outside']
			logging.info('loaded PLCFRS estimates')
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
예제 #4
0
파일: runexp.py 프로젝트: tivaro/disco-dop
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
		numproc, lexmodel, simplelexsmooth, top):
	"""Read off the requested grammars."""
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('binarized treebank fan-out: %d #%d', tbfanout, n)
	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [treetransforms.binarize(
					treetransforms.splitdiscnodes(
						Tree.convert(a), stage.markorigin),
					childchar=':', dot=True, ids=grammar.UniqueIDs()).freeze()
					for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			if tbfanout != 1 and not stage.split:
				raise ValueError('Cannot extract PCFG from treebank '
						'with discontinuities.')
		backtransform = extrarules = None
		if lexmodel and simplelexsmooth:
			extrarules = lexicon.simplesmoothlexicon(lexmodel)
		if stage.dop:
			if stage.dop == 'doubledop':
				(xgrammar, backtransform, altweights, fragments
					) = grammar.doubledop(
						traintrees, sents, binarized=stage.binarized,
						iterate=stage.iterate, complement=stage.complement,
						numproc=numproc, extrarules=extrarules)
				# dump fragments
				with codecs.getwriter('utf-8')(gzip.open('%s/%s.fragments.gz' %
						(resultdir, stage.name), 'w')) as out:
					out.writelines('%s\t%d\n' % (treebank.writetree(a, b, 0,
							'bracket' if stage.mode.startswith('pcfg')
							else 'discbracket').rstrip(), sum(c.values()))
							for (a, b), c in fragments.items())
			elif stage.dop == 'reduction':
				xgrammar, altweights = grammar.dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph,
						extrarules=extrarules)
			else:
				raise ValueError('unrecognized DOP model: %r' % stage.dop)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and not simplelexsmooth:  # FIXME: altweights?
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			msg = grammar.grammarinfo(xgrammar)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'),
					binarized=stage.binarized)
			for name in altweights:
				gram.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(gram.toid))
			logging.info(msg)
			if stage.estimator != 'rfe':
				gram.switch(u'%s' % stage.estimator)
			logging.info(gram.testgrammar()[1])
			if stage.dop == 'doubledop':
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = gram.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = gram.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and stages[n - 1].dop != 'doubledop'
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					gram.getrulemapping(
							stages[n - 1].grammar, re.compile(br'@[-0-9]+\b'))
				logging.info(msg)
			# write prob models
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(gram.modelnames, gram.models)})
		else:  # not stage.dop
			xgrammar = grammar.treebankgrammar(traintrees, sents,
					extrarules=extrarules)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammar.grammarinfo(xgrammar))
			else:
				logging.info(grammar.grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and not simplelexsmooth:
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info(gram.testgrammar()[1])
			if n and stage.prune:
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
				resultdir, stage.name,
				',backtransform' if stage.dop == 'doubledop' else '')

		outside = None
		if stage.estimates in ('SX', 'SXlrgaps'):
			if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
				raise ValueError('SX estimate requires PCFG.')
			elif stage.mode != 'plcfrs':
				raise ValueError('estimates require parser w/agenda.')
			begin = time.clock()
			logging.info('computing %s estimates', stage.estimates)
			if stage.estimates == 'SX':
				outside = estimates.getpcfgestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			elif stage.estimates == 'SXlrgaps':
				outside = estimates.getestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.outside.npz' % (resultdir, stage.name),
					outside=outside)
			logging.info('saved %s estimates', stage.estimates)
		elif stage.estimates:
			raise ValueError('unrecognized value; specify SX or SXlrgaps.')

		stage.update(grammar=gram, backtransform=backtransform,
				outside=outside)