def readgrammars(resultdir, stages, postagging=None, top='ROOT'):
	""" Read the grammars from a previous experiment.
	Expects a directory 'resultdir' which contains the relevant grammars and
	the parameter file 'params.prm', as produced by runexp. """
	for n, stage in enumerate(stages):
		logging.info('reading: %s', stage.name)
		rules = gzip.open('%s/%s.rules.gz' % (resultdir, stage.name))
		lexicon = codecs.getreader('utf-8')(gzip.open('%s/%s.lex.gz' % (
				resultdir, stage.name)))
		grammar = Grammar(rules.read(), lexicon.read(),
				start=top, bitpar=stage.mode.startswith('pcfg'))
		backtransform = None
		if stage.dop:
			assert stage.useestimates is None, 'not supported'
			if stage.usedoubledop:
				backtransform = gzip.open('%s/%s.backtransform.gz' % (
						resultdir, stage.name)).read().splitlines()
				if n and stage.prune:
					_ = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=re.compile(b'@.+$'),
						neverblockre=re.compile(b'^#[0-9]+|.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					_ = grammar.getmapping(None,
						neverblockre=re.compile(b'.+}<'))
			elif n and stage.prune:  # dop reduction
				_ = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
			if stage.mode == 'dop-rerank':
				grammar.getrulemapping(stages[n - 1].grammar)
			probmodels = np.load('%s/%s.probs.npz' % (resultdir, stage.name))
			for name in probmodels.files:
				if name != 'default':
					grammar.register(unicode(name), probmodels[name])
		else:  # not stage.dop
			if n and stage.prune:
				_ = grammar.getmapping(stages[n - 1].grammar,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
		if stage.mode == 'pcfg-bitpar':
			assert grammar.maxfanout == 1
		grammar.testgrammar()
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=None)
	if postagging and postagging['method'] == 'unknownword':
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		postagging['lexicon'] = {w for w in stages[0].grammar.lexicalbyword
				if not w.startswith(UNK)}
		postagging['sigs'] = {w for w in stages[0].grammar.lexicalbyword
				if w.startswith(UNK)}
def readgrammars(resultdir, stages, postagging=None, top='ROOT'):
	"""Read the grammars from a previous experiment.

	Expects a directory ``resultdir`` which contains the relevant grammars
	and the parameter file ``params.prm``, as produced by ``runexp``."""
	for n, stage in enumerate(stages):
		logging.info('reading: %s', stage.name)
		rules = gzip.open('%s/%s.rules.gz' % (resultdir, stage.name)).read()
		lexicon = codecs.getreader('utf-8')(gzip.open('%s/%s.lex.gz' % (
				resultdir, stage.name)))
		grammar = Grammar(rules, lexicon.read(),
				start=top, bitpar=stage.mode.startswith('pcfg')
				or re.match(r'[-.e0-9]+\b', rules),
				binarized=stage.binarized)
		backtransform = outside = None
		if stage.dop:
			if stage.estimates is not None:
				raise ValueError('not supported')
			if stage.dop == 'doubledop':
				backtransform = gzip.open('%s/%s.backtransform.gz' % (
						resultdir, stage.name)).read().splitlines()
				if n and stage.prune:
					_ = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=re.compile(b'@.+$'),
						neverblockre=re.compile(b'^#[0-9]+|.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					_ = grammar.getmapping(None,
						neverblockre=re.compile(b'.+}<'))
			elif n and stage.prune:  # dop reduction
				_ = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
			if stage.mode == 'dop-rerank':
				grammar.getrulemapping(
						stages[n - 1].grammar, re.compile(br'@[-0-9]+\b'))
			probsfile = '%s/%s.probs.npz' % (resultdir, stage.name)
			if os.path.exists(probsfile):
				probmodels = np.load(probsfile)  # pylint: disable=no-member
				for name in probmodels.files:
					if name != 'default':
						grammar.register(unicode(name), probmodels[name])
		else:  # not stage.dop
			if n and stage.prune:
				_ = grammar.getmapping(stages[n - 1].grammar,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
		if stage.estimates in ('SX', 'SXlrgaps'):
			if stage.estimates == 'SX' and grammar.maxfanout != 1:
				raise ValueError('SX estimate requires PCFG.')
			if stage.mode != 'plcfrs':
				raise ValueError('estimates require parser w/agenda.')
			outside = np.load(  # pylint: disable=no-member
					'%s/%s.outside.npz' % (resultdir, stage.name))['outside']
			logging.info('loaded %s estimates', stage.estimates)
		elif stage.estimates:
			raise ValueError('unrecognized value; specify SX or SXlrgaps.')
		if stage.mode.startswith('pcfg-bitpar'):
			if grammar.maxfanout != 1:
				raise ValueError('bitpar requires a PCFG.')
		_sumsto1, msg = grammar.testgrammar()
		logging.info('%s: %s', stage.name, msg)
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
	if postagging and postagging.method == 'unknownword':
		postagging.unknownwordfun = UNKNOWNWORDFUNC[postagging.model]
		postagging.lexicon = {w for w in stages[0].grammar.lexicalbyword
				if not w.startswith(UNK)}
		postagging.sigs = {w for w in stages[0].grammar.lexicalbyword
				if w.startswith(UNK)}
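# Hedged usage sketch, not part of the original module: readgrammars()
# mutates the Stage objects in place, so after the call each stage carries
# the Grammar (plus backtransform/outside data) read from disk. The
# 'params' object and its 'stages'/'postagging' attributes below are
# hypothetical stand-ins for whatever parses '%s/params.prm' % resultdir
# in the real pipeline.
def _example_readgrammars(resultdir, params):
	"""Illustrative only: reload the grammars of a previous experiment."""
	readgrammars(resultdir, params.stages,
			postagging=params.postagging, top='ROOT')
	for stage in params.stages:
		# len(stage.grammar.toid) gives the number of nonterminals,
		# as in the log messages emitted above.
		logging.info('%s: %d nonterminals',
				stage.name, len(stage.grammar.toid))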
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov,
		factor, tailmarker, revmarkov, leftmostunary, rightmostunary,
		pospa, markhead, fanout_marks_before_bin, testmaxwords, resultdir,
		numproc, lexmodel, simplelexsmooth, top, relationalrealizational):
	""" Apply binarization and read off the requested grammars. """
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	if fanout_marks_before_bin:
		trees = [addfanoutmarkers(t) for t in trees]
	if bintype == 'binarize':
		bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
				'tailmarker' if tailmarker else '')
		for a in trees:
			binarize(a, factor=factor, tailmarker=tailmarker,
					horzmarkov=horzmarkov, vertmarkov=vertmarkov,
					leftmostunary=leftmostunary,
					rightmostunary=rightmostunary,
					reverse=revmarkov, pospa=pospa,
					headidx=-1 if markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else ())
	elif bintype == 'optimal':
		trees = [Tree.convert(optimalbinarize(tree))
					for n, tree in enumerate(trees)]
	elif bintype == 'optimalhead':
		trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
				h=horzmarkov, v=vertmarkov))
				for n, tree in enumerate(trees)]
	trees = [addfanoutmarkers(t) for t in trees]
	logging.info('binarized %s cpu time elapsed: %gs',
			bintype, time.clock() - begin)
	logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
	trees = [canonicalize(a).freeze() for a in trees]

	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [binarize(splitdiscnodes(Tree.convert(a),
					stage.markorigin), childchar=':').freeze()
					for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			assert tbfanout == 1 or stage.split
		backtransform = None
		if stage.dop:
			if stage.usedoubledop:
				# find recurring fragments in treebank,
				# as well as depth 1 'cover' fragments
				fragments = getfragments(traintrees, sents, numproc,
						iterate=stage.iterate, complement=stage.complement)
				xgrammar, backtransform, altweights = doubledop(
						traintrees, fragments)
			else:  # DOP reduction
				xgrammar, altweights = dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
				for weights in altweights.values():
					weights.extend(w for _, w in newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			msg = grammarinfo(xgrammar)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			for name in altweights:
				grammar.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes,
				len(grammar.toid))
			logging.info(msg)
			if stage.estimator != 'dop1':
				grammar.switch(u'%s' % stage.estimator)
			_sumsto1 = grammar.testgrammar()
			if stage.usedoubledop:
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = grammar.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and not stages[n - 1].usedoubledop
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
				logging.info(msg)
			# write prob models
			np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(grammar.modelnames, grammar.models)})
		else:  # not stage.dop
			xgrammar = treebankgrammar(traintrees, sents)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammarinfo(xgrammar))
			else:
				logging.info(grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			_sumsto1 = grammar.testgrammar()
			if n and stage.prune:
				msg = grammar.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
				resultdir, stage.name,
				',backtransform' if stage.usedoubledop else '')

		outside = None
		if stage.getestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			logging.info('computing PCFG estimates')
			begin = time.clock()
			outside = getpcfgestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('pcfgoutside.npz', outside=outside)
			logging.info('saved PCFG estimates')
		elif stage.useestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			assert stage.mode != 'pcfg', (
					'estimates require agenda-based parser.')
			outside = np.load('pcfgoutside.npz')['outside']
			logging.info('loaded PCFG estimates')
		if stage.getestimates == 'SXlrgaps':
			logging.info('computing PLCFRS estimates')
			begin = time.clock()
			outside = getestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('outside.npz', outside=outside)
			logging.info('saved estimates')
		elif stage.useestimates == 'SXlrgaps':
			outside = np.load('outside.npz')['outside']
			logging.info('loaded PLCFRS estimates')
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
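# Hedged sketch, not in the original module, of the binarization
# preprocessing that getgrammars() applies before grammar extraction.
# The toy tree (with integer leaves that index into a separate sentence,
# as elsewhere in this codebase) is an assumption; binarize(),
# addfanoutmarkers() and canonicalize() are the helpers used above.
def _example_binarization():
	"""Illustrative only: markovized binarization of a single tree."""
	tree = Tree('(S (NP (DT 0) (NN 1)) (VP (VB 2) (NP (DT 3) (NN 4))))')
	# h=1, v=1: keep one horizontal sibling and one vertical ancestor
	# in the labels of the artificial binarization nodes.
	binarize(tree, factor='right', horzmarkov=1, vertmarkov=1)
	tree = addfanoutmarkers(tree)  # mark nodes with fan-out > 1, as above
	return canonicalize(tree).freeze()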
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
		numproc, lexmodel, simplelexsmooth, top):
	"""Read off the requested grammars."""
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('binarized treebank fan-out: %d #%d', tbfanout, n)
	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [treetransforms.binarize(
					treetransforms.splitdiscnodes(
						Tree.convert(a), stage.markorigin),
					childchar=':', dot=True,
					ids=grammar.UniqueIDs()).freeze()
					for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			if tbfanout != 1 and not stage.split:
				raise ValueError('Cannot extract PCFG from treebank '
						'with discontinuities.')
		backtransform = extrarules = None
		if lexmodel and simplelexsmooth:
			extrarules = lexicon.simplesmoothlexicon(lexmodel)
		if stage.dop:
			if stage.dop == 'doubledop':
				(xgrammar, backtransform, altweights, fragments
						) = grammar.doubledop(
						traintrees, sents, binarized=stage.binarized,
						iterate=stage.iterate, complement=stage.complement,
						numproc=numproc, extrarules=extrarules)
				# dump fragments
				with codecs.getwriter('utf-8')(gzip.open(
						'%s/%s.fragments.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\t%d\n' % (treebank.writetree(
							a, b, 0, 'bracket'
							if stage.mode.startswith('pcfg')
							else 'discbracket').rstrip(), sum(c.values()))
							for (a, b), c in fragments.items())
			elif stage.dop == 'reduction':
				xgrammar, altweights = grammar.dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph,
						extrarules=extrarules)
			else:
				raise ValueError('unrecognized DOP model: %r' % stage.dop)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and not simplelexsmooth:  # FIXME: altweights?
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			msg = grammar.grammarinfo(xgrammar)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'),
					binarized=stage.binarized)
			for name in altweights:
				gram.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info('DOP model based on %d sentences, %d nodes, '
				'%d nonterminals', len(traintrees), nodes, len(gram.toid))
			logging.info(msg)
			if stage.estimator != 'rfe':
				gram.switch(u'%s' % stage.estimator)
			logging.info(gram.testgrammar()[1])
			if stage.dop == 'doubledop':
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = gram.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							else re.compile(b'@.+$'),
						neverblockre=re.compile(b'.+}<'),
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = gram.getmapping(None,
						striplabelre=None,
						neverblockre=re.compile(b'.+}<'),
						splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None if stages[n - 1].dop
						and stages[n - 1].dop != 'doubledop'
						else re.compile(b'@[-0-9]+$'),
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					gram.getrulemapping(
							stages[n - 1].grammar,
							re.compile(br'@[-0-9]+\b'))
				logging.info(msg)
			# write prob models
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(gram.modelnames, gram.models)})
		else:  # not stage.dop
			xgrammar = grammar.treebankgrammar(traintrees, sents,
					extrarules=extrarules)
			logging.info('induced %s based on %d sentences',
				('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
				len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammar.grammarinfo(xgrammar))
			else:
				logging.info(grammar.grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and not simplelexsmooth:
				xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
			rules, lex = grammar.write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			gram = Grammar(rules, lex, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lex)
			logging.info(gram.testgrammar()[1])
			if n and stage.prune:
				msg = gram.getmapping(stages[n - 1].grammar,
					striplabelre=None,
					neverblockre=re.compile(stage.neverblockre)
						if stage.neverblockre else None,
					splitprune=stage.splitprune and stages[n - 1].split,
					markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
				resultdir, stage.name,
				',backtransform' if stage.dop == 'doubledop' else '')

		outside = None
		if stage.estimates in ('SX', 'SXlrgaps'):
			if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
				raise ValueError('SX estimate requires PCFG.')
			elif stage.mode != 'plcfrs':
				raise ValueError('estimates require parser w/agenda.')
			begin = time.clock()
			logging.info('computing %s estimates', stage.estimates)
			if stage.estimates == 'SX':
				outside = estimates.getpcfgestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			elif stage.estimates == 'SXlrgaps':
				outside = estimates.getestimates(gram, testmaxwords,
						gram.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez_compressed(  # pylint: disable=no-member
					'%s/%s.outside.npz' % (resultdir, stage.name),
					outside=outside)
			logging.info('saved %s estimates', stage.estimates)
		elif stage.estimates:
			raise ValueError('unrecognized value; specify SX or SXlrgaps.')
		stage.update(grammar=gram, backtransform=backtransform,
				outside=outside)
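# Hedged end-to-end sketch, not in the original module: induce a plain
# treebank grammar from a two-sentence toy corpus and run the same
# serialize/construct/test sequence as the non-DOP branch of getgrammars().
# The toy trees and sentences are assumptions; the function calls mirror
# the ones made above.
def _example_treebankgrammar(top='ROOT'):
	"""Illustrative only: induce and sanity-check a tiny grammar."""
	trees = [Tree('(ROOT (S (NP 0) (VP 1)))'),
			Tree('(ROOT (S (NP 0) (VP 1)))')]
	sents = [['John', 'sleeps'], ['Mary', 'walks']]
	xgrammar = grammar.treebankgrammar(trees, sents)
	rules, lex = grammar.write_lcfrs_grammar(xgrammar, bitpar=False)
	gram = Grammar(rules, lex, start=top)
	logging.info(gram.testgrammar()[1])  # e.g. whether weights sum to 1
	return gram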