def test_optimalbinarize():
    """Check optimal binarization against plain binarization.

    For every tree of the sample corpus that has at least one node with
    fan-out other than 1, the worst ``complexityfanout`` over the optimally
    binarized tree must not exceed that of the ordinary binarization; this
    is checked both without and with head-driven binarization."""
    from discodop.treetransforms import optimalbinarize, complexityfanout
    from discodop.treebank import NegraCorpusReader

    def worst(bintree):
        """Return the highest complexityfanout over all subtrees."""
        return max(map(complexityfanout, bintree.subtrees()))

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    total = 0
    violations = 0
    violationshd = 0
    treelist = list(corpus.trees().values())[:-2000]
    for n, (tree, sent) in enumerate(zip(treelist, corpus.sents().values())):
        withbits = addbitsets(tree)
        # only trees with a node whose fan-out is not 1 are of interest
        if not any(fanout(node) != 1 for node in withbits.subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1

        # not head-driven
        optbin = optimalbinarize(
                tree.copy(True), headdriven=False, h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        optscore, normscore = worst(optbin), worst(normbin)
        if optscore > normscore:
            print('non-hd\n', tree)
            print(optscore, optbin)
            print(normscore, normbin, '\n')
            violations += 1

        # head-driven, h=1
        optbin = optimalbinarize(
                tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        optscore, normscore = worst(optbin), worst(normbin)
        if optscore > normscore:
            print('hd\n', tree)
            print(optscore, optbin)
            print(normscore, normbin, '\n')
            violationshd += 1
    print('opt. bin. violations normal: %d / %d; hd: %d / %d' % (
            violations, total, violationshd, total))
    assert violations == violationshd == 0
def decorate(self, tree, sent):
    """Return a copy of tree with labels decorated with IDs.

    Without memoization, every node except the root gets the suffix
    ``@n-m``, where ``n`` is this decorator's running counter and ``m``
    the position of the node among the non-root subtrees. With
    memoization, the labels are produced by ``self._recdecorate``. In both
    cases the counter ``self.n`` is incremented afterwards.

    >>> d = TreeDecorator()
    >>> tree = Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))", parse_leaf=int)
    >>> d.decorate(tree, ['the', 'dog', 'walks'])
    ... # doctest: +NORMALIZE_WHITESPACE
    Tree('S', [Tree('NP@1-0', [Tree('DT@1-1', [0]),
        Tree('N@1-2', [1])]), Tree('VP@1-3', [2])])
    >>> d = TreeDecorator(memoize=True)
    >>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
    ...     parse_leaf=int), ['the', 'dog', 'walks']))
    (S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@1-4 2))
    >>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
    ...     parse_leaf=int), ['the', 'dog', 'barks']))
    (S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@2-4 2))"""
    if not self.memoize:
        result = Tree.convert(tree.copy(True))
        # the top node should not get an ID, so start at the second subtree
        nonroot = islice(result.subtrees(), 1, None)
        for idx, node in enumerate(nonroot):
            node.label = "%s@%d-%d" % (node.label, self.n, idx)
    else:
        self.ids = 0
        # wrapping the tree makes equality take the sentence into account
        wrapped = DiscTree(tree.freeze(), sent)
        children = map(self._recdecorate, wrapped)
        result = ImmutableTree(wrapped.label, children)
    self.n += 1
    return result
def optimalbinarize(tree, sep='|', headdriven=False, h=None, v=1):
    """Recursively binarize a tree, optimizing for complexity.

    The work is delegated to ``recbinarizetree`` on a bitset-annotated
    version of the tree.

    :param tree: the tree to binarize.
    :param sep: passed through to ``recbinarizetree``.
    :param headdriven: passed through to ``recbinarizetree``.
    :param h: when None, the tree is first converted with ``Tree.convert``
        and the children of every node with more than one child are sorted
        by their leaves; a value of None or 0 is passed on as 999.
    :param v: passed through; v=0 is not implemented.
    :returns: the result of ``recbinarizetree``."""
    if h is None:
        tree = Tree.convert(tree)
        branching = list(tree.subtrees(lambda node: len(node) > 1))
        # visit the nodes in reverse order of enumeration
        for node in reversed(branching):
            node.sort(key=lambda child: child.leaves())
    horzmarkov = h or 999
    return recbinarizetree(
            addbitsets(tree), sep, headdriven, horzmarkov, v, ())
def dobinarization(trees, sents, binarization, relationalrealizational):
    """Apply the configured binarization to a treebank.

    :param trees: list of trees. With method 'default' the result of
        ``treetransforms.binarize`` is discarded, so that method relies on
        the trees being modified in place; the 'optimal' methods build new
        trees.
    :param sents: list of token sequences parallel to ``trees``; only used
        for the log message.
    :param binarization: settings object. Attributes read here: ``method``
        (None, 'default', 'optimal' or 'optimalhead'; any other value
        silently skips binarization), ``fanout_marks_before_bin``,
        ``factor``, ``h``, ``v``, ``tailmarker``, ``leftmostunary``,
        ``rightmostunary``, ``revmarkov``, ``markhead`` and ``labelfun``.
    :param relationalrealizational: falsy, or a mapping with the keys
        'ignorefunctions' and 'adjunctionlabel' whose values are combined
        into the ``filterfuncs`` argument of ``binarize``.
    :returns: list of canonicalized, frozen trees with fan-out markers."""
    # fixme: this n should correspond to sentence id
    tbfanout, n = treebank.treebankfanout(trees)
    logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
            tbfanout, n, trees[n], ' '.join(sents[n]))
    # time.clock() was removed in Python 3.8; process_time() is the CPU
    # time replacement. Fall back to clock() only where it is missing.
    cputime = getattr(time, 'process_time', None) or time.clock
    # binarization
    begin = cputime()
    msg = 'binarization: %s' % binarization.method
    if binarization.fanout_marks_before_bin:
        trees = [treetransforms.addfanoutmarkers(t) for t in trees]
    if binarization.method is None:
        pass
    elif binarization.method == 'default':
        msg += ' %s h=%d v=%d %s' % (
                binarization.factor, binarization.h, binarization.v,
                'tailmarker' if binarization.tailmarker else '')
        # identical for every tree, so computed once
        if relationalrealizational:
            filterfuncs = (relationalrealizational['ignorefunctions']
                    + (relationalrealizational['adjunctionlabel'], ))
        else:
            filterfuncs = ()
        for a in trees:
            treetransforms.binarize(a, factor=binarization.factor,
                    tailmarker=binarization.tailmarker,
                    horzmarkov=binarization.h, vertmarkov=binarization.v,
                    leftmostunary=binarization.leftmostunary,
                    rightmostunary=binarization.rightmostunary,
                    reverse=binarization.revmarkov,
                    headidx=-1 if binarization.markhead else None,
                    filterfuncs=filterfuncs,
                    labelfun=binarization.labelfun)
    elif binarization.method == 'optimal':
        trees = [Tree.convert(treetransforms.optimalbinarize(tree))
                for tree in trees]
    elif binarization.method == 'optimalhead':
        msg += ' h=%d v=%d' % (
                binarization.h, binarization.v)
        trees = [Tree.convert(treetransforms.optimalbinarize(
                tree, headdriven=True, h=binarization.h, v=binarization.v))
                for tree in trees]
    trees = [treetransforms.addfanoutmarkers(t) for t in trees]
    logging.info('%s; cpu time elapsed: %gs',
            msg, cputime() - begin)
    trees = [treetransforms.canonicalize(a).freeze() for a in trees]
    return trees
# Evaluate the supertag grammar on the training portion of the corpus: parse
# each sentence from its gold supertags and compare the result to the gold
# tree.
config.read(argv[1])
data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")

from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar

# Close the grammar file as soon as it has been read instead of leaking the
# handle for the rest of the run.
# NOTE(review): `load` is defined outside this fragment; if it is
# pickle.load, only use it on grammar files from a trusted source.
with open(f"{config['Corpus']['filename']}.grammar", "rb") as grammarfile:
    grammar = load(grammarfile)
i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # one candidate supertag per token, each with weight 0.0
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # no derivation found: fall back to a flat tree over the POS tags
        leaves = (f"({pos} {position})"
                for position, pos in enumerate(poss))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    gold = ParentedTree.convert(
            unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
            unbinarize(removefanoutmarkers(Tree.convert(parse))))
    evaluator.add(i, gold.copy(deep=True), list(words),
            parse.copy(deep=True), list(words))
    i += 1
print(evaluator.summary())
cp = ConfigParser() cp.read(argv[1]) config = corpusparam(**cp["Corpus"], **cp["Grammar"]) from discodop.tree import Tree from discodop.treebank import READERS from discodop.treetransforms import addfanoutmarkers, binarize, collapseunary from discodop.lexgrammar import SupertagCorpus, SupertagGrammar corpus = READERS[config.inputfmt](config.filename, encoding=config.inputenc, punct="move") trees = [ addfanoutmarkers( binarize( collapseunary( Tree.convert(t), collapseroot=True, collapsepos=True), horzmarkov=config.h, vertmarkov=config.v)) for t in corpus.trees().values()] sents = list(corpus.sents().values()) corpus = SupertagCorpus(trees, sents) size = len(corpus.sent_corpus) portions = config.split.split() names = "train dev test".split() assert len(portions) in [3,4] if portions[0] == "debug": portions = tuple(int(portion) for portion in portions[1:2]+portions[1:]) limits = tuple((name, slice(0, end)) for name, end in zip(names, portions)) else:
def main():
    """Command line interface for applying tree(bank) transforms.

    Positional arguments: the action, optionally followed by an input file
    and an output file ('-' or absent means standard input / output).
    Reads the corpus, applies the selected transformation to every tree and
    writes the result in the requested output format. Exits with status 2
    on a usage error; raises ValueError when a dependency output format is
    requested without head rules."""
    import io
    from getopt import gnu_getopt, GetoptError
    from discodop import treebanktransforms
    # actions mapped to None get their transformation assigned further
    # below (or need none).
    # NOTE(review): every transform is called as transform(tree, sent);
    # confirm that the functions listed here directly accept two arguments.
    actions = {'none': None, 'introducepreterminals': introducepreterminals,
            'splitdisc': None, 'mergedisc': mergediscnodes,
            'transform': None, 'unbinarize': unbinarize, 'binarize': None,
            'optimalbinarize': None}
    flags = ('markorigin markheads leftunary rightunary tailmarker '
            'renumber reverse'.split())
    options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
            'punct= headrules= functions= morphology= lemmas= factor= '
            'markorigin= maxlen= fmt= enc= transforms=').split()
    try:
        opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
        if not 1 <= len(args) <= 3:
            raise GetoptError(
                    'error: expected 1, 2, or 3 positional arguments')
    except GetoptError as err:
        print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
        sys.exit(2)
    opts, action = dict(opts), args[0]
    if action not in actions:
        print('unrecognized action: %r\navailable actions: %s' % (
                action, ', '.join(actions)), file=sys.stderr)
        sys.exit(2)
    # --fmt and --enc are shorthands that set both input and output
    if '--fmt' in opts:
        opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
    if '--enc' in opts:
        opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
    if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
        print('unrecognized output format: %r\navailable formats: %s' % (
                opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
        sys.exit(2)
    infilename = (args[1] if len(args) >= 2 and args[1] != '-'
            else '/dev/stdin')
    outfilename = (args[2] if len(args) == 3 and args[2] != '-'
            else '/dev/stdout')

    # open corpus
    corpus = READERS[opts.get('--inputfmt', 'export')](
            infilename,
            encoding=opts.get('--inputenc', 'utf-8'),
            headrules=opts.get('--headrules'),
            markheads='--markheads' in opts,
            ensureroot=opts.get('--ensureroot'),
            punct=opts.get('--punct'),
            functions=opts.get('--functions'),
            morphology=opts.get('--morphology'),
            lemmas=opts.get('--lemmas'))
    start, end = opts.get('--slice', ':').split(':')
    start, end = (int(start) if start else None), (int(end) if end else None)
    trees = corpus.itertrees(start, end)
    if '--maxlen' in opts:
        maxlen = int(opts['--maxlen'])
        trees = ((key, (tree, sent)) for key, (tree, sent) in trees
                if len(sent) <= maxlen)
    if '--renumber' in opts:
        trees = (('%8d' % n, treesent)
                for n, (_, treesent) in enumerate(trees, 1))

    # select transformation
    transform = actions[action]
    if action in ('binarize', 'optimalbinarize'):
        h = int(opts.get('-h', 999))
        v = int(opts.get('-v', 1))
        if action == 'binarize':
            factor = opts.get('--factor', 'right')
            transform = lambda t, _: binarize(t, factor, h, v,
                    leftmostunary='--leftunary' in opts,
                    rightmostunary='--rightunary' in opts,
                    tailmarker='$' if '--tailmarker' in opts else '')
        elif action == 'optimalbinarize':
            headdriven = '--headrules' in opts
            transform = lambda t, _: optimalbinarize(
                    t, '|', headdriven, h, v)
    elif action == 'splitdisc':
        transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
    elif action == 'unbinarize':
        transform = lambda t, _: unbinarize(Tree.convert(t))
    elif action == 'transform':
        tfs = opts['--transforms'].split(',')
        transform = lambda t, s: (
                treebanktransforms.reversetransform(t, tfs)
                if '--reverse' in opts
                else treebanktransforms.transform(t, s, tfs))
    if transform is not None:  # NB: transform cannot affect (no. of) terminals
        trees = ((key, (transform(tree, sent), sent))
                for key, (tree, sent) in trees)

    # read, transform, & write trees
    headrules = None
    if opts.get('--outputfmt') in ('mst', 'conll'):
        if not opts.get('--headrules'):
            raise ValueError('need head rules for dependency conversion')
        headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
    cnt = 0
    if opts.get('--outputfmt') == 'dact':
        import alpinocorpus
        outfile = alpinocorpus.CorpusWriter(outfilename)
        # copy the original blocks when nothing but slicing, renumbering or
        # a format change between alpino and dact is requested
        if (action == 'none' and opts.get('--inputfmt') in ('alpino', 'dact')
                and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
                '--renumber'}):
            for n, (key, block) in islice(enumerate(
                    corpus.blocks().items(), 1), start, end):
                outfile.write('%8d' % n if '--renumber' in opts else key,
                        block)
                cnt += 1
        else:
            for key, (tree, sent) in trees:
                outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
                cnt += 1
    else:
        # getopt keys carry their leading dashes; looking up 'outputenc'
        # without them never matched, so --outputenc / --enc were ignored.
        encoding = opts.get('--outputenc', 'utf-8')
        # the context manager flushes and closes the output file
        with io.open(outfilename, 'w', encoding=encoding) as outfile:
            # copy trees verbatim when only taking slice or converting
            # encoding
            if (action == 'none'
                    and opts.get('--inputfmt') == opts.get('--outputfmt')
                    and set(opts) <= {'--slice', '--inputenc',
                    '--outputenc', '--inputfmt', '--outputfmt'}):
                for block in islice(corpus.blocks().values(), start, end):
                    outfile.write(block)
                    cnt += 1
            else:
                for key, (tree, sent) in trees:
                    outfile.write(writetree(tree, sent, key,
                            opts.get('--outputfmt', 'export'), headrules))
                    cnt += 1
    print('%sed %d trees with action %r' % (
            'convert' if action == 'none' else 'transform', cnt, action),
            file=sys.stderr)
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor,
        tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead,
        fanout_marks_before_bin, testmaxwords, resultdir, numproc,
        lexmodel, simplelexsmooth, top, relationalrealizational):
    """Apply binarization and read off the requested grammars.

    First binarizes the treebank according to ``bintype`` ('binarize',
    'optimal' or 'optimalhead'; any other value skips binarization), then
    for each stage induces a grammar (treebank grammar, DOP reduction or
    Double-DOP), writes it to ``resultdir`` as ``<stage.name>.rules.gz`` and
    ``<stage.name>.lex.gz``, sets up the mapping to the previous stage when
    pruning is requested, and computes or loads outside estimates.

    Nothing is returned; each stage is updated in place with the keys
    ``grammar``, ``backtransform`` and ``outside``.

    :param trees: list of trees; :param sents: parallel list of sentences.
    :param stages: sequence of stage objects; attributes read include
        ``split``, ``markorigin``, ``mode``, ``dop``, ``usedoubledop``,
        ``estimator``, ``prune``, ``splitprune``, ``neverblockre``,
        ``getestimates``, ``useestimates`` and ``name``.
    :param testmaxwords: passed to the estimate functions.
    :param lexmodel: if truthy, the lexicon is smoothed; with
        ``simplelexsmooth`` extra rules are appended instead.
    :param top: start symbol passed to ``Grammar``.
    :param relationalrealizational: falsy, or a mapping with the keys
        'ignorefunctions' and 'adjunctionlabel'.
    :raises AssertionError: when a PCFG stage or the SX estimate is
        requested for a treebank with fan-out > 1 without splitting."""
    # fixme: this n should correspond to sentence id
    tbfanout, n = treebankfanout(trees)
    logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
            tbfanout, n, trees[n], ' '.join(sents[n]))
    # time.clock() was removed in Python 3.8; process_time() is the CPU
    # time replacement. Fall back to clock() only where it is missing.
    cputime = getattr(time, 'process_time', None) or time.clock
    # binarization
    begin = cputime()
    if fanout_marks_before_bin:
        trees = [addfanoutmarkers(t) for t in trees]
    if bintype == 'binarize':
        # from here on bintype is only used for the log message
        bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
                'tailmarker' if tailmarker else '')
        for a in trees:
            binarize(a, factor=factor, tailmarker=tailmarker,
                    horzmarkov=horzmarkov, vertmarkov=vertmarkov,
                    leftmostunary=leftmostunary,
                    rightmostunary=rightmostunary,
                    reverse=revmarkov, pospa=pospa,
                    headidx=-1 if markhead else None,
                    filterfuncs=(relationalrealizational['ignorefunctions']
                    + (relationalrealizational['adjunctionlabel'], ))
                    if relationalrealizational else ())
    elif bintype == 'optimal':
        trees = [Tree.convert(optimalbinarize(tree)) for tree in trees]
    elif bintype == 'optimalhead':
        trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
                h=horzmarkov, v=vertmarkov)) for tree in trees]
    trees = [addfanoutmarkers(t) for t in trees]
    logging.info('binarized %s cpu time elapsed: %gs',
            bintype, cputime() - begin)
    logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
    trees = [canonicalize(a).freeze() for a in trees]

    for n, stage in enumerate(stages):
        if stage.split:
            traintrees = [binarize(splitdiscnodes(Tree.convert(a),
                    stage.markorigin), childchar=':').freeze()
                    for a in trees]
            logging.info('splitted discontinuous nodes')
        else:
            traintrees = trees
        if stage.mode.startswith('pcfg'):
            assert tbfanout == 1 or stage.split
        backtransform = None
        if stage.dop:
            if stage.usedoubledop:
                # find recurring fragments in treebank,
                # as well as depth 1 'cover' fragments
                fragments = getfragments(traintrees, sents, numproc,
                        iterate=stage.iterate, complement=stage.complement)
                xgrammar, backtransform, altweights = doubledop(
                        traintrees, fragments)
            else:  # DOP reduction
                xgrammar, altweights = dopreduction(
                        traintrees, sents, packedgraph=stage.packedgraph)
            nodes = sum(len(list(a.subtrees())) for a in traintrees)
            if lexmodel and simplelexsmooth:
                newrules = simplesmoothlexicon(lexmodel)
                xgrammar.extend(newrules)
                # keep the alternative weight vectors aligned with the rules
                for weights in altweights.values():
                    weights.extend(w for _, w in newrules)
            elif lexmodel:
                xgrammar = smoothlexicon(xgrammar, lexmodel)
            msg = grammarinfo(xgrammar)
            rules, lexicon = write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            grammar = Grammar(rules, lexicon, start=top,
                    bitpar=stage.mode.startswith('pcfg'))
            for name in altweights:
                grammar.register(u'%s' % name, altweights[name])
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lexicon)
            logging.info('DOP model based on %d sentences, %d nodes, '
                    '%d nonterminals', len(traintrees), nodes,
                    len(grammar.toid))
            logging.info(msg)
            if stage.estimator != 'dop1':
                grammar.switch(u'%s' % stage.estimator)
            _sumsto1 = grammar.testgrammar()
            if stage.usedoubledop:
                # backtransform keys are line numbers to rules file;
                # to see them together do:
                # $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
                with codecs.getwriter('ascii')(gzip.open(
                        '%s/%s.backtransform.gz' % (resultdir, stage.name),
                        'w')) as out:
                    out.writelines('%s\n' % a for a in backtransform)
                if n and stage.prune:
                    msg = grammar.getmapping(stages[n - 1].grammar,
                            striplabelre=None if stages[n - 1].dop
                            else re.compile(b'@.+$'),
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=stage.splitprune
                            and stages[n - 1].split,
                            markorigin=stages[n - 1].markorigin)
                else:
                    # recoverfragments() relies on this mapping to identify
                    # binarization nodes
                    msg = grammar.getmapping(None,
                            striplabelre=None,
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=False, markorigin=False)
                logging.info(msg)
            elif n and stage.prune:  # dop reduction
                msg = grammar.getmapping(stages[n - 1].grammar,
                        striplabelre=None if stages[n - 1].dop
                        and not stages[n - 1].usedoubledop
                        else re.compile(b'@[-0-9]+$'),
                        neverblockre=re.compile(stage.neverblockre)
                        if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                if stage.mode == 'dop-rerank':
                    grammar.getrulemapping(stages[n - 1].grammar)
                logging.info(msg)
            # write prob models
            np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
                    **{name: mod for name, mod
                    in zip(grammar.modelnames, grammar.models)})
        else:  # not stage.dop
            xgrammar = treebankgrammar(traintrees, sents)
            logging.info('induced %s based on %d sentences',
                    ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
                    len(traintrees))
            # pcdist.txt is only dumped when it is not there yet and the
            # stage does not split discontinuous nodes
            if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
                logging.info(grammarinfo(xgrammar))
            else:
                logging.info(grammarinfo(xgrammar,
                        dump='%s/pcdist.txt' % resultdir))
            if lexmodel and simplelexsmooth:
                newrules = simplesmoothlexicon(lexmodel)
                xgrammar.extend(newrules)
            elif lexmodel:
                xgrammar = smoothlexicon(xgrammar, lexmodel)
            rules, lexicon = write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            grammar = Grammar(rules, lexicon, start=top,
                    bitpar=stage.mode.startswith('pcfg'))
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lexicon)
            _sumsto1 = grammar.testgrammar()
            if n and stage.prune:
                msg = grammar.getmapping(stages[n - 1].grammar,
                        striplabelre=None,
                        neverblockre=re.compile(stage.neverblockre)
                        if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                logging.info(msg)
        logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir,
                stage.name, ',backtransform' if stage.usedoubledop else '')

        outside = None
        if stage.getestimates == 'SX':
            assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
            logging.info('computing PCFG estimates')
            begin = cputime()
            outside = getpcfgestimates(grammar, testmaxwords,
                    grammar.toid[trees[0].label])
            logging.info('estimates done. cpu time elapsed: %gs',
                    cputime() - begin)
            np.savez('pcfgoutside.npz', outside=outside)
            logging.info('saved PCFG estimates')
        elif stage.useestimates == 'SX':
            assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
            assert stage.mode != 'pcfg', (
                    'estimates require agenda-based parser.')
            outside = np.load('pcfgoutside.npz')['outside']
            logging.info('loaded PCFG estimates')
        if stage.getestimates == 'SXlrgaps':
            logging.info('computing PLCFRS estimates')
            begin = cputime()
            outside = getestimates(grammar, testmaxwords,
                    grammar.toid[trees[0].label])
            logging.info('estimates done. cpu time elapsed: %gs',
                    cputime() - begin)
            np.savez('outside.npz', outside=outside)
            logging.info('saved estimates')
        elif stage.useestimates == 'SXlrgaps':
            outside = np.load('outside.npz')['outside']
            logging.info('loaded PLCFRS estimates')
        stage.update(grammar=grammar, backtransform=backtransform,
                outside=outside)
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
        numproc, lexmodel, simplelexsmooth, top):
    """Read off the requested grammars.

    For each stage a grammar is induced from the (already binarized)
    treebank (a treebank grammar, a DOP reduction or a Double-DOP grammar,
    depending on ``stage.dop``) and written to ``resultdir`` as
    ``<stage.name>.rules.gz`` and ``<stage.name>.lex.gz``. When pruning is
    requested, the mapping to the grammar of the previous stage is set up,
    and outside estimates are computed and saved when ``stage.estimates``
    is set.

    Nothing is returned; each stage is updated in place with the keys
    ``grammar``, ``backtransform`` and ``outside``.

    :param trees: list of trees; :param sents: parallel list of sentences.
    :param stages: sequence of stage objects; attributes read include
        ``split``, ``markorigin``, ``mode``, ``dop``, ``binarized``,
        ``iterate``, ``complement``, ``packedgraph``, ``estimator``,
        ``prune``, ``splitprune``, ``neverblockre``, ``estimates`` and
        ``name``.
    :param testmaxwords: passed to the estimate functions.
    :param numproc: passed to ``grammar.doubledop``.
    :param lexmodel: if truthy, the lexicon is smoothed; with
        ``simplelexsmooth`` extra rules are passed to the grammar
        induction instead.
    :param top: start symbol passed to ``Grammar``.
    :raises ValueError: for a PCFG stage or SX estimate on a treebank with
        fan-out > 1 without splitting, an unrecognized DOP model, an
        unrecognized ``estimates`` value, or estimates with a mode other
        than 'plcfrs'."""
    tbfanout, n = treebank.treebankfanout(trees)
    logging.info('binarized treebank fan-out: %d #%d', tbfanout, n)
    # time.clock() was removed in Python 3.8; process_time() is the CPU
    # time replacement. Fall back to clock() only where it is missing.
    cputime = getattr(time, 'process_time', None) or time.clock
    for n, stage in enumerate(stages):
        if stage.split:
            traintrees = [treetransforms.binarize(
                    treetransforms.splitdiscnodes(
                        Tree.convert(a), stage.markorigin),
                    childchar=':', dot=True,
                    ids=grammar.UniqueIDs()).freeze()
                    for a in trees]
            logging.info('splitted discontinuous nodes')
        else:
            traintrees = trees
        if stage.mode.startswith('pcfg'):
            if tbfanout != 1 and not stage.split:
                raise ValueError('Cannot extract PCFG from treebank '
                        'with discontinuities.')
        backtransform = extrarules = None
        if lexmodel and simplelexsmooth:
            extrarules = lexicon.simplesmoothlexicon(lexmodel)
        if stage.dop:
            if stage.dop == 'doubledop':
                (xgrammar, backtransform, altweights, fragments
                        ) = grammar.doubledop(
                        traintrees, sents, binarized=stage.binarized,
                        iterate=stage.iterate, complement=stage.complement,
                        numproc=numproc, extrarules=extrarules)
                # dump fragments
                with codecs.getwriter('utf-8')(gzip.open(
                        '%s/%s.fragments.gz' % (resultdir, stage.name),
                        'w')) as out:
                    out.writelines('%s\t%d\n' % (
                            treebank.writetree(a, b, 0,
                            'bracket' if stage.mode.startswith('pcfg')
                            else 'discbracket').rstrip(),
                            sum(c.values()))
                            for (a, b), c in fragments.items())
            elif stage.dop == 'reduction':
                xgrammar, altweights = grammar.dopreduction(
                        traintrees, sents, packedgraph=stage.packedgraph,
                        extrarules=extrarules)
            else:
                raise ValueError('unrecognized DOP model: %r' % stage.dop)
            nodes = sum(len(list(a.subtrees())) for a in traintrees)
            if lexmodel and not simplelexsmooth:  # FIXME: altweights?
                xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
            msg = grammar.grammarinfo(xgrammar)
            rules, lex = grammar.write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            gram = Grammar(rules, lex, start=top,
                    bitpar=stage.mode.startswith('pcfg'),
                    binarized=stage.binarized)
            for name in altweights:
                gram.register(u'%s' % name, altweights[name])
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lex)
            logging.info('DOP model based on %d sentences, %d nodes, '
                    '%d nonterminals', len(traintrees), nodes,
                    len(gram.toid))
            logging.info(msg)
            if stage.estimator != 'rfe':
                gram.switch(u'%s' % stage.estimator)
            logging.info(gram.testgrammar()[1])
            if stage.dop == 'doubledop':
                # backtransform keys are line numbers to rules file;
                # to see them together do:
                # $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
                with codecs.getwriter('ascii')(gzip.open(
                        '%s/%s.backtransform.gz' % (resultdir, stage.name),
                        'w')) as out:
                    out.writelines('%s\n' % a for a in backtransform)
                if n and stage.prune:
                    msg = gram.getmapping(stages[n - 1].grammar,
                            striplabelre=None if stages[n - 1].dop
                            else re.compile(b'@.+$'),
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=stage.splitprune
                            and stages[n - 1].split,
                            markorigin=stages[n - 1].markorigin)
                else:
                    # recoverfragments() relies on this mapping to identify
                    # binarization nodes
                    msg = gram.getmapping(None,
                            striplabelre=None,
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=False, markorigin=False)
                logging.info(msg)
            elif n and stage.prune:  # dop reduction
                msg = gram.getmapping(stages[n - 1].grammar,
                        striplabelre=None if stages[n - 1].dop
                        and stages[n - 1].dop != 'doubledop'
                        else re.compile(b'@[-0-9]+$'),
                        neverblockre=re.compile(stage.neverblockre)
                        if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                if stage.mode == 'dop-rerank':
                    gram.getrulemapping(
                            stages[n - 1].grammar,
                            re.compile(br'@[-0-9]+\b'))
                logging.info(msg)
            # write prob models
            np.savez_compressed(  # pylint: disable=no-member
                    '%s/%s.probs.npz' % (resultdir, stage.name),
                    **{name: mod for name, mod
                    in zip(gram.modelnames, gram.models)})
        else:  # not stage.dop
            xgrammar = grammar.treebankgrammar(traintrees, sents,
                    extrarules=extrarules)
            logging.info('induced %s based on %d sentences',
                    ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
                    len(traintrees))
            # pcdist.txt is only dumped when it is not there yet and the
            # stage does not split discontinuous nodes
            if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
                logging.info(grammar.grammarinfo(xgrammar))
            else:
                logging.info(grammar.grammarinfo(xgrammar,
                        dump='%s/pcdist.txt' % resultdir))
            if lexmodel and not simplelexsmooth:
                xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
            rules, lex = grammar.write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            gram = Grammar(rules, lex, start=top,
                    bitpar=stage.mode.startswith('pcfg'))
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lex)
            logging.info(gram.testgrammar()[1])
            if n and stage.prune:
                msg = gram.getmapping(stages[n - 1].grammar,
                        striplabelre=None,
                        neverblockre=re.compile(stage.neverblockre)
                        if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                logging.info(msg)
        logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
                resultdir, stage.name,
                ',backtransform' if stage.dop == 'doubledop' else '')

        outside = None
        if stage.estimates in ('SX', 'SXlrgaps'):
            if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
                raise ValueError('SX estimate requires PCFG.')
            elif stage.mode != 'plcfrs':
                raise ValueError('estimates require parser w/agenda.')
            begin = cputime()
            logging.info('computing %s estimates', stage.estimates)
            if stage.estimates == 'SX':
                outside = estimates.getpcfgestimates(gram, testmaxwords,
                        gram.toid[trees[0].label])
            elif stage.estimates == 'SXlrgaps':
                outside = estimates.getestimates(gram, testmaxwords,
                        gram.toid[trees[0].label])
            logging.info('estimates done. cpu time elapsed: %gs',
                    cputime() - begin)
            np.savez_compressed(  # pylint: disable=no-member
                    '%s/%s.outside.npz' % (resultdir, stage.name),
                    outside=outside)
            logging.info('saved %s estimates', stage.estimates)
        elif stage.estimates:
            raise ValueError('unrecognized value; specify SX or SXlrgaps.')

        stage.update(grammar=gram, backtransform=backtransform,
                outside=outside)