def annotate(sentno):
	"""Serve the main annotation page for a sentence."""
	username = session['username']
	if sentno == -1:
		sentno = firstunannotated(username)
		return redirect(url_for('annotate', sentno=sentno))
	session['actions'] = [0, 0, 0, 0, 0, 0, 0, time()]
	lineno = QUEUE[sentno - 1][0]
	sent = SENTENCES[lineno]
	senttok, _ = worker.postokenize(sent)
	annotation, n = getannotation(username, lineno)
	if annotation is not None:
		item = exporttree(annotation.splitlines(), functions='add')
		canonicalize(item.tree)
		worker.domorph(item.tree)
		tree = writediscbrackettree(item.tree, item.sent)
		return redirect(url_for(
				'edit', sentno=sentno, annotated=1, tree=tree, n=n))
	return render_template(
			'annotate.html',
			prevlink=str(sentno - 1) if sentno > 1 else '#',
			nextlink=str(sentno + 1) if sentno < len(SENTENCES) else '#',
			sentno=sentno,
			lineno=lineno + 1,
			totalsents=len(SENTENCES),
			numannotated=numannotated(username),
			annotationhelp=ANNOTATIONHELP,
			sent=' '.join(senttok))
def test_optimalbinarize():
	"""Verify that all optimal parsing complexities are lower than or equal
	to the complexities of right-to-left binarizations."""
	from discodop.treetransforms import optimalbinarize, complexityfanout
	from discodop.treebank import NegraCorpusReader
	corpus = NegraCorpusReader('alpinosample.export', punct='move')
	total = violations = violationshd = 0
	for n, (tree, sent) in enumerate(zip(list(
			corpus.trees().values())[:-2000], corpus.sents().values())):
		t = addbitsets(tree)
		if all(fanout(x) == 1 for x in t.subtrees()):
			continue
		print(n, tree, '\n', ' '.join(sent))
		total += 1
		optbin = optimalbinarize(tree.copy(True), headdriven=False,
				h=None, v=1)
		# undo head-ordering to get a normal right-to-left binarization
		normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('non-hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violations += 1

		optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
		normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
		if (max(map(complexityfanout, optbin.subtrees()))
				> max(map(complexityfanout, normbin.subtrees()))):
			print('hd\n', tree)
			print(max(map(complexityfanout, optbin.subtrees())), optbin)
			print(max(map(complexityfanout, normbin.subtrees())), normbin, '\n')
			violationshd += 1
	print('opt. bin. violations normal: %d / %d; hd: %d / %d' % (
			violations, total, violationshd, total))
	assert violations == violationshd == 0
def generate(url):
	yield ('<!doctype html>'
			'<title>redirect</title>'
			'You were logged in successfully. ')
	if username in WORKERS:
		try:
			_ = WORKERS[username].submit(
					worker.getprop, 'headrules').result()
		except BrokenProcessPool:
			pass  # fall through
		else:
			yield "<script>window.location.replace('%s');</script>" % url
			return
	yield 'Loading grammar; this will take a few seconds ...'
	_, lang = os.path.split(os.path.basename(app.config['GRAMMAR']))
	app.logger.info('Loading grammar %r', lang)
	pool = ProcessPoolExecutor(max_workers=1)
	if False and app.config['DEBUG']:
		from discodop.treesearch import NoFuture
		future = NoFuture(worker.loadgrammar,
				app.config['GRAMMAR'], app.config['LIMIT'])
	else:
		future = pool.submit(worker.loadgrammar,
				app.config['GRAMMAR'], app.config['LIMIT'])
	future.result()
	app.logger.info('Grammar %r loaded.', lang)
	# train on annotated sentences
	annotations = readannotations()
	if annotations:
		app.logger.info('training on %d previously annotated sentences',
				len(annotations))
		trees, sents = [], []
		headrules = pool.submit(worker.getprop, 'headrules').result()
		for block in annotations.values():
			item = exporttree(block.splitlines())
			canonicalize(item.tree)
			applyheadrules(item.tree, headrules)
			trees.append(item.tree)
			sents.append(item.sent)
		if False and app.config['DEBUG']:
			future = NoFuture(worker.augment, trees, sents)
		else:
			future = pool.submit(worker.augment, trees, sents)
		future.result()
	WORKERS[username] = pool
	yield "<script>window.location.replace('%s');</script>" % url
def test_transforms():
	"""Test reversibility of Tiger transformations."""
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(), nn.sents().values())]
	print('\ntransformed')
	correct = exact = e = 0
	for a, b, c, d in islice(zip(
			n.trees().values(), n.sents().values(), trees, count()), 100):
		transformc = reversetransform(c.copy(True), b, transformations)
		c1 = bracketings(canonicalize(a))
		c2 = bracketings(canonicalize(transformc))
		z = -1  # 825
		if c1 != c2 or e == z:
			precision = len(set(c1) & set(c2)) / len(set(c1))
			recall = len(set(c1) & set(c2)) / len(set(c2))
			if precision != 1.0 or recall != 1.0 or d == z:
				print(d, ' '.join(':'.join((str(n), a.encode('unicode-escape')))
						for n, a in enumerate(b)))
				print('no match', precision, recall)
				print(len(c1), len(c2), 'gold-transformed', set(c2) - set(c1),
						'transformed-gold', set(c1) - set(c2))
				print(a)
				print(transformc)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		e += 1
	print('matches', correct, '/', e, 100 * correct / e, '%')
	print('exact', exact)
def testtransforms():
	"""Test whether the Tiger transformations (transform / reversetransform)
	are reversible."""
	from discodop.treetransforms import canonicalize
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.parsed_sents().values(),
				nn.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	for a, b, c in islice(zip(n.parsed_sents().values(),
			trees, n.sents().values()), 100):
		transformb = reversetransform(b.copy(True), transformations)
		b1 = bracketings(canonicalize(a))
		b2 = bracketings(canonicalize(transformb))
		z = -1  # 825
		if b1 != b2 or d == z:
			precision = len(set(b1) & set(b2)) / len(set(b1))
			recall = len(set(b1) & set(b2)) / len(set(b2))
			if precision != 1.0 or recall != 1.0 or d == z:
				print(d, ' '.join(':'.join((str(n), a.encode('unicode-escape')))
						for n, a in enumerate(c)))
				print('no match', precision, recall)
				print(len(b1), len(b2), 'gold-transformed', set(b2) - set(b1),
						'transformed-gold', set(b1) - set(b2))
				print(a)
				print(transformb)
				handlefunctions('add', a)
				print(a)
				print(b)
				print()
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	print('matches', correct, '/', d, 100 * correct / d, '%')
	print('exact', exact)
def test_transforms():
	"""Test reversibility of Tiger transformations."""
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(), nn.sents().values())]
	print('\ntransformed')
	correct = exact = e = 0
	for a, b, c, d in islice(zip(
			n.trees().values(), n.sents().values(), trees, count()), 100):
		transformc = reversetransform(c.copy(True), transformations)
		c1 = bracketings(canonicalize(a))
		c2 = bracketings(canonicalize(transformc))
		z = -1  # 825
		if c1 != c2 or e == z:
			precision = len(set(c1) & set(c2)) / len(set(c1))
			recall = len(set(c1) & set(c2)) / len(set(c2))
			if precision != 1.0 or recall != 1.0 or d == z:
				print(d, ' '.join(':'.join((str(n), a.encode('unicode-escape')))
						for n, a in enumerate(b)))
				print('no match', precision, recall)
				print(len(c1), len(c2), 'gold-transformed', set(c2) - set(c1),
						'transformed-gold', set(c1) - set(c2))
				print(a)
				print(transformc)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		e += 1
	print('matches', correct, '/', e, 100 * correct / e, '%')
	print('exact', exact)
def test_transform(self):
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader
	n = NegraCorpusReader('alpinosample.export')
	for transformations in (
			('FUNC-NODE', ),
			('MORPH-NODE', ),
			('LEMMA-NODE', ),
			('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')):
		nn = NegraCorpusReader('alpinosample.export')
		trees = [transform(tree, sent, transformations)
				for tree, sent in zip(nn.trees().values(),
					nn.sents().values())]
		for a, b in islice(zip(n.trees().values(), trees), 100):
			before = bracketings(canonicalize(a))
			transformb = reversetransform(b.copy(True), transformations)
			after = bracketings(canonicalize(transformb))
			assert before == after, (
					'mismatch with %r\nbefore: %r\nafter: %r' % (
					transformations, before, after))
def parse(args, stdinput):
	"""Parse a given sentence after inducing a grammar from a given corpus.

	Parameters
	----------
	args : list(str)
		The list of arguments: corpus file, sentence.
	stdinput : list(str)
		The pruning policy which should be used for the parsing process.
	"""
	# assign the parameter values
	corpus = TigerXMLCorpusReader(args[0], encoding='utf8')
	sent = args[1]
	# create grammar and gold trees
	trees = [ImmutableTree.convert(canonicalize(t))
			for t in list(corpus.trees().values())]
	sentences = list(corpus.sents().values())
	grammar = Grammar(trees, sentences)
	goldtrees = [t for s, t in zip(sentences, trees) if ' '.join(s) == sent]
	# create initial pruning policy
	pp = deserialize(stdinput, FEATURES, grammar) if stdinput \
			else PruningPolicy()
	# create derivation tree
	parser = Parser(grammar)
	derivationgraph = parser.parse(sent, pp)
	derivationtree = derivationgraph.get_tree()
	stdout.flush()
	# print results
	if isinstance(derivationtree, Tree):
		# print graphical representation if the sentence could be parsed
		print(derivationtree.pprint())
		drawtree = DrawTree(derivationtree, sent.split())
		print("\n derivation tree: \n" + drawtree.text())
	else:
		# otherwise print an error message
		print(derivationtree)
	if len(goldtrees) > 0:
		# print graphical representation if there is a gold tree
		drawgold = DrawTree(goldtrees[0], sent.split())
		print("\n gold tree: \n" + drawgold.text())
		# print recall if both trees are available
		if isinstance(derivationtree, Tree):
			print("\n recall: %f" % accuracy(derivationtree, goldtrees[0]))
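# A minimal usage sketch for parse() above, assuming this module is run as a
# script; the corpus file name and the sentence are hypothetical, and an
# optional serialized pruning policy is piped in on stdin.
if __name__ == '__main__':
	from sys import argv, stdin
	# e.g.: python parse.py corpus.tiger.xml "Das ist ein Satz ." < policy.txt
	parse(argv[1:3], stdin.readlines() if not stdin.isatty() else [])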
def dobinarization(trees, sents, binarization, relationalrealizational):
	"""Apply binarization."""
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebank.treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	msg = 'binarization: %s' % binarization.method
	if binarization.fanout_marks_before_bin:
		trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	if binarization.method is None:
		pass
	elif binarization.method == 'default':
		msg += ' %s h=%d v=%d %s' % (
				binarization.factor, binarization.h, binarization.v,
				'tailmarker' if binarization.tailmarker else '')
		for a in trees:
			treetransforms.binarize(a, factor=binarization.factor,
					tailmarker=binarization.tailmarker,
					horzmarkov=binarization.h,
					vertmarkov=binarization.v,
					leftmostunary=binarization.leftmostunary,
					rightmostunary=binarization.rightmostunary,
					reverse=binarization.revmarkov,
					headidx=-1 if binarization.markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else (),
					labelfun=binarization.labelfun)
	elif binarization.method == 'optimal':
		trees = [Tree.convert(treetransforms.optimalbinarize(tree))
				for n, tree in enumerate(trees)]
	elif binarization.method == 'optimalhead':
		msg += ' h=%d v=%d' % (binarization.h, binarization.v)
		trees = [Tree.convert(treetransforms.optimalbinarize(
				tree, headdriven=True, h=binarization.h, v=binarization.v))
				for n, tree in enumerate(trees)]
	trees = [treetransforms.addfanoutmarkers(t) for t in trees]
	logging.info('%s; cpu time elapsed: %gs', msg, time.clock() - begin)
	trees = [treetransforms.canonicalize(a).freeze() for a in trees]
	return trees
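# A minimal sketch of the settings object that dobinarization() reads; the
# attribute names are exactly the ones used above, but the values shown here
# are hypothetical, not discodop's shipped defaults. `trees` and `sents` are
# assumed to come from a treebank reader as in the other snippets.
from types import SimpleNamespace

example_binarization = SimpleNamespace(
		method='default', factor='right', h=1, v=1, tailmarker='',
		leftmostunary=True, rightmostunary=True, revmarkov=False,
		markhead=False, labelfun=None, fanout_marks_before_bin=False)
# binarized = dobinarization(trees, sents, example_binarization,
# 		relationalrealizational=None)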
def replacesubtree():
	"""Re-parse the span under a node and splice the n-th parse back in."""
	n = int(request.args.get('n', 0))
	sentno = int(request.args.get('sentno'))  # 1-indexed
	sent = SENTENCES[QUEUE[sentno - 1][0]]
	senttok, _ = worker.postokenize(sent)
	username = session['username']
	treestr = request.args.get('tree', '')
	try:
		tree, _sent1 = validate(treestr, senttok)
	except ValueError as err:
		return str(err)
	error = ''
	dt = DrawTree(tree, senttok)
	_treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
	nodeid = int(nodeid)
	subseq = sorted(dt.nodes[nodeid].leaves())
	subsent = ' '.join(senttok[n] for n in subseq)
	root = dt.nodes[nodeid].label
	resp = WORKERS[username].submit(
			worker.getparses, subsent, (), (), root=root).result()
	_senttok, parsetrees, _messages, _elapsed = resp
	newsubtree = parsetrees[n - 1][1]
	pos = sorted(list(newsubtree.subtrees(lambda n: isinstance(n[0], int))),
			key=lambda n: n[0])
	for n, a in enumerate(pos):
		a[0] = subseq[n]
	dt.nodes[nodeid][:] = newsubtree[:]
	tree = canonicalize(dt.nodes[0])
	dt = DrawTree(tree, senttok)  # kludge..
	treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
	session['actions'][REPARSE] += 1
	session.modified = True
	link = ('<a href="/annotate/accept?%s">accept this tree</a>'
			% urlencode(dict(sentno=sentno, tree=treestr)))
	return Markup('%s\n\n%s%s\t%s' % (
			link, error,
			dt.text(unicodelines=True, html=True, funcsep='-',
				morphsep='/', nodeprops='t0'),
			treestr))
def train(args, stdinput):
	"""Train a given pruning policy.

	Parameters
	----------
	args : list(str)
		List of arguments: corpus file, number of iterations, weight.
	stdinput : list(str)
		The initial pruning policy as std input.
	"""
	# assign the parameter values
	pp = PruningPolicy(stdinput)
	corpus = TigerXMLCorpusReader(args[0], encoding='utf8')
	iterations = int(args[1]) if len(args) >= 2 and args[1] else 1
	weight = float(args[2]) if len(args) >= 3 and args[2] else 1
	slength = int(args[3]) if len(args) >= 4 and args[3] else maxsize
	nsents = int(args[4]) if len(args) >= 5 and args[4] else maxsize
	feats = str(args[5]).split('-') if len(args) >= 6 else None
	# create grammar and corpus
	trees = [ImmutableTree.convert(canonicalize(t))
			for t in list(corpus.trees().values())]
	sentences = list(corpus.sents().values())
	grammar = Grammar(trees, sentences)
	simplecorpus = [(s, _t) for s, _t in list(zip(sentences, trees))
			if len(s) <= slength]
	if nsents:
		simplecorpus = simplecorpus[:nsents]
	# print the trained pruning policy into the console
	newpp = lols(grammar, simplecorpus, pp, iterations, weight, feats)
	stdout.write(newpp.serialize())
def main(): """Command line interface to create grammars from treebanks.""" from getopt import gnu_getopt, GetoptError from discodop.treetransforms import addfanoutmarkers, canonicalize logging.basicConfig(level=logging.DEBUG, format='%(message)s') shortoptions = 's:' options = 'gzip packed bitpar inputfmt= inputenc= dopestimator= numproc=' try: opts, args = gnu_getopt(sys.argv[1:], shortoptions, options.split()) model = args[0] if model not in ('info', 'merge'): treebankfile, grammarfile = args[1:] except (GetoptError, IndexError, ValueError) as err: print('error: %r\n%s' % (err, USAGE)) sys.exit(2) opts = dict(opts) if model not in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop', 'ptsg', 'param', 'info', 'merge'): raise ValueError('unrecognized model: %r' % model) if opts.get('dopestimator', 'rfe') not in ('rfe', 'ewe', 'shortest'): raise ValueError('unrecognized estimator: %r' % opts['dopestimator']) if model == 'info': grammarstats(args[1]) return elif model == 'merge': if len(args) < 5: raise ValueError('need at least 2 input and 1 output arguments.') if args[1] == 'rules': merge(args[2:-1], args[-1], sumrules, stripweight) elif args[1] == 'lexicon': merge(args[2:-1], args[-1], sumlex, lambda x: x.split(None, 1)[0]) elif args[1] == 'fragments': merge(args[2:-1], args[-1], sumfrags, lambda x: x.rsplit('\t', 1)[0]) return elif model == 'param': import os from discodop.runexp import readparam, loadtraincorpus, getposmodel from discodop.parser import DictObj if opts: raise ValueError('all options should be set in parameter file.') prm = DictObj(readparam(args[1])) resultdir = args[2] if os.path.exists(resultdir): raise ValueError('Directory %r already exists.\n' % resultdir) os.mkdir(resultdir) trees, sents, train_tagged_sents = loadtraincorpus( prm.corpusfmt, prm.traincorpus, prm.binarization, prm.punct, prm.functions, prm.morphology, prm.removeempty, prm.ensureroot, prm.transformations, prm.relationalrealizational) simplelexsmooth = False if prm.postagging and prm.postagging.method == 'unknownword': sents, lexmodel = getposmodel(prm.postagging, train_tagged_sents) simplelexsmooth = prm.postagging.simplelexsmooth elif model == 'ptsg': # read fragments splittedlines = (line.split('\t') for line in io.open(treebankfile, encoding=opts.get('--inputenc', 'utf8'))) fragments = {(fields[0] if len(fields) == 2 else (fields[0], [a or None for a in fields[1].split(' ')])): convertweight(fields[-1]) for fields in splittedlines} else: # read treebank corpus = READERS[opts.get('--inputfmt', 'export')]( treebankfile, encoding=opts.get('--inputenc', 'utf8')) trees = list(corpus.trees().values()) sents = list(corpus.sents().values()) if not trees: raise ValueError('no trees; is --inputfmt correct?') for a in trees: canonicalize(a) addfanoutmarkers(a) # read off grammar if model in ('pcfg', 'plcfrs'): grammar = treebankgrammar(trees, sents) elif model == 'dopreduction': grammar, altweights = dopreduction(trees, sents, packedgraph='--packed' in opts) elif model == 'doubledop': grammar, backtransform, altweights, _ = doubledop(trees, sents, numproc=int(opts.get('--numproc', 1)), binarized='--bitpar' not in opts) elif model == 'ptsg': grammar, backtransform, altweights = compiletsg(fragments, binarized='--bitpar' not in opts) elif model == 'param': from discodop.runexp import dobinarization, getgrammars getgrammars(dobinarization(trees, sents, prm.binarization, prm.relationalrealizational), sents, prm.stages, prm.testcorpus.maxwords, resultdir, prm.numproc, lexmodel, simplelexsmooth, trees[0].label) 
open(os.path.join(resultdir, 'params.prm'), "w").write( "top='%s',\n%s" % (trees[0].label, open(args[1]).read())) return # grammars have already been written if opts.get('--dopestimator', 'rfe') != 'rfe': grammar = [(rule, w) for (rule, _), w in zip(grammar, altweights[opts['--dopestimator']])] rulesname = grammarfile + '.rules' lexiconname = grammarfile + '.lex' myopen = open if '--gzip' in opts: myopen = gzip.open rulesname += '.gz' lexiconname += '.gz' bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket' if model == 'ptsg': bitpar = not isinstance(next(iter(fragments)), tuple) if '--bitpar' in opts and not bitpar: raise ValueError('parsing with an unbinarized grammar requires ' 'a grammar in bitpar format.') rules, lexicon = write_lcfrs_grammar(grammar, bitpar=bitpar) # write output with myopen(rulesname, 'w') as rulesfile: rulesfile.write(rules) with codecs.getwriter('utf-8')(myopen(lexiconname, 'w')) as lexiconfile: lexiconfile.write(lexicon) if model in ('doubledop', 'ptsg'): backtransformfile = '%s.backtransform%s' % (grammarfile, '.gz' if '--gzip' in opts else '') myopen(backtransformfile, 'w').writelines( '%s\n' % a for a in backtransform) print('wrote backtransform to', backtransformfile) print('wrote grammar to %s and %s.' % (rulesname, lexiconname)) if len(grammar) < 10000: # this is very slow so skip with large grammars print(grammarinfo(grammar)) try: from discodop.containers import Grammar print(Grammar(rules, lexicon, bitpar=bitpar, binarized='--bitpar' not in opts, start=opts.get('-s', next(iter(grammar))[0][0][0] if model == 'ptsg' else trees[0].label)).testgrammar()[1]) except (ImportError, AssertionError) as err: print(err)
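# Hypothetical command lines for the interface above; the exact entry point
# depends on how the module is installed, the flags and model names are the
# ones parsed by main(), and the file names are made up:
#
#   python -m discodop.grammar pcfg treebank.export /tmp/out
#   python -m discodop.grammar doubledop --inputfmt=export --gzip \
#           treebank.export /tmp/out
#   python -m discodop.grammar info /tmp/out.rules.gz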
def reattach():
	"""Re-draw tree after re-attaching node under new parent."""
	sentno = int(request.args.get('sentno'))  # 1-indexed
	sent = SENTENCES[QUEUE[sentno - 1][0]]
	senttok, _ = worker.postokenize(sent)
	treestr = request.args.get('tree', '')
	try:
		tree, _sent1 = validate(treestr, senttok)
	except ValueError as err:
		return str(err)
	dt = DrawTree(tree, senttok)
	error = ''
	if request.args.get('newparent') == 'deletenode':
		# remove nodeid by replacing it with its children
		_treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
		nodeid = int(nodeid)
		x = dt.nodes[nodeid]
		if nodeid == 0 or isinstance(x[0], int):
			error = 'ERROR: cannot remove ROOT or POS node'
		else:
			children = list(x)
			x[:] = []
			for y in dt.nodes[0].subtrees():
				if any(child is x for child in y):
					i = y.index(x)
					y[i:i + 1] = children
					tree = canonicalize(dt.nodes[0])
					dt = DrawTree(tree, senttok)  # kludge..
					break
	elif request.args.get('nodeid', '').startswith('newlabel_'):
		# splice in a new node under parentid
		_treeid, newparent = request.args.get(
				'newparent', '').lstrip('t').split('_')
		newparent = int(newparent)
		label = request.args.get('nodeid').split('_', 1)[1]
		y = dt.nodes[newparent]
		if isinstance(y[0], int):
			error = 'ERROR: cannot add node under POS tag'
		else:
			children = list(y)
			y[:] = []
			y[:] = [Tree(label, children)]
			tree = canonicalize(dt.nodes[0])
			dt = DrawTree(tree, senttok)  # kludge..
	else:  # re-attach existing node at existing new parent
		_treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
		nodeid = int(nodeid)
		_treeid, newparent = request.args.get(
				'newparent', '').lstrip('t').split('_')
		newparent = int(newparent)
		# remove node from old parent
		# dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
		x = dt.nodes[nodeid]
		y = dt.nodes[newparent]
		for node in x.subtrees():
			if node is y:
				error = ('ERROR: cannot re-attach subtree'
						' under (descendant of) itself\n')
				break
		else:
			for node in dt.nodes[0].subtrees():
				if any(child is x for child in node):
					if len(node) > 1:
						node.remove(x)
						dt.nodes[newparent].append(x)
						tree = canonicalize(dt.nodes[0])
						dt = DrawTree(tree, senttok)  # kludge..
					else:
						error = ('ERROR: re-attaching only child creates'
								' empty node %s; remove manually\n' % node)
					break
	treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
	link = ('<a href="/annotate/accept?%s">accept this tree</a>'
			% urlencode(dict(sentno=sentno, tree=treestr)))
	if error == '':
		session['actions'][REATTACH] += 1
		session.modified = True
	return Markup('%s\n\n%s%s\t%s' % (
			link, error,
			dt.text(unicodelines=True, html=True, funcsep='-',
				morphsep='/', nodeprops='t0'),
			treestr))
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor,
		tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead,
		fanout_marks_before_bin, testmaxwords, resultdir, numproc, lexmodel,
		simplelexsmooth, top, relationalrealizational):
	"""Apply binarization and read off the requested grammars."""
	# fixme: this n should correspond to sentence id
	tbfanout, n = treebankfanout(trees)
	logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
			tbfanout, n, trees[n], ' '.join(sents[n]))
	# binarization
	begin = time.clock()
	if fanout_marks_before_bin:
		trees = [addfanoutmarkers(t) for t in trees]
	if bintype == 'binarize':
		bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov,
				'tailmarker' if tailmarker else '')
		for a in trees:
			binarize(a, factor=factor, tailmarker=tailmarker,
					horzmarkov=horzmarkov, vertmarkov=vertmarkov,
					leftmostunary=leftmostunary,
					rightmostunary=rightmostunary,
					reverse=revmarkov, pospa=pospa,
					headidx=-1 if markhead else None,
					filterfuncs=(relationalrealizational['ignorefunctions']
						+ (relationalrealizational['adjunctionlabel'], ))
						if relationalrealizational else ())
	elif bintype == 'optimal':
		trees = [Tree.convert(optimalbinarize(tree))
				for n, tree in enumerate(trees)]
	elif bintype == 'optimalhead':
		trees = [Tree.convert(optimalbinarize(tree, headdriven=True,
				h=horzmarkov, v=vertmarkov)) for n, tree in enumerate(trees)]
	trees = [addfanoutmarkers(t) for t in trees]
	logging.info('binarized %s cpu time elapsed: %gs',
			bintype, time.clock() - begin)
	logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees))
	trees = [canonicalize(a).freeze() for a in trees]

	for n, stage in enumerate(stages):
		if stage.split:
			traintrees = [binarize(splitdiscnodes(Tree.convert(a),
					stage.markorigin), childchar=':').freeze() for a in trees]
			logging.info('splitted discontinuous nodes')
		else:
			traintrees = trees
		if stage.mode.startswith('pcfg'):
			assert tbfanout == 1 or stage.split
		backtransform = None
		if stage.dop:
			if stage.usedoubledop:
				# find recurring fragments in treebank,
				# as well as depth 1 'cover' fragments
				fragments = getfragments(traintrees, sents, numproc,
						iterate=stage.iterate, complement=stage.complement)
				xgrammar, backtransform, altweights = doubledop(
						traintrees, fragments)
			else:  # DOP reduction
				xgrammar, altweights = dopreduction(
						traintrees, sents, packedgraph=stage.packedgraph)
			nodes = sum(len(list(a.subtrees())) for a in traintrees)
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
				for weights in altweights.values():
					weights.extend(w for _, w in newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			msg = grammarinfo(xgrammar)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			for name in altweights:
				grammar.register(u'%s' % name, altweights[name])
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			logging.info('DOP model based on %d sentences, %d nodes, '
					'%d nonterminals', len(traintrees), nodes,
					len(grammar.toid))
			logging.info(msg)
			if stage.estimator != 'dop1':
				grammar.switch(u'%s' % stage.estimator)
			_sumsto1 = grammar.testgrammar()
			if stage.usedoubledop:
				# backtransform keys are line numbers to rules file;
				# to see them together do:
				# $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
				with codecs.getwriter('ascii')(gzip.open(
						'%s/%s.backtransform.gz' % (resultdir, stage.name),
						'w')) as out:
					out.writelines('%s\n' % a for a in backtransform)
				if n and stage.prune:
					msg = grammar.getmapping(stages[n - 1].grammar,
							striplabelre=None if stages[n - 1].dop
								else re.compile(b'@.+$'),
							neverblockre=re.compile(b'.+}<'),
							splitprune=stage.splitprune
								and stages[n - 1].split,
							markorigin=stages[n - 1].markorigin)
				else:
					# recoverfragments() relies on this mapping to identify
					# binarization nodes
					msg = grammar.getmapping(None,
							striplabelre=None,
							neverblockre=re.compile(b'.+}<'),
							splitprune=False, markorigin=False)
				logging.info(msg)
			elif n and stage.prune:  # dop reduction
				msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None if stages[n - 1].dop
							and not stages[n - 1].usedoubledop
							else re.compile(b'@[-0-9]+$'),
						neverblockre=re.compile(stage.neverblockre)
							if stage.neverblockre else None,
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				if stage.mode == 'dop-rerank':
					grammar.getrulemapping(stages[n - 1].grammar)
				logging.info(msg)
			# write prob models
			np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name),
					**{name: mod for name, mod
						in zip(grammar.modelnames, grammar.models)})
		else:  # not stage.dop
			xgrammar = treebankgrammar(traintrees, sents)
			logging.info('induced %s based on %d sentences',
					('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
					len(traintrees))
			if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
				logging.info(grammarinfo(xgrammar))
			else:
				logging.info(grammarinfo(xgrammar,
						dump='%s/pcdist.txt' % resultdir))
			if lexmodel and simplelexsmooth:
				newrules = simplesmoothlexicon(lexmodel)
				xgrammar.extend(newrules)
			elif lexmodel:
				xgrammar = smoothlexicon(xgrammar, lexmodel)
			rules, lexicon = write_lcfrs_grammar(
					xgrammar, bitpar=stage.mode.startswith('pcfg'))
			grammar = Grammar(rules, lexicon, start=top,
					bitpar=stage.mode.startswith('pcfg'))
			with gzip.open('%s/%s.rules.gz' % (
					resultdir, stage.name), 'wb') as rulesfile:
				rulesfile.write(rules)
			with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
					resultdir, stage.name), 'wb')) as lexiconfile:
				lexiconfile.write(lexicon)
			_sumsto1 = grammar.testgrammar()
			if n and stage.prune:
				msg = grammar.getmapping(stages[n - 1].grammar,
						striplabelre=None,
						neverblockre=re.compile(stage.neverblockre)
							if stage.neverblockre else None,
						splitprune=stage.splitprune and stages[n - 1].split,
						markorigin=stages[n - 1].markorigin)
				logging.info(msg)
		logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir,
				stage.name, ',backtransform' if stage.usedoubledop else '')

		outside = None
		if stage.getestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			logging.info('computing PCFG estimates')
			begin = time.clock()
			outside = getpcfgestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('pcfgoutside.npz', outside=outside)
			logging.info('saved PCFG estimates')
		elif stage.useestimates == 'SX':
			assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.'
			assert stage.mode != 'pcfg', (
					'estimates require agenda-based parser.')
			outside = np.load('pcfgoutside.npz')['outside']
			logging.info('loaded PCFG estimates')
		if stage.getestimates == 'SXlrgaps':
			logging.info('computing PLCFRS estimates')
			begin = time.clock()
			outside = getestimates(grammar, testmaxwords,
					grammar.toid[trees[0].label])
			logging.info('estimates done. cpu time elapsed: %gs',
					time.clock() - begin)
			np.savez('outside.npz', outside=outside)
			logging.info('saved estimates')
		elif stage.useestimates == 'SXlrgaps':
			outside = np.load('outside.npz')['outside']
			logging.info('loaded PLCFRS estimates')
		stage.update(grammar=grammar, backtransform=backtransform,
				outside=outside)
def main(): """ Command line interface to create grammars from treebanks. """ import gzip from getopt import gnu_getopt, GetoptError from discodop.treetransforms import addfanoutmarkers, canonicalize from discodop.treebank import getreader, splitpath from discodop.fragments import getfragments logging.basicConfig(level=logging.DEBUG, format='%(message)s') shortoptions = '' flags = ('gzip', 'packed') options = ('inputfmt=', 'inputenc=', 'dopestimator=', 'numproc=') try: opts, args = gnu_getopt(sys.argv[1:], shortoptions, flags + options) model, treebankfile, grammarfile = args except (GetoptError, ValueError) as err: print('error: %r\n%s' % (err, USAGE)) sys.exit(2) opts = dict(opts) assert model in ('pcfg', 'plcfrs', 'dopreduction', 'doubledop'), ( 'unrecognized model: %r' % model) assert opts.get('dopestimator', 'dop1') in ('dop1', 'ewe', 'shortest'), ( 'unrecognized estimator: %r' % opts['dopestimator']) # read treebank reader = getreader(opts.get('--inputfmt', 'export')) corpus = reader(*splitpath(treebankfile), encoding=opts.get('--inputenc', 'utf8')) trees = list(corpus.parsed_sents().values()) sents = list(corpus.sents().values()) for a in trees: canonicalize(a) addfanoutmarkers(a) # read off grammar if model in ('pcfg', 'plcfrs'): grammar = treebankgrammar(trees, sents) elif model == 'dopreduction': grammar, altweights = dopreduction(trees, sents, packedgraph='--packed' in opts) elif model == 'doubledop': numproc = int(opts.get('--numproc', 1)) fragments = getfragments(trees, sents, numproc) grammar, backtransform, altweights = doubledop(trees, fragments) if opts.get('--dopestimator', 'dop1') == 'ewe': grammar = [(rule, w) for (rule, _), w in zip(grammar, altweights['ewe'])] elif opts.get('--dopestimator', 'dop1') == 'shortest': grammar = [(rule, w) for (rule, _), w in zip(grammar, altweights['shortest'])] print(grammarinfo(grammar)) rules = grammarfile + '.rules' lexicon = grammarfile + '.lex' if '--gzip' in opts: myopen = gzip.open rules += '.gz' lexicon += '.gz' else: myopen = open bitpar = model == 'pcfg' or opts.get('--inputfmt') == 'bracket' rules, lexicon = write_lcfrs_grammar(grammar, bitpar=bitpar) try: from discodop.containers import Grammar except ImportError: pass else: cgrammar = Grammar(rules, lexicon) cgrammar.testgrammar() # write output with myopen(rules, 'w') as rulesfile: rulesfile.write(rules) with codecs.getwriter('utf-8')(myopen(lexicon, 'w')) as lexiconfile: lexiconfile.write(lexicon) if model == 'doubledop': backtransformfile = '%s.backtransform%s' % (grammarfile, '.gz' if '--gzip' in opts else '') myopen(backtransformfile, 'w').writelines( '%s\n' % a for a in backtransform) print('wrote backtransform to', backtransformfile) print('wrote grammar to %s and %s.' % (rules, lexicon))