def loadtraincorpus(corpusfmt, traincorpus, binarization, punct, functions,
		morphology, removeempty, ensureroot, transformations,
		relationalrealizational):
	"""Load the training corpus."""
	train = treebank.READERS[corpusfmt](traincorpus.path,
			encoding=traincorpus.encoding,
			headrules=binarization.headrules,
			headfinal=True, headreverse=False,
			removeempty=removeempty, ensureroot=ensureroot,
			punct=punct, functions=functions, morphology=morphology)
	if isinstance(traincorpus.numsents, float):
		traincorpus.numsents = int(traincorpus.numsents * len(train.sents()))
	traintrees = train.itertrees(None, traincorpus.numsents)
	trees, sents = zip(*[treesent for _, treesent in traintrees
			if 1 <= len(treesent[1]) <= traincorpus.maxwords])
	logging.info('%d training sentences after length restriction <= %d',
			len(trees), traincorpus.maxwords)
	if not trees:
		raise ValueError('training corpus (selection) should be non-empty.')
	if transformations:
		trees = [treebanktransforms.transform(tree, sent, transformations)
				for tree, sent in zip(trees, sents)]
	if relationalrealizational:
		trees = [treebanktransforms.rrtransform(
				tree, **relationalrealizational)[0] for tree in trees]
	train_tagged_sents = [[(word, tag) for word, (_, tag)
			in zip(sent, sorted(tree.pos()))]
			for tree, sent in zip(trees, sents)]
	return trees, sents, train_tagged_sents
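# Hedged usage sketch for loadtraincorpus(): the function only requires that
# `traincorpus` and `binarization` be attribute bags exposing the fields read
# above (path, encoding, numsents, maxwords; headrules). SimpleNamespace is
# used here purely for illustration and stands in for whatever parameter
# object the surrounding code uses; the file name is the sample corpus
# shipped with disco-dop.
def _loadtraincorpus_demo():
	from types import SimpleNamespace
	traincorpus = SimpleNamespace(path='alpinosample.export',
			encoding='utf8', numsents=2, maxwords=40)
	binarization = SimpleNamespace(headrules=None)
	trees, sents, tagged = loadtraincorpus(
			'export', traincorpus, binarization, punct=None, functions=None,
			morphology=None, removeempty=False, ensureroot=None,
			transformations=None, relationalrealizational=None)
	print('%d trees; first tagged sentence: %r' % (len(trees), tagged[0]))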
def test_transforms():
	"""Test reversibility of Tiger transformations."""
	from itertools import islice, count
	from discodop.treetransforms import canonicalize
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations)
			for tree, sent in zip(nn.trees().values(), nn.sents().values())]
	print('\ntransformed')
	correct = exact = e = 0
	for a, b, c, d in islice(zip(
			n.trees().values(), n.sents().values(), trees, count()), 100):
		transformc = reversetransform(c.copy(True), b, transformations)
		c1 = bracketings(canonicalize(a))
		c2 = bracketings(canonicalize(transformc))
		z = -1  # 825; set to a sentence index to force the debug printout
		if c1 != c2 or e == z:
			precision = len(set(c1) & set(c2)) / len(set(c1))
			recall = len(set(c1) & set(c2)) / len(set(c2))
			if precision != 1.0 or recall != 1.0 or d == z:
				print(d, ' '.join(':'.join((str(n),
						a.encode('unicode-escape').decode('ascii')))
						for n, a in enumerate(b)))
				print('no match', precision, recall)
				print(len(c1), len(c2),
						'gold-transformed', set(c2) - set(c1),
						'transformed-gold', set(c1) - set(c2))
				print(a)
				print(transformc)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		e += 1
	print('matches', correct, '/', e, 100 * correct / e, '%')
	print('exact', exact)
def test_transforms():
	"""Test reversibility of Tiger transformations."""
	from itertools import islice
	from discodop.treetransforms import canonicalize
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader, handlefunctions
	headrules = None  # 'alpino.headrules'
	n = NegraCorpusReader('alpinosample.export', headrules=headrules)
	nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
	transformations = ('S-RC', 'VP-GF', 'NP')
	trees = [transform(tree, sent, transformations) for tree, sent
			in zip(nn.trees().values(), nn.sents().values())]
	print('\ntransformed')
	correct = exact = d = 0
	for a, b, c in islice(zip(n.trees().values(), trees,
			n.sents().values()), 100):
		transformb = reversetransform(b.copy(True), transformations)
		b1 = bracketings(canonicalize(a))
		b2 = bracketings(canonicalize(transformb))
		z = -1  # 825; set to a sentence index to force the debug printout
		if b1 != b2 or d == z:
			precision = len(set(b1) & set(b2)) / len(set(b1))
			recall = len(set(b1) & set(b2)) / len(set(b2))
			if precision != 1.0 or recall != 1.0 or d == z:
				print(d, ' '.join(':'.join((str(n),
						a.encode('unicode-escape').decode('ascii')))
						for n, a in enumerate(c)))
				print('no match', precision, recall)
				print(len(b1), len(b2),
						'gold-transformed', set(b2) - set(b1),
						'transformed-gold', set(b1) - set(b2))
				print(a)
				print(transformb)
				handlefunctions('add', a)
				print(a, '\n', b, '\n\n')
			else:
				correct += 1
		else:
			exact += 1
			correct += 1
		d += 1
	print('matches', correct, '/', d, 100 * correct / d, '%')
	print('exact', exact)
def test_transform(self):
	"""Test reversibility of FUNC/MORPH/LEMMA-NODE transformations."""
	from itertools import islice
	from discodop.treetransforms import canonicalize
	from discodop.treebanktransforms import transform, reversetransform, \
			bracketings
	from discodop.treebank import NegraCorpusReader
	n = NegraCorpusReader('alpinosample.export')
	for transformations in (
			('FUNC-NODE', ),
			('MORPH-NODE', ),
			('LEMMA-NODE', ),
			('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE')):
		nn = NegraCorpusReader('alpinosample.export')
		trees = [transform(tree, sent, transformations)
				for tree, sent in zip(nn.trees().values(),
					nn.sents().values())]
		for a, b in islice(zip(n.trees().values(), trees), 100):
			before = bracketings(canonicalize(a))
			transformb = reversetransform(b.copy(True), transformations)
			after = bracketings(canonicalize(transformb))
			assert before == after, (
					'mismatch with %r\nbefore: %r\nafter: %r' % (
					transformations, before, after))
def main():
	"""Command line interface for applying tree(bank) transforms."""
	import io
	from getopt import gnu_getopt, GetoptError
	from discodop import treebanktransforms
	actions = {'none': None, 'introducepreterminals': introducepreterminals,
			'splitdisc': None, 'mergedisc': mergediscnodes,
			'transform': None, 'unbinarize': unbinarize,
			'binarize': None, 'optimalbinarize': None}
	flags = ('markorigin markheads leftunary rightunary tailmarker '
			'renumber reverse').split()
	options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= '
			'punct= headrules= functions= morphology= lemmas= factor= '
			'markorigin= maxlen= fmt= enc= transforms=').split()
	try:
		opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options)
		if not 1 <= len(args) <= 3:
			raise GetoptError('expected 1, 2, or 3 positional arguments')
	except GetoptError as err:
		print('error: %r\n%s' % (err, USAGE), file=sys.stderr)
		sys.exit(2)
	opts, action = dict(opts), args[0]
	if action not in actions:
		print('unrecognized action: %r\navailable actions: %s' % (
				action, ', '.join(actions)), file=sys.stderr)
		sys.exit(2)
	if '--fmt' in opts:
		opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt']
	if '--enc' in opts:
		opts['--inputenc'] = opts['--outputenc'] = opts['--enc']
	if opts.get('--outputfmt', WRITERS[0]) not in WRITERS:
		print('unrecognized output format: %r\navailable formats: %s' % (
				opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr)
		sys.exit(2)
	infilename = (args[1] if len(args) >= 2 and args[1] != '-'
			else '/dev/stdin')
	outfilename = (args[2] if len(args) == 3 and args[2] != '-'
			else '/dev/stdout')

	# open corpus
	corpus = READERS[opts.get('--inputfmt', 'export')](
			infilename,
			encoding=opts.get('--inputenc', 'utf-8'),
			headrules=opts.get('--headrules'),
			markheads='--markheads' in opts,
			ensureroot=opts.get('--ensureroot'),
			punct=opts.get('--punct'),
			functions=opts.get('--functions'),
			morphology=opts.get('--morphology'),
			lemmas=opts.get('--lemmas'))
	start, end = opts.get('--slice', ':').split(':')
	start, end = (int(start) if start else None), (int(end) if end else None)
	trees = corpus.itertrees(start, end)
	if '--maxlen' in opts:
		maxlen = int(opts['--maxlen'])
		trees = ((key, (tree, sent)) for key, (tree, sent) in trees
				if len(sent) <= maxlen)
	if '--renumber' in opts:
		trees = (('%8d' % n, treesent)
				for n, (_, treesent) in enumerate(trees, 1))

	# select transformation
	transform = actions[action]
	if action in ('binarize', 'optimalbinarize'):
		h = int(opts.get('-h', 999))
		v = int(opts.get('-v', 1))
		if action == 'binarize':
			factor = opts.get('--factor', 'right')
			transform = lambda t, _: binarize(t, factor, h, v,
					leftmostunary='--leftunary' in opts,
					rightmostunary='--rightunary' in opts,
					tailmarker='$' if '--tailmarker' in opts else '')
		elif action == 'optimalbinarize':
			headdriven = '--headrules' in opts
			transform = lambda t, _: optimalbinarize(t, '|', headdriven, h, v)
	elif action == 'splitdisc':
		transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts)
	elif action == 'unbinarize':
		transform = lambda t, _: unbinarize(Tree.convert(t))
	elif action == 'transform':
		tfs = opts['--transforms'].split(',')
		transform = lambda t, s: (treebanktransforms.reversetransform(t, tfs)
				if '--reverse' in opts
				else treebanktransforms.transform(t, s, tfs))
	if transform is not None:  # NB: transform cannot affect (no. of) terminals
		trees = ((key, (transform(tree, sent), sent))
				for key, (tree, sent) in trees)

	# read, transform, & write trees
	headrules = None
	if opts.get('--outputfmt') in ('mst', 'conll'):
		if not opts.get('--headrules'):
			raise ValueError('need head rules for dependency conversion')
		headrules = treebanktransforms.readheadrules(opts.get('--headrules'))
	cnt = 0
	if opts.get('--outputfmt') == 'dact':
		import alpinocorpus
		outfile = alpinocorpus.CorpusWriter(outfilename)
		if (action == 'none'
				and opts.get('--inputfmt') in ('alpino', 'dact')
				and set(opts) <= {'--slice', '--inputfmt', '--outputfmt',
					'--renumber'}):
			for n, (key, block) in islice(enumerate(
					corpus.blocks().items(), 1), start, end):
				outfile.write('%8d' % n if '--renumber' in opts else key,
						block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(str(key), writetree(tree, sent, key, 'alpino'))
				cnt += 1
	else:
		encoding = opts.get('--outputenc', 'utf-8')
		outfile = io.open(outfilename, 'w', encoding=encoding)
		# copy trees verbatim when only taking slice or converting encoding
		if (action == 'none'
				and opts.get('--inputfmt') == opts.get('--outputfmt')
				and set(opts) <= {'--slice', '--inputenc', '--outputenc',
					'--inputfmt', '--outputfmt'}):
			for block in islice(corpus.blocks().values(), start, end):
				outfile.write(block)
				cnt += 1
		else:
			for key, (tree, sent) in trees:
				outfile.write(writetree(tree, sent, key,
						opts.get('--outputfmt', 'export'), headrules))
				cnt += 1
	print('%sed %d trees with action %r' % ('convert' if action == 'none'
			else 'transform', cnt, action), file=sys.stderr)
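# Illustrative invocations (an assumption: main() is exposed as the
# `discodop treetransforms` subcommand; adapt to however this module's entry
# point is actually installed). Positional arguments are
# <action> [<infile> [<outfile>]], where '-' or omission means stdin/stdout:
#
#   discodop treetransforms binarize --fmt=export -h 1 -v 2 in.export out.export
#   discodop treetransforms transform --transforms=S-RC,VP-GF in.export out.export
#   discodop treetransforms none --inputfmt=export --outputfmt=bracket in.export out.txt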
def parsetepacoc(
		stages=(dict(mode='pcfg', split=True, markorigin=True),
				dict(mode='plcfrs', prune=True, k=10000, splitprune=True),
				dict(mode='plcfrs', prune=True, k=5000, dop=True,
					usedoubledop=True, estimator='dop1', objective='mpp',
					sample=False, kbest=True)),
		trainmaxwords=999, trainnumsents=25005, testmaxwords=999,
		bintype='binarize', h=1, v=1, factor='right', tailmarker='',
		markhead=False, revmarkov=False, pospa=False,
		leftmostunary=True, rightmostunary=True,
		fanout_marks_before_bin=False, transformations=None,
		usetagger='stanford', resultdir='tepacoc', numproc=1):
	"""Parse the tepacoc test set."""
	for stage in stages:
		for key in stage:
			assert key in DEFAULTSTAGE, 'unrecognized option: %r' % key
	stages = [DictObj({k: stage.get(k, v) for k, v in DEFAULTSTAGE.items()})
			for stage in stages]
	os.mkdir(resultdir)
	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	logging.basicConfig(level=logging.DEBUG, format=formatstr)
	# log up to INFO to a results log file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.INFO)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)
	tepacocids, tepacocsents = readtepacoc()
	try:
		(corpus_sents, corpus_taggedsents,
				corpus_trees, corpus_blocks) = pickle.load(
				gzip.open('tiger.pickle.gz', 'rb'))
	except IOError:  # file not found
		corpus = getreader('export')('../tiger/corpus',
				'tiger_release_aug07.export',
				headrules='negra.headrules' if bintype == 'binarize'
					else None,
				headfinal=True, headreverse=False, punct='move',
				encoding='iso-8859-1')
		corpus_sents = list(corpus.sents().values())
		corpus_taggedsents = list(corpus.tagged_sents().values())
		corpus_trees = list(corpus.parsed_sents().values())
		if transformations:
			corpus_trees = [transform(tree, sent, transformations)
					for tree, sent in zip(corpus_trees, corpus_sents)]
		corpus_blocks = list(corpus.blocks().values())
		pickle.dump((corpus_sents, corpus_taggedsents, corpus_trees,
				corpus_blocks), gzip.open('tiger.pickle.gz', 'wb'),
				protocol=-1)

	# test sets (one for each category)
	testsets = {}
	allsents = []
	for cat, catsents in tepacocsents.items():
		testset = sents, trees, goldsents, blocks = [], [], [], []
		for n, sent in catsents:
			if sent != corpus_sents[n]:
				logging.error('mismatch. sent %d:\n%r\n%r\n'
						'not in corpus %r\nnot in tepacoc %r',
						n + 1, sent, corpus_sents[n],
						[a for a, b in zip_longest(sent, corpus_sents[n])
							if a and a != b],
						[b for a, b in zip_longest(sent, corpus_sents[n])
							if b and a != b])
			elif len(corpus_sents[n]) <= testmaxwords:
				sents.append(corpus_taggedsents[n])
				trees.append(corpus_trees[n])
				goldsents.append(corpus_taggedsents[n])
				blocks.append(corpus_blocks[n])
		allsents.extend(sents)
		logging.info('category: %s, %d of %d sentences',
				cat, len(testset[0]), len(catsents))
		testsets[cat] = testset
	# list() around zip for Python 3, where zip returns an iterator
	testsets['baseline'] = list(zip(*[sent for n, sent in enumerate(zip(
				corpus_taggedsents, corpus_trees,
				corpus_taggedsents, corpus_blocks))
			if len(sent[1]) <= trainmaxwords
			and n not in tepacocids][trainnumsents:trainnumsents + 2000]))
	allsents.extend(testsets['baseline'][0])

	if usetagger:
		overridetags = ('PTKANT', 'VAIMP')
		taglex = defaultdict(set)
		for sent in corpus_taggedsents[:trainnumsents]:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
				{word for word, tags in taglex.items() if tags == {tag}}
				for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV', 'PIDAT': 'PIAT'}
		# the sentences in the list allsents are modified in-place so that
		# the relevant copy in testsets[cat][0] is updated as well.
		externaltagging(usetagger, '', allsents, overridetagdict, tagmap)

	# training set
	trees, sents, blocks = zip(*[sent for n, sent in enumerate(zip(
				corpus_trees, corpus_sents, corpus_blocks))
			if len(sent[1]) <= trainmaxwords
			and n not in tepacocids][:trainnumsents])
	getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
			revmarkov, leftmostunary, rightmostunary, pospa, markhead,
			fanout_marks_before_bin, testmaxwords, resultdir,
			numproc, None, False, trees[0].label, None)
	del corpus_sents, corpus_taggedsents, corpus_trees, corpus_blocks
	results = {}
	cnt = 0
	parser = Parser(stages, tailmarker=tailmarker,
			transformations=transformations)
	for cat, testset in sorted(testsets.items()):
		if cat == 'baseline':
			continue
		logging.info('category: %s', cat)
		begin = time.clock()
		results[cat] = doparsing(parser=parser, testset=testset,
				resultdir=resultdir, usetags=True, numproc=numproc,
				category=cat)
		cnt += len(testset[0])
		if numproc == 1:
			logging.info('time elapsed during parsing: %g',
					time.clock() - begin)
		# else:  # wall clock time here
	goldbrackets = multiset()
	totresults = [DictObj(name=stage.name) for stage in stages]
	for result in totresults:
		result.elapsedtime = [None] * cnt
		result.parsetrees = [None] * cnt
		result.brackets = multiset()
		result.exact = result.noparse = 0
	goldblocks = []
	goldsents = []
	for cat, res in results.items():
		logging.info('category: %s', cat)
		goldbrackets |= res[2]
		goldblocks.extend(res[3])
		goldsents.extend(res[4])
		for result, totresult in zip(res[0], totresults):
			totresult.exact += result.exact
			totresult.noparse += result.noparse
			totresult.brackets |= result.brackets
			totresult.elapsedtime.extend(result.elapsedtime)
		oldeval(*res)
	logging.info('TOTAL')
	oldeval(totresults, goldbrackets)
	# write TOTAL results file with all tepacoc sentences (not the baseline)
	for stage in stages:
		open('TOTAL.%s.export' % stage.name, 'w').writelines(
				open('%s.%s.export' % (cat, stage.name)).read()
				for cat in list(results) + ['gold'])
	# do baseline separately because it shouldn't count towards the total
	# score
	cat = 'baseline'
	logging.info('category: %s', cat)
	oldeval(*doparsing(parser=parser, testset=testsets[cat],
			resultdir=resultdir, usetags=True, numproc=numproc,
			category=cat))
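# Hedged example: the `stages` argument of parsetepacoc() defines a
# coarse-to-fine pipeline, and each dict is filled out with DEFAULTSTAGE. A
# two-stage variant (split-PCFG followed by a pruned PLCFRS, without the DOP
# stage) could be configured as below; all keys shown appear in the defaults
# above, while the resultdir name is made up for this sketch:
def _parsetepacoc_two_stage_demo():
	parsetepacoc(stages=(
			dict(mode='pcfg', split=True, markorigin=True),
			dict(mode='plcfrs', prune=True, k=10000, splitprune=True)),
			numproc=4, resultdir='tepacoc-twostage')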
def startexp(
		stages=(DEFAULTSTAGE, ),  # see above
		corpusfmt='export',  # choices: export, discbracket, bracket
		corpusdir='.',  # filenames may include globbing characters '*' and '?'
		traincorpus='alpinosample.export', trainencoding='utf-8',
		testcorpus='alpinosample.export', testencoding='utf-8',
		testmaxwords=40, trainmaxwords=40,
		trainnumsents=2,
		testnumsents=1,  # number of sentences to parse
		skiptrain=True,  # test set starts after training set
		# (useful when they are in the same file)
		skip=0,  # number of sentences to skip from test corpus
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices: None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		# postagging: pass None to use tags from treebank.
		postagging=None,
		relationalrealizational=None,  # do not apply RR-transform
		headrules=None,  # rules for finding heads of constituents
		bintype='binarize',  # choices: binarize, optimal, optimalhead
		factor='right',
		revmarkov=True,
		v=1,
		h=2,
		pospa=False,  # when v > 1, add parent annotation to POS tags?
		markhead=False,  # prepend head to siblings
		leftmostunary=True,  # start binarization with unary node
		rightmostunary=True,  # end binarization with unary node
		tailmarker='',  # with headrules, head is last node and can be marked
		fanout_marks_before_bin=False,
		evalparam='proper.prm',  # EVALB-style parameter file
		quiet=False, reallyquiet=False,  # quiet=no per sentence results
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs
		resultdir='results',
		rerun=False):
	"""Execute an experiment."""
	assert bintype in ('optimal', 'optimalhead', 'binarize')
	if postagging is not None:
		assert set(postagging).issubset({'method', 'model',
				'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
		if postagging['method'] == 'unknownword':
			assert postagging['model'] in ('4', '6', 'base')
			assert postagging['unknownthreshold'] >= 1
			assert postagging['openclassthreshold'] >= 0
		else:
			assert postagging['method'] in ('treetagger', 'stanford')
	if rerun:
		assert os.path.exists(resultdir), (
				'Directory %r does not exist. '
				'--rerun requires a directory '
				'with the grammar(s) of a previous experiment.' % resultdir)
	else:
		assert not os.path.exists(resultdir), (
				'Directory %r exists.\n'
				'Use --rerun to parse with existing grammar '
				'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the
	# message.
	formatstr = '%(message)s'
	if reallyquiet:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif quiet:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	else:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)
	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	# fileobj.setLevel(logging.INFO)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	corpusreader = getreader(corpusfmt)
	if not rerun:
		corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
				headrules=headrules, headfinal=True, headreverse=False,
				punct=punct, functions=functions, morphology=morphology)
		logging.info('%d sentences in training corpus %s/%s',
				len(corpus.parsed_sents()), corpusdir, traincorpus)
		if isinstance(trainnumsents, float):
			trainnumsents = int(trainnumsents * len(corpus.sents()))
		trees = list(corpus.parsed_sents().values())[:trainnumsents]
		sents = list(corpus.sents().values())[:trainnumsents]
		if transformations:
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(trees, sents)]
		if relationalrealizational:
			trees = [rrtransform(tree, **relationalrealizational)[0]
					for tree in trees]
		train_tagged_sents = [[(word, tag) for word, (_, tag)
				in zip(sent, sorted(tree.pos()))]
				for tree, sent in zip(trees, sents)]
		blocks = list(corpus.blocks().values())[:trainnumsents]
		assert trees, 'training corpus should be non-empty'
		logging.info('%d training sentences before length restriction',
				len(trees))
		trees, sents, blocks = zip(*[sent
				for sent in zip(trees, sents, blocks)
				if len(sent[1]) <= trainmaxwords])
		logging.info('%d training sentences after length restriction <= %d',
				len(trees), trainmaxwords)

	testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
			punct=punct, morphology=morphology, functions=functions)
	gold_sents = testset.tagged_sents()
	test_parsed_sents = testset.parsed_sents()
	if skiptrain:
		skip += trainnumsents
	logging.info('%d sentences in test corpus %s/%s',
			len(testset.parsed_sents()), corpusdir, testcorpus)
	logging.info('%d test sentences before length restriction',
			len(list(gold_sents)[skip:skip + testnumsents]))
	lexmodel = None
	simplelexsmooth = False
	test_tagged_sents = gold_sents
	if postagging and postagging['method'] in ('treetagger', 'stanford'):
		if postagging['method'] == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overridden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging['method'] == 'stanford':
			overridetags = ('PTKANT', )
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
				{word for word, tags in taglex.items() if tags == {tag}}
				for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		sents_to_tag = OrderedDict((a, b) for a, b
				in islice(gold_sents.items(), skip, skip + testnumsents)
				if len(b) <= testmaxwords)
		test_tagged_sents = externaltagging(postagging['method'],
				postagging['model'], sents_to_tag, overridetagdict, tagmap)
		# give these tags to parser
		usetags = True
	elif postagging and postagging['method'] == 'unknownword' and not rerun:
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		# get smoothed probabilities for lexical productions
		lexresults, msg = getunknownwordmodel(
				train_tagged_sents, postagging['unknownwordfun'],
				postagging['unknownthreshold'],
				postagging['openclassthreshold'])
		logging.info(msg)
		simplelexsmooth = postagging['simplelexsmooth']
		if simplelexsmooth:
			lexmodel = lexresults[2:8]
		else:
			lexmodel, msg = getlexmodel(*lexresults)
			logging.info(msg)
		# NB: knownwords are all words in training set, lexicon is the subset
		# of words that are above the frequency threshold.
		# for training purposes we work with the subset, at test time we
		# exploit the full set of known words from the training set.
		sigs, knownwords, lexicon = lexresults[:3]
		postagging['sigs'], postagging['lexicon'] = sigs, knownwords
		# replace rare train words with signatures
		sents = replaceraretrainwords(train_tagged_sents,
				postagging['unknownwordfun'], lexicon)
		# make sure gold POS tags are not given to parser
		usetags = False
	elif postagging and postagging['method'] == 'unknownword' and rerun:
		usetags = False
	else:
		# give gold POS tags to parser
		usetags = True

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sentence because test sentences may be mangled by unknown word
	#    model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#    original treebank verbatim.
	testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
			gold_sents[a], block))
			for a, block in islice(testset.blocks().items(),
				skip, skip + testnumsents)
			if len(test_tagged_sents[a]) <= testmaxwords)
	assert test_tagged_sents, 'test corpus should be non-empty'
	logging.info('%d test sentences after length restriction <= %d',
			len(testset), testmaxwords)

	if rerun:
		trees = []
		sents = []
	toplabels = {tree.label for tree in trees} | {
			test_parsed_sents[n].label for n in testset}
	assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
	top = toplabels.pop()
	if rerun:
		readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
				revmarkov, leftmostunary, rightmostunary, pospa, markhead,
				fanout_marks_before_bin, testmaxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top, relationalrealizational)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	begin = time.clock()
	parser = Parser(stages, transformations=transformations,
			tailmarker=tailmarker,
			postagging=postagging if postagging
				and postagging['method'] == 'unknownword' else None,
			relationalrealizational=relationalrealizational)
	results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt,
			morphology=morphology)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs',
				time.clock() - begin)
	for result in results[0]:
		nsent = len(result.parsetrees)
		header = (' ' + result.name.upper() + ' ').center(35, '=')
		evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
				for a, b in test_parsed_sents.items()), gold_sents,
				result.parsetrees, test_tagged_sents if usetags
				else gold_sents, evalparam)
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
					25 if any(len(a) > evalparam['CUTOFF_LEN']
						for a in gold_sents.values()) else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
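# Hedged usage sketch: with its defaults, startexp() runs a miniature
# experiment on the bundled alpinosample.export (2 training sentences, 1 test
# sentence taken right after the training set) and writes grammars, logs, and
# evaluation output to `resultdir`. Only the result directory is changed here
# (a made-up name) to avoid tripping the existing-directory assertion:
def _startexp_demo():
	top = startexp(resultdir='results-demo')
	print('experiment done; ROOT label was %r' % top)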