def loadparsers():
	"""Load grammars if necessary."""
	if not PARSERS:
		for directory in glob.glob('grammars/*/'):
			_, lang = os.path.split(os.path.dirname(directory))
			APP.logger.info('Loading grammar %r', lang)
			params = readparam(os.path.join(directory, 'params.prm'))
			params.resultdir = directory
			readgrammars(directory, params.stages, params.postagging,
					top=getattr(params, 'top', 'ROOT'))
			PARSERS[lang] = Parser(params)
			APP.logger.info('Grammar for %s loaded.', lang)
	assert PARSERS, 'no grammars found!'
def loadparsers():
	"""Load grammars if necessary."""
	if not PARSERS:
		for directory in glob.glob('grammars/*/'):
			_, lang = os.path.split(os.path.dirname(directory))
			APP.logger.info('Loading grammar %r', lang)
			params = readparam(os.path.join(directory, 'params.prm'))
			params.resultdir = directory
			readgrammars(directory, params.stages, params.postagging,
					params.transformations,
					top=getattr(params, 'top', 'ROOT'))
			PARSERS[lang] = Parser(params)
			APP.logger.info('Grammar for %s loaded.', lang)
	assert PARSERS, 'no grammars found!'
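For context, a minimal sketch of the module-level state the two `loadparsers` variants above rely on. The names `APP` and `PARSERS` appear in the source; the Flask setup and the exact import paths are assumptions:

# Hypothetical module-level setup (sketch); import paths are assumptions.
import glob
import os

from flask import Flask
from discodop.parser import Parser, readgrammars, readparam

APP = Flask(__name__)  # web application whose logger is used above
PARSERS = {}  # maps a language code to a loaded Parser; filled lazily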
def loadgrammar(directory, limit):
	"""Load grammar."""
	global PARSER, LIMIT
	params = readparam(os.path.join(directory, 'params.prm'))
	params.resultdir = directory
	readgrammars(directory, params.stages, params.postagging,
			params.transformations, top=getattr(params, 'top', 'ROOT'),
			cache=True)
	PARSER = Parser(params, loadtrees=True)
	LIMIT = limit
	print('phrasal labels', PARSER.phrasallabels)
	print('pos tags', PARSER.poslabels)
	print('function tags', PARSER.functiontags)
	print('morph tags', PARSER.morphtags)
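A possible invocation, assuming a grammar directory produced by a previous experiment; the path and limit here are made up for illustration:

# Hypothetical usage (sketch); 'grammars/alpino/' is a made-up path.
loadgrammar('grammars/alpino/', limit=100)
print('sentence length limit:', LIMIT)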
def loadparsers():
	"""Load grammars if necessary."""
	if not PARSERS:
		for directory in glob.glob('grammars/*/'):
			_, lang = os.path.split(os.path.dirname(directory))
			APP.logger.info('Loading grammar %r', lang)
			params = readparam(os.path.join(directory, 'params.prm'))
			params['resultdir'] = directory
			stages = params['stages']
			postagging = params['postagging']
			readgrammars(directory, stages, postagging,
					top=params.get('top', 'ROOT'))
			PARSERS[lang] = Parser(stages,
					transformations=params.get('transformations'),
					tailmarker=params.get('tailmarker'),
					postagging=postagging if postagging
						and postagging['method'] == 'unknownword' else None,
					relationalrealizational=params.get(
						'relationalrealizational'))
			APP.logger.info('Grammar for %s loaded.', lang)
	assert PARSERS, 'no grammars found!'
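Once loaded, a parser can be fetched from the `PARSERS` cache and queried. A sketch under the assumption that `Parser.parse` accepts a tokenized sentence and yields one result object per stage; the 'en' grammar directory name is made up:

# Hypothetical query (sketch); 'en' is a made-up grammar directory name.
loadparsers()
results = list(PARSERS['en'].parse('The cat sat on the mat .'.split()))
print(results[-1].parsetree)  # parse tree from the final stage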
def startexp(
		stages=(DEFAULTSTAGE, ),  # see above
		corpusfmt='export',  # choices: export, discbracket, bracket
		corpusdir='.',
		# filenames may include globbing characters '*' and '?'.
		traincorpus='alpinosample.export', trainencoding='utf-8',
		testcorpus='alpinosample.export', testencoding='utf-8',
		testmaxwords=40,
		trainmaxwords=40,
		trainnumsents=2,
		testnumsents=1,  # number of sentences to parse
		skiptrain=True,  # test set starts after training set
		# (useful when they are in the same file)
		skip=0,  # number of sentences to skip from test corpus
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices: None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		# postagging: pass None to use tags from treebank.
		postagging=None,
		relationalrealizational=None,  # do not apply RR-transform
		headrules=None,  # rules for finding heads of constituents
		bintype='binarize',  # choices: binarize, optimal, optimalhead
		factor='right',
		revmarkov=True,
		v=1,
		h=2,
		pospa=False,  # when v > 1, add parent annotation to POS tags?
		markhead=False,  # prepend head to siblings
		leftmostunary=True,  # start binarization with unary node
		rightmostunary=True,  # end binarization with unary node
		tailmarker='',  # with headrules, head is last node and can be marked
		fanout_marks_before_bin=False,
		evalparam='proper.prm',  # EVALB-style parameter file
		quiet=False, reallyquiet=False,  # quiet=no per sentence results
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	"""Execute an experiment."""
	assert bintype in ('optimal', 'optimalhead', 'binarize')
	if postagging is not None:
		assert set(postagging).issubset({'method', 'model',
				'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
		if postagging['method'] == 'unknownword':
			assert postagging['model'] in ('4', '6', 'base')
			assert postagging['unknownthreshold'] >= 1
			assert postagging['openclassthreshold'] >= 0
		else:
			assert postagging['method'] in ('treetagger', 'stanford')
	if rerun:
		assert os.path.exists(resultdir), (
				'Directory %r does not exist.\n'
				'--rerun requires a directory '
				'with the grammar(s) of a previous experiment.' % resultdir)
	else:
		assert not os.path.exists(resultdir), (
				'Directory %r exists.\n'
				'Use --rerun to parse with existing grammar '
				'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if reallyquiet:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif quiet:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	else:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)
	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	corpusreader = getreader(corpusfmt)
	if not rerun:
		corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
				headrules=headrules, headfinal=True, headreverse=False,
				punct=punct, functions=functions, morphology=morphology)
		logging.info('%d sentences in training corpus %s/%s',
				len(corpus.parsed_sents()), corpusdir, traincorpus)
		if isinstance(trainnumsents, float):
			trainnumsents = int(trainnumsents * len(corpus.sents()))
		trees = list(corpus.parsed_sents().values())[:trainnumsents]
		sents = list(corpus.sents().values())[:trainnumsents]
		if transformations:
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(trees, sents)]
		if relationalrealizational:
			trees = [rrtransform(tree, **relationalrealizational)[0]
					for tree in trees]
		train_tagged_sents = [[(word, tag) for word, (_, tag)
				in zip(sent, sorted(tree.pos()))]
				for tree, sent in zip(trees, sents)]
		blocks = list(corpus.blocks().values())[:trainnumsents]
		assert trees, 'training corpus should be non-empty'
		logging.info('%d training sentences before length restriction',
				len(trees))
		trees, sents, blocks = zip(*[sent for sent
				in zip(trees, sents, blocks)
				if len(sent[1]) <= trainmaxwords])
		logging.info('%d training sentences after length restriction <= %d',
				len(trees), trainmaxwords)

	testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
			punct=punct, morphology=morphology, functions=functions)
	gold_sents = testset.tagged_sents()
	test_parsed_sents = testset.parsed_sents()
	if skiptrain:
		skip += trainnumsents
	logging.info('%d sentences in test corpus %s/%s',
			len(testset.parsed_sents()), corpusdir, testcorpus)
	logging.info('%d test sentences before length restriction',
			len(list(gold_sents)[skip:skip + testnumsents]))
	lexmodel = None
	simplelexsmooth = False
	test_tagged_sents = gold_sents
	if postagging and postagging['method'] in ('treetagger', 'stanford'):
		if postagging['method'] == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overridden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging['method'] == 'stanford':
			overridetags = ('PTKANT', )
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
				{word for word, tags in taglex.items() if tags == {tag}}
				for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		sents_to_tag = OrderedDict((a, b) for a, b
				in islice(gold_sents.items(), skip, skip + testnumsents)
				if len(b) <= testmaxwords)
		test_tagged_sents = externaltagging(postagging['method'],
				postagging['model'], sents_to_tag, overridetagdict, tagmap)
		usetags = True  # give these tags to parser
	elif postagging and postagging['method'] == 'unknownword' and not rerun:
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		# get smoothed probabilities for lexical productions
		lexresults, msg = getunknownwordmodel(
				train_tagged_sents, postagging['unknownwordfun'],
				postagging['unknownthreshold'],
				postagging['openclassthreshold'])
		logging.info(msg)
		simplelexsmooth = postagging['simplelexsmooth']
		if simplelexsmooth:
			lexmodel = lexresults[2:8]
		else:
			lexmodel, msg = getlexmodel(*lexresults)
			logging.info(msg)
		# NB: knownwords are all words in training set, lexicon is the subset
		# of words that are above the frequency threshold. for training
		# purposes we work with the subset; at test time we exploit the full
		# set of known words from the training set.
		sigs, knownwords, lexicon = lexresults[:3]
		postagging['sigs'], postagging['lexicon'] = sigs, knownwords
		# replace rare train words with signatures
		sents = replaceraretrainwords(train_tagged_sents,
				postagging['unknownwordfun'], lexicon)
		usetags = False  # make sure gold POS tags are not given to parser
	elif postagging and postagging['method'] == 'unknownword' and rerun:
		usetags = False
	else:
		usetags = True  # give gold POS tags to parser

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sentence because test sentences may be mangled by unknown word
	#    model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#    original treebank verbatim.
	testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
			gold_sents[a], block)) for a, block
			in islice(testset.blocks().items(), skip, skip + testnumsents)
			if len(test_tagged_sents[a]) <= testmaxwords)
	assert test_tagged_sents, 'test corpus should be non-empty'
	logging.info('%d test sentences after length restriction <= %d',
			len(testset), testmaxwords)

	if rerun:
		trees = []
		sents = []
	toplabels = {tree.label for tree in trees} | {
			test_parsed_sents[n].label for n in testset}
	assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
	top = toplabels.pop()
	if rerun:
		readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
				revmarkov, leftmostunary, rightmostunary, pospa, markhead,
				fanout_marks_before_bin, testmaxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top, relationalrealizational)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	begin = time.clock()
	parser = Parser(stages, transformations=transformations,
			tailmarker=tailmarker, postagging=postagging if postagging
			and postagging['method'] == 'unknownword' else None,
			relationalrealizational=relationalrealizational)
	results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt, morphology=morphology)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs',
				time.clock() - begin)
	for result in results[0]:
		nsent = len(result.parsetrees)
		header = (' ' + result.name.upper() + ' ').center(35, '=')
		evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
				for a, b in test_parsed_sents.items()), gold_sents,
				result.parsetrees, test_tagged_sents if usetags
				else gold_sents, evalparam)
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if any(len(a) > evalparam['CUTOFF_LEN']
				for a in gold_sents.values()) else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
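A minimal way to drive this version of `startexp`, using the sample corpus its defaults reference; the result directory name is illustrative:

# Hypothetical invocation (sketch); relies on the defaults above.
if __name__ == '__main__':
	top = startexp(trainnumsents=2, testnumsents=1,
			resultdir='sample-results')
	print('root label:', top)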
def startexp(
		stages=(parser.DictObj(parser.DEFAULTSTAGE), ),  # see parser module
		corpusfmt='export',  # choices: export, (disc)bracket, alpino, tiger
		traincorpus=parser.DictObj(DEFAULTS['traincorpus']),
		testcorpus=parser.DictObj(DEFAULTS['testcorpus']),
		binarization=parser.DictObj(DEFAULTS['binarization']),
		removeempty=False,  # whether to remove empty terminals
		ensureroot=None,  # ensure every tree has a root node with this label
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices: None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		postagging=None,  # postagging: pass None to use tags from treebank.
		relationalrealizational=None,  # do not apply RR-transform
		evalparam='proper.prm',  # EVALB-style parameter file
		verbosity=2,
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	"""Execute an experiment."""
	if rerun:
		if not os.path.exists(resultdir):
			raise ValueError('Directory %r does not exist.\n--rerun requires'
					' a directory with the grammar(s) of a previous'
					' experiment.' % resultdir)
	else:
		if os.path.exists(resultdir):
			raise ValueError('Directory %r exists.\n'
					'Use --rerun to parse with existing grammar '
					'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if verbosity == 0:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif verbosity == 1:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	elif verbosity == 2:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)
	elif 3 <= verbosity <= 4:
		logging.basicConfig(level=5, format=formatstr)
	else:
		raise ValueError('verbosity should be >= 0 and <= 4.')
	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	if not rerun:
		trees, sents, train_tagged_sents = loadtraincorpus(
				corpusfmt, traincorpus, binarization, punct, functions,
				morphology, removeempty, ensureroot, transformations,
				relationalrealizational)
	elif isinstance(traincorpus.numsents, float):
		raise ValueError('need to specify number of training set sentences, '
				'not fraction, in rerun mode.')

	testsettb = treebank.READERS[corpusfmt](
			testcorpus.path, encoding=testcorpus.encoding,
			removeempty=removeempty, morphology=morphology,
			functions=functions, ensureroot=ensureroot)
	if isinstance(testcorpus.numsents, float):
		testcorpus.numsents = int(testcorpus.numsents
				* len(testsettb.blocks()))
	if testcorpus.skiptrain:
		testcorpus.skip += (  # pylint: disable=maybe-no-member
				traincorpus.numsents)  # pylint: disable=maybe-no-member

	test_blocks = OrderedDict()
	test_trees = OrderedDict()
	test_tagged_sents = OrderedDict()
	for n, a in islice(testsettb._read_blocks(), testcorpus.skip,
			testcorpus.skip  # pylint: disable=maybe-no-member
			+ testcorpus.numsents):
		tree, sent = testsettb._parsetree(a)
		if 1 <= len(sent) <= testcorpus.maxwords:
			test_blocks[n] = testsettb._strblock(n, a)
			test_trees[n] = tree
			test_tagged_sents[n] = [(word, tag) for word, (_, tag)
					in zip(sent, sorted(tree.pos()))]
	logging.info('%d test sentences after length restriction <= %d',
			len(test_trees), testcorpus.maxwords)

	lexmodel = None
	simplelexsmooth = False
	test_tagged_sents_mangled = test_tagged_sents
	if postagging and postagging.method in ('treetagger', 'stanford', 'frog'):
		if postagging.method == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overridden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging.method == 'stanford':
			overridetags = ('PTKANT', )
		elif postagging.method == 'frog':
			overridetags = ()
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		overridetagdict = {tag:
				{word for word, tags in taglex.items() if tags == {tag}}
				for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		test_tagged_sents_mangled = lexicon.externaltagging(
				postagging.method, postagging.model, test_tagged_sents,
				overridetagdict, tagmap)
		if postagging.retag and not rerun:
			logging.info('re-tagging training corpus')
			sents_to_tag = OrderedDict(enumerate(train_tagged_sents))
			train_tagged_sents = lexicon.externaltagging(postagging.method,
					postagging.model, sents_to_tag, overridetagdict,
					tagmap).values()
			for tree, tagged in zip(trees, train_tagged_sents):
				for node in tree.subtrees(
						lambda n: len(n) == 1 and isinstance(n[0], int)):
					node.label = tagged[node[0]][1]
		usetags = True  # give these tags to parser
	elif postagging and postagging.method == 'unknownword':
		if not rerun:
			sents, lexmodel = getposmodel(postagging, train_tagged_sents)
			simplelexsmooth = postagging.simplelexsmooth
		usetags = False  # make sure gold POS tags are not given to parser
	else:
		usetags = True  # give gold POS tags to parser

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sents because test sentences may be mangled by unknown word
	#    model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#    original treebank verbatim.
	testset = OrderedDict((n, (
			test_tagged_sents_mangled[n], test_trees[n],
			test_tagged_sents[n], block))
			for n, block in test_blocks.items())
	if not test_tagged_sents:
		raise ValueError('test corpus (selection) should be non-empty.')

	if rerun:
		trees, sents = [], []
	roots = {t.label for t in trees} | {test_trees[n].label for n in testset}
	if len(roots) != 1:
		raise ValueError('expected unique ROOT label: %r' % roots)
	top = roots.pop()

	if rerun:
		parser.readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(dobinarization(trees, sents, binarization,
				relationalrealizational), sents, stages, testcorpus.maxwords,
				resultdir, numproc, lexmodel, simplelexsmooth, top)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	begin = time.clock()
	theparser = parser.Parser(stages, transformations=transformations,
			binarization=binarization, postagging=postagging if postagging
			and postagging.method == 'unknownword' else None,
			relationalrealizational=relationalrealizational,
			verbosity=verbosity)
	results = doparsing(parser=theparser, testset=testset,
			resultdir=resultdir, usetags=usetags, numproc=numproc,
			deletelabel=deletelabel, deleteword=deleteword,
			corpusfmt=corpusfmt, morphology=morphology, evalparam=evalparam)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs',
				time.clock() - begin)
	for result in results:
		nsent = len(result.parsetrees)
		overcutoff = any(len(a) > evalparam['CUTOFF_LEN']
				for a in test_tagged_sents.values())
		header = (' ' + result.name.upper() + ' ').center(
				44 if overcutoff else 35, '=')
		evalsummary = result.evaluator.summary()
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if overcutoff else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top
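Likewise for this newer signature, which bundles settings in `DictObj` objects instead of flat keyword arguments; a sketch assuming the `DEFAULTS` dict and `parser.DictObj` from the source, with made-up paths and counts:

# Hypothetical invocation (sketch); paths and counts are illustrative.
traincorpus = parser.DictObj(DEFAULTS['traincorpus'])
traincorpus.path, traincorpus.numsents = 'alpinosample.export', 2
testcorpus = parser.DictObj(DEFAULTS['testcorpus'])
testcorpus.path, testcorpus.numsents = 'alpinosample.export', 1
top = startexp(traincorpus=traincorpus, testcorpus=testcorpus,
		resultdir='sample-results', verbosity=1)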