def demo(): """ A demonstration showing how PCFG C{Grammar}s can be created and used. """ from en.parser.nltk_lite.corpora import treebank, extract from en.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms from itertools import islice # Create some probabilistic CFG Productions S, A, B, C = cfg.nonterminals('S A B C') pcfg_prods = [ pcfg.Production(A, [B, B], prob=0.3), pcfg.Production(A, [C, B, C], prob=0.7), pcfg.Production(B, [B, 'b'], prob=0.5), pcfg.Production(B, [C], prob=0.5), pcfg.Production(C, ['a'], prob=0.1), pcfg.Production(C, ['b'], prob=0.9) ] pcfg_prod = pcfg_prods[2] print 'A PCFG production:', ` pcfg_prod ` print ' pcfg_prod.lhs() =>', ` pcfg_prod.lhs() ` print ' pcfg_prod.rhs() =>', ` pcfg_prod.rhs() ` print ' pcfg_prod.prob() =>', ` pcfg_prod.prob() ` print # Create and print a PCFG grammar = pcfg.Grammar(S, pcfg_prods) print 'A PCFG grammar:', ` grammar ` print ' grammar.start() =>', ` grammar.start() ` print ' grammar.productions() =>', # Use string.replace(...) is to line-wrap the output. print ` grammar.productions() `.replace(',', ',\n' + ' ' * 26) print # extract productions from three trees and induce the PCFG print "Induce PCFG grammar from treebank data:" productions = [] for tree in islice(treebank.parsed(), 3): # perform optional in-place tree transformations, e.g.: # treetransforms.collapseUnary(tree, collapsePOS = False) # treetransforms.chomskyNormalForm(tree, horzMarkov = 2) productions += tree.productions() grammar = pcfg.induce(S, productions) print grammar print print "Parse sentence using induced grammar:" parser = pchart.InsideParse(grammar) parser.trace(3) sent = extract(0, treebank.raw()) print sent for parse in parser.get_parse_list(sent): print parse
from en.parser.nltk_lite import tokenize
from en.parser.nltk_lite.parse import cfg, pcfg, pchart

# PCFG grammar for regular expressions over the given terminals.
# S expands to an operator form with total probability 0.9; the
# remaining 0.1 is divided evenly among the terminals.
def grammar(terminals):
    S, Star, Plus, Qmk, Paren = cfg.nonterminals('S Star Plus Qmk Paren')
    rules = [
        pcfg.Production(S, [Star], prob=0.2),
        pcfg.Production(S, [Plus], prob=0.2),
        pcfg.Production(S, [Qmk], prob=0.2),
        pcfg.Production(S, [Paren], prob=0.2),
        pcfg.Production(S, [S, S], prob=0.1),
        pcfg.Production(Star, [S, '*'], prob=1),
        pcfg.Production(Plus, [S, '+'], prob=1),
        pcfg.Production(Qmk, [S, '?'], prob=1),
        pcfg.Production(Paren, ['(', S, ')'], prob=1)
    ]
    prob_term = 0.1 / len(terminals)  # divide the remaining probability mass
    for terminal in terminals:
        rules.append(pcfg.Production(S, [terminal], prob=prob_term))
    return pcfg.Grammar(S, rules)

_parser = pchart.InsideParse(grammar('abcde'))

# Create an NFA from a regexp (Thompson's construction).
# Assumes unique start and final states.
def re2nfa(fsa, re):
    tokens = tokenize.regexp(re, pattern=r'.')
    tree = _parser.parse(tokens)
    if tree is None:
        raise ValueError('Bad Regexp')
    state = re2nfa_build(fsa, fsa.start(), tree)
    fsa.set_final([state])
    # fsa.minimize()
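# Usage sketch (illustrative): re2nfa only assumes an automaton object that
# exposes the start() and set_final() methods used above, plus whatever
# interface re2nfa_build (defined elsewhere in this module) uses to add
# states and transitions.  The FSA name and constructor below are
# hypothetical stand-ins for this package's automaton class, not part of
# this module:
#
#     fsa = FSA('abcde')        # hypothetical: empty automaton over 'abcde'
#     re2nfa(fsa, '(ab)*c?')    # build the Thompson NFA for the regexp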
def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from en.parser.nltk_lite import tokenize from en.parser.nltk_lite.parse import cfg, pcfg, pchart # Define two demos. Each demo has a sentence and a grammar. demos = [('I saw John with my cookie', pcfg.toy1), ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)] # Ask the user which demo they want to use. print for i in range(len(demos)): print '%3s: %s' % (i + 1, demos[i][0]) print ' %r' % demos[i][1] print print 'Which demo (%d-%d)? ' % (1, len(demos)), try: snum = int(sys.stdin.readline().strip()) - 1 sent, grammar = demos[snum] except: print 'Bad sentence number' return # Tokenize the sentence. tokens = list(tokenize.whitespace(sent)) # Define a list of parsers. We'll use all parsers. parsers = [ pchart.InsideParse(grammar), pchart.RandomParse(grammar), pchart.UnsortedParse(grammar), pchart.LongestParse(grammar), pchart.BeamParse(len(tokens) + 1, grammar) ] # Run the parsers on the tokenized sentence. times = [] average_p = [] num_parses = [] all_parses = {} for parser in parsers: print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, pcfg) parser.trace(3) t = time.time() parses = parser.get_parse_list(tokens) times.append(time.time() - t) if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 average_p.append(p) num_parses.append(len(parses)) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print print ' Parser | Time (secs) # Parses Average P(parse)' print '-------------------+------------------------------------------' for i in range(len(parsers)): print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__, times[i], num_parses[i], average_p[i]) parses = all_parses.keys() if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) else: p = 0 print '-------------------+------------------------------------------' print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p) # Ask the user if we should draw the parses. print print 'Draw parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): from en.parser.nltk_lite.draw.tree import draw_trees print ' please wait...' draw_trees(*parses) # Ask the user if we should print the parses. print print 'Print parses (y/n)? ', if sys.stdin.readline().strip().lower().startswith('y'): for parse in parses: print parse