def main(sentences, grammarfile, pcfg_grammar, algo, output,
         to_keeps, percent_discard, beam=0):
    """Parse every sentence with the selected algorithm, then emit pruned
    grammars for each (keep, discard) combination.

    Args:
        sentences: iterable of sentence strings (whitespace-tokenized here).
        grammarfile: path to a CFG file loadable via nltk.data.load.
        pcfg_grammar: path to a PCFG grammar file, read as plain text.
        algo: one of "viterbi", "inside", "random", "longest", "unsorted",
            or "chart" (the non-probabilistic Earley chart parser).
        output: output destination, forwarded to create_pruned_grammar.
        to_keeps: iterable of "keep" settings to sweep over.
        percent_discard: iterable of discard percentages to sweep over.
        beam: beam size for the beam-capable pchart parsers (0 = no limit).

    Returns:
        1 if the algorithm is unrecognized or any sentence fails to parse;
        otherwise the accumulated status of the create_pruned_grammar calls
        (0 on full success).
    """
    grammar = nltk.data.load("file:%s" % (grammarfile,))
    chart_parser = ChartParser(grammar, strategy=EARLEY_STRATEGY, trace=0)

    # Read the PCFG source text; 'with' guarantees the handle is closed
    # even if read() raises.
    with open(pcfg_grammar) as f:
        pcfgrammar = f.read()

    # NOTE(review): nltk.parse_pcfg / nbest_parse are the NLTK 2 API; newer
    # NLTK uses nltk.PCFG.fromstring and parse() -- confirm against the
    # installed version.
    if algo == "viterbi":
        pcfg_parser = nltk.ViterbiParser(nltk.parse_pcfg(pcfgrammar))
    elif algo == "inside":
        pcfg_parser = pchart.InsideChartParser(nltk.parse_pcfg(pcfgrammar),
                                               beam_size=beam)
    elif algo == "random":
        pcfg_parser = pchart.RandomChartParser(nltk.parse_pcfg(pcfgrammar),
                                               beam_size=beam)
    elif algo == "longest":
        pcfg_parser = pchart.LongestChartParser(nltk.parse_pcfg(pcfgrammar),
                                                beam_size=beam)
    elif algo == "unsorted":
        pcfg_parser = pchart.UnsortedChartParser(nltk.parse_pcfg(pcfgrammar),
                                                 beam_size=beam)
    elif algo == "chart":
        pass  # uses the non-probabilistic Earley chart_parser below
    else:
        print("unrecognized algorithm: %s" % (algo,))
        return 1

    forest = []
    for sentence in sentences:
        parsed_sent = sentence.split()
        print("parsed_sent: %s" % (parsed_sent,))
        start = datetime.now()
        if algo == "chart":
            trees = chart_parser.nbest_parse(parsed_sent)
        else:
            trees = pcfg_parser.nbest_parse(parsed_sent)
        end = datetime.now()
        elapsed = end - start
        print("parsing time elapsed: %s" % (elapsed,))
        print("parsing time elapsed: %d us" % (elapsed.microseconds,))
        if not trees:
            print("failed to parse: %s" % (sentence,))
            return 1
        forest.append(trees)

    # Copy before shuffling so the grammar's own production list is not
    # mutated; a single shuffle already yields a uniform permutation.
    all_productions = list(grammar.productions())
    random.shuffle(all_productions)

    status = 0
    for keep in to_keeps:
        for discard in percent_discard:
            status += create_pruned_grammar(forest, all_productions, keep,
                                            discard, output)
    return status
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.

    Runs every pchart parser on the chosen demo sentence/grammar pair and
    prints a timing / parse-count / average-probability summary.

    Args:
        choice: 0-based index into the demo list; prompts on stdin when None.
        draw_parses: whether to draw the parse trees; prompts when None.
        print_parses: whether to print the parse trees; prompts when None.
    """
    import sys, time
    # 'reduce' is a builtin on Python 2 but lives in functools on Python 3;
    # import it explicitly so the probability averaging below cannot raise
    # NameError.
    from functools import reduce
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S -> NP VP [1.0]
    VP -> V NP [.59]
    VP -> V [.40]
    VP -> VP PP [.01]
    NP -> Det N [.41]
    NP -> Name [.28]
    NP -> NP PP [.31]
    PP -> P NP [1.0]
    V -> 'saw' [.21]
    V -> 'ate' [.51]
    V -> 'ran' [.28]
    N -> 'boy' [.11]
    N -> 'cookie' [.12]
    N -> 'table' [.13]
    N -> 'telescope' [.14]
    N -> 'hill' [.5]
    Name -> 'Jack' [.52]
    Name -> 'Bob' [.48]
    P -> 'with' [.61]
    P -> 'under' [.39]
    Det -> 'the' [.41]
    Det -> 'a' [.31]
    Det -> 'my' [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i + 1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip()) - 1
    try:
        sent, grammar = demos[choice]
    # Narrowed from a bare 'except:': only an out-of-range index (or a
    # non-integer 'choice' argument) should be treated as a bad selection.
    except (IndexError, TypeError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1),  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        # Average parse probability (0 when the sentence got no parses).
        p = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
             if parses else 0)
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics.
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parsers[i].__class__.__name__, parsers[i].beam_size,
               times[i], num_parses[i], average_p[i]))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run; then each parser is run on the same demo, and
    a summary of the results is displayed.

    NOTE(review): this module defines demo() twice -- this zero-argument
    definition shadows the three-argument demo() above; confirm which one is
    intended to survive.
    """
    import sys, time
    # 'reduce' moved to functools in Python 3; import it explicitly.
    from functools import reduce
    from nltk import tokenize, toy_pcfg1, toy_pcfg2
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    # Narrowed from a bare 'except:': a non-numeric reply raises ValueError,
    # an out-of-range reply raises IndexError; anything else should surface.
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1),  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        # NOTE(review): nbest_parse is the NLTK 2 API; NLTK 3 renamed it to
        # parse() -- confirm against the installed version.
        parses = parser.nbest_parse(tokens)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics.
    print()
    print('       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print('------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parsers[i].__class__.__name__, parsers[i].beam_size,
               times[i], num_parses[i], average_p[i]))
    parses = list(all_parses.keys())
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)