Пример #1
0
def main(sentences, grammarfile, pcfg_grammar, algo, output,
         to_keeps, percent_discard, beam=0):
    """Parse every sentence with the selected parser, then emit pruned grammars.

    Parameters:
        sentences: iterable of sentence strings; each is tokenized by
            whitespace with ``str.split``.
        grammarfile: path to a CFG grammar loadable via ``nltk.data.load``.
        pcfg_grammar: path to a plain-text PCFG grammar file.
        algo: parsing algorithm name -- one of "viterbi", "inside",
            "random", "longest", "unsorted", or "chart".
        output: output destination, forwarded to ``create_pruned_grammar``.
        to_keeps: iterable of keep thresholds to try.
        percent_discard: iterable of discard percentages to try.
        beam: beam size for the beam-capable pchart parsers (0 = unbounded).

    Returns:
        0 on success; non-zero when the algorithm is unknown or a
        sentence fails to parse (summed statuses from
        ``create_pruned_grammar`` otherwise).
    """
    grammar = nltk.data.load("file:%s" % (grammarfile))
    chart_parser = ChartParser(grammar, strategy=EARLEY_STRATEGY, trace=0)

    # Read the PCFG source once; the context manager closes the file
    # even if read() raises.
    with open(pcfg_grammar) as f:
        pcfgrammar = f.read()

    if algo == "chart":
        pcfg_parser = None  # the non-probabilistic chart parser is used instead
    else:
        # Map algorithm names to parser factories; the PCFG text is
        # parsed only once instead of once per branch.
        factories = {
            "viterbi": lambda g: nltk.ViterbiParser(g),
            "inside": lambda g: pchart.InsideChartParser(g, beam_size=beam),
            "random": lambda g: pchart.RandomChartParser(g, beam_size=beam),
            "longest": lambda g: pchart.LongestChartParser(g, beam_size=beam),
            "unsorted": lambda g: pchart.UnsortedChartParser(g, beam_size=beam),
        }
        try:
            factory = factories[algo]
        except KeyError:
            print("unrecognized algorithm: %s" % (algo))
            return 1
        pcfg_parser = factory(nltk.parse_pcfg(pcfgrammar))

    forest = []
    for sentence in sentences:
        parsed_sent = sentence.split()
        print("parsed_sent: %s" % (parsed_sent))
        start = datetime.now()

        if algo == "chart":
            trees = chart_parser.nbest_parse(parsed_sent)
        else:
            trees = pcfg_parser.nbest_parse(parsed_sent)

        end = datetime.now()
        elapsed = end - start
        print("parsing time elapsed: %s" % (elapsed))
        print("parsing time elapsed: %d us" % (elapsed.microseconds))

        if not trees:
            print("failed to parse: %s" % (sentence))
            return 1
        forest.append(trees)

    # Copy so the grammar's own production list is left untouched, then
    # shuffle once -- a second shuffle adds no extra randomness.
    all_productions = list(grammar.productions())
    random.shuffle(all_productions)

    status = 0
    for keep in to_keeps:
        for discard in percent_discard:
            status += create_pruned_grammar(forest, all_productions, keep,
                                            discard, output)
    return status
Пример #2
0
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.

    :param choice: 0-based index of the demo sentence to use; when None,
        the user is prompted interactively.
    :param draw_parses: whether to draw the parse trees; when None, the
        user is prompted.
    :param print_parses: whether to print the parse trees; when None, the
        user is prompted.
    """
    import sys, time
    # reduce is not a builtin in Python 3; it must be imported explicitly.
    from functools import reduce
    from nltk import tokenize
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    if choice is None:
        # Ask the user which demo they want to use.
        print()
        for i in range(len(demos)):
            print('%3s: %s' % (i + 1, demos[i][0]))
            print('     %r' % demos[i][1])
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
        choice = int(sys.stdin.readline().strip()) - 1
    try:
        sent, grammar = demos[choice]
    # Narrow catch: only an out-of-range or non-integer index is "bad";
    # a bare except would also swallow KeyboardInterrupt/SystemExit.
    except (IndexError, TypeError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        # Average probability over all parses (0 when nothing parsed).
        p = (reduce(lambda a, b: a + b.prob(), parses, 0) /
             len(parses) if parses else 0)
        average_p.append(p)
        num_parses.append(len(parses))
        # Deduplicate parses across parsers; freeze() makes trees hashable.
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print(
        '       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print(
        '------------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parsers[i].__class__.__name__, parsers[i].beam_size, times[i],
               num_parses[i], average_p[i]))
    parses = all_parses.keys()
    if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else: p = 0
    print(
        '------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' %
          ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
Пример #3
0
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk import tokenize, toy_pcfg1, toy_pcfg2
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i + 1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.nbest_parse(tokens)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      Beam | Time (secs)   # Parses   Average P(parse)'
    print '------------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                               parsers[i].beam_size, times[i],
                                               num_parses[i], average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else: p = 0
    print '------------------------+------------------------------------------'
    print '%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses),
                                          p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse