Example No. 1
def demo():
    """
    A demonstration showing how PCFG C{Grammar}s can be created and used.
    """

    from nodebox_linguistics_extended.parser.nltk_lite.corpora import treebank, extract
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg, pcfg, pchart, treetransforms
    from itertools import islice

    # Create some probabilistic CFG Productions
    S, A, B, C = cfg.nonterminals('S A B C')
    pcfg_prods = [
        pcfg.Production(A, [B, B], prob=0.3),
        pcfg.Production(A, [C, B, C], prob=0.7),
        pcfg.Production(B, [B, 'b'], prob=0.5),
        pcfg.Production(B, [C], prob=0.5),
        pcfg.Production(C, ['a'], prob=0.1),
        pcfg.Production(C, ['b'], prob=0.9)
    ]
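
    # Note (added for clarity): the probabilities for each left-hand side
    # sum to 1.0 (A: 0.3 + 0.7, B: 0.5 + 0.5, C: 0.1 + 0.9).  pcfg.Grammar
    # checks this, within a small tolerance, when the grammar is built below.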

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    # Create and print a PCFG
    grammar = pcfg.Grammar(S, pcfg_prods)
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # str.replace(...) is used to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for tree in islice(treebank.parsed(), 3):
        # perform optional in-place tree transformations, e.g.:
        # treetransforms.collapseUnary(tree, collapsePOS = False)
        # treetransforms.chomskyNormalForm(tree, horzMarkov = 2)
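        # (collapseUnary merges unary chains into single nodes;
        # chomskyNormalForm binarizes n-ary nodes, with horzMarkov = 2
        # limiting the horizontal context kept in the new node labels.)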

        productions += tree.productions()

    grammar = pcfg.induce(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideParse(grammar)
    parser.trace(3)

    sent = extract(0, treebank.raw())
    print(sent)
    for parse in parser.get_parse_list(sent):
        print(parse)

Example No. 2
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from nodebox_linguistics_extended.parser.nltk_lite import tokenize
    from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg, pcfg, pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope',
              pcfg.toy2)]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i+1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    # Define a list of parsers.  We'll use all parsers.
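    # (Each parser orders the chart parser's edge queue differently:
    # InsideParse by inside probability, RandomParse randomly,
    # UnsortedParse in arrival order, LongestParse by edge length, and
    # BeamParse like InsideParse but pruning the queue to a fixed size.)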
    parsers = [
        pchart.InsideParse(grammar),
        pchart.RandomParse(grammar),
        pchart.UnsortedParse(grammar),
        pchart.LongestParse(grammar),
        pchart.BeamParse(len(tokens)+1, grammar)
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(tokens)
        times.append(time.time()-t)
        if parses:
            p = sum(parse.prob() for parse in parses) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for parse in parses:
            all_parses[parse.freeze()] = 1

    # Print some summary statistics
    print()
    print('       Parser      | Time (secs)   # Parses   Average P(parse)')
    print('-------------------+------------------------------------------')
    for i in range(len(parsers)):
        print('%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                         times[i],num_parses[i],average_p[i]))
    parses = list(all_parses.keys())
    if parses:
        p = sum(parse.prob() for parse in parses) / len(parses)
    else:
        p = 0
    print('-------------------+------------------------------------------')
    print('%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nodebox_linguistics_extended.parser.nltk_lite.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
Example No. 3
from nodebox_linguistics_extended.parser.nltk_lite import tokenize
from nodebox_linguistics_extended.parser.nltk_lite.parse import cfg, pcfg, pchart
from nodebox_linguistics_extended.parser.nltk_lite.parse.tree import Tree  # import path assumed

def grammar(terminals):
    # (Head of this truncated snippet reconstructed; the S -> Star rule
    # carries the remaining 0.2 of probability mass.)  Build a PCFG over
    # regular expressions: the operator rules take 0.9 of S's mass
    # (4 x 0.2 + 0.1) and the terminals share the remaining 0.1.
    S, Star, Plus, Qmk, Paren = cfg.nonterminals('S Star Plus Qmk Paren')
    rules = [pcfg.Production(S, [Star], prob=0.2),
             pcfg.Production(S, [Plus], prob=0.2),
             pcfg.Production(S, [Qmk], prob=0.2),
             pcfg.Production(S, [Paren], prob=0.2),
             pcfg.Production(S, [S, S], prob=0.1),
             pcfg.Production(Star, [S, '*'], prob=1),
             pcfg.Production(Plus, [S, '+'], prob=1),
             pcfg.Production(Qmk, [S, '?'], prob=1),
             pcfg.Production(Paren, ['(', S, ')'], prob=1)]

    prob_term = 0.1/len(terminals) # divide remaining pr. mass
    for terminal in terminals:
        rules.append(pcfg.Production(S, [terminal], prob=prob_term))

    return pcfg.Grammar(S, rules)
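
# Illustrative sanity check (not in the original snippet): pcfg.Grammar
# verifies, within a small tolerance, that each nonterminal's production
# probabilities sum to 1.0, so constructing a grammar doubles as a check.
_check = grammar('ab')
assert abs(sum(p.prob() for p in _check.productions()
               if p.lhs() == _check.start()) - 1.0) < 1e-6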

_parser = pchart.InsideParse(grammar('abcde'))

# create NFA from regexp (Thompson's construction)
# assumes unique start and final states

def re2nfa(fsa, re):
    tokens = tokenize.regexp(re, pattern=r'.')
    tree = _parser.parse(tokens)
    if tree is None: raise ValueError('Bad Regexp')
    state = re2nfa_build(fsa, fsa.start(), tree)
    fsa.set_final([state])
#        fsa.minimize()

def re2nfa_build(fsa, node, tree):
    # Terminals.
    if not isinstance(tree, Tree):
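
# (The fragment is truncated above.  As a rough sketch -- assuming the fsa
# object offers new_state() and insert(src, label, dst), names this
# fragment does not confirm -- the terminal case might continue:
#
#         new = fsa.new_state()        # fresh state for this character
#         fsa.insert(node, tree, new)  # edge labelled with the terminal
#         return new
#
# with the remaining branches dispatching on the subtree's node label
# ('Star', 'Plus', 'Qmk', 'Paren', or concatenation) and wiring the
# recursively built sub-automata together with epsilon moves, as in
# Thompson's construction.)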