Example #1
from nltk import CFG
from nltk.parse import ShiftReduceParser

def check(G, tokens, nltk=False):
    """Return True iff `tokens` can be parsed by the grammar.

    G is either a grammar tuple whose third element is its production set
    (converted via the external helper convert2_nltk_CFG), or an nltk CFG
    when nltk=True.
    """
    assert isinstance(tokens, list)
    # convert a list of (token, tag) tuples to a list of plain strings
    if len(tokens) > 0:
        if isinstance(tokens[0], tuple):
            tokens = [token[0] for token in tokens]
        assert isinstance(tokens[0], str)
    if not nltk:
        _, _, P, _ = G
        if len(P) == 0:
            return False
        grammar = convert2_nltk_CFG(G)
    else:
        grammar = G
    sr = ShiftReduceParser(grammar)
    #print(grammar.productions())

    # every token must be a terminal the grammar covers;
    # check_coverage raises an exception for any uncovered token
    try:
        grammar.check_coverage(tokens)
    except Exception:
        return False

    # the token sequence is grammatical iff it has at least one parse tree
    return len(list(sr.parse(tokens))) > 0
Example #2
import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser, ShiftReduceParser

def parse_word_tag(word, tag, sentence):
    """Parse `sentence` with an X-bar-style CFG; the parallel lists `word`
    and `tag` (POS tags) supply the grammar's lexical rules."""
    rule_perphrase_c = """S ->  DP | PP | AP | VP | CP | ADVP
            DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
            Dprime -> D | NP | D NP  | D CP 
            NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP | Nprime ADVP 
            Nprime -> N | N PP | PP N | N QP
            PP -> Pprime | Pprime ADVP | Pprime VP
            Pprime -> P | P DP
            AP -> Aprime | Aprime ADVP | Aprime AP | Aprime CP
            Aprime -> A | A DP 
            VP -> Vprime | Vprime ADVP | Vprime DP | Vprime CP 
            Vprime -> V | V DP | V PRN 
            CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime AP | Cprime QP | Cprime ADVP
            Cprime -> C | C Cprime
            QP ->  Qprime | Qprime CP
            Qprime -> Q | Q NP
            ADVP -> ADVprime | ADVprime QP | ADVprime DP  | ADVprime AP | ADVprime CP | ADVprime VP
            ADVprime -> ADV | ADV ADVP""" + '\n'

    rule_perphrase_b = """S ->  DP | PP | AP | VP | CP | ADV
            DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
            Dprime -> D | D NP | NP | D CP
            NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP 
            Nprime -> N | N PP | PP N 
            PP -> Pprime | Pprime ADV | Pprime VP
            Pprime -> P | P DP
            AP -> Aprime | Aprime ADV | Aprime AP | Aprime CP
            Aprime -> A | A DP 
            VP -> Vprime | Vprime ADV| Vprime DP | Vprime CP 
            Vprime -> V | V DP | V PRN 
            CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP | Cprime ADV
            Cprime -> C 
            QP ->  Qprime | Qprime CP
            Qprime -> Q""" + '\n'

    rule_perphrase_a = """S ->  DP | PP | AP | VP | CP | ADV
        DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
        Dprime -> D NP | NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP 
        Nprime -> N | N PP | PP N 
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP 
        Vprime -> V | V DP | V PRN | Vprime CP 
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP
        Cprime -> C """ + '\n'

    rule_test_c = """S ->  DP Period | VP Period
    DP -> Dprime | Dprime QP | Dprime AP  | Dprime CP 
    Dprime -> D NP | NP | D CP
    NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
    Nprime -> N | N PP | PP N 
    PP -> Pprime | Pprime ADV | Pprime VP
    Pprime -> P | P DP
    AP -> Aprime | Aprime ADV
    Aprime -> A | A DP
    VP -> Vprime | Vprime ADV | Vprime DP
    Vprime -> V | V DP | V PRN | Vprime CP 
    CP -> Cprime | Cprime VP | Cprime DP | Cprime NP
    Cprime -> C """ + '\n'


    rule_test = """S ->  DP Period | VP Period
    DP -> Dprime | Dprime QP | Dprime AP 
    Dprime -> D NP | NP
    NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N | N PP | PP N | N CP PP | PP CP N 
    PP -> Pprime | Pprime ADV | Pprime VP
    Pprime -> P | P DP
    AP -> Aprime | Aprime ADV
    Aprime -> A | A DP
    VP -> Vprime | Vprime ADV | Vprime DP
    Vprime -> V | V DP | V PRN | Vprime CP 
    CP -> Cprime | Cprime VP
    Cprime -> C | C VP | C NP """ + '\n'

    rule_test_b = """S -> DP VP 
    DP ->  Dprime QP | Dprime AP   
    Dprime -> D NP
    PP ->   Pprime ADV | Pprime VP 
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP ->  Vprime ADV | Vprime DP 
    Vprime -> V DP | V PRN | V CP 
    NP ->  Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N 
    CP -> Cprime VP 
    Cprime -> C VP | C NP """ + '\n'

    rule_abc = """S ->  DP Period 
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N CP PP | PP CP N 
    PP -> Pprime ADV | Pprime VP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP 
    CP -> Cprime VP
    Cprime -> C VP | C NP """ + '\n'

    rule = """S ->  NP VP Sym | VP NP Sym |  VP Comma NP | NP Comma VP
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    PP -> Pprime ADV | Pprime TP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP | V Comma DP | V Comma PRN | Comma Vprime CP
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N Comma PP | PP Comma N | N CP PP | PP CP N 
    TP -> Tprime DP | Tprime Q
    Tprime -> Tum VP | Tin VP
    Tprime -> Tma AP
    Tprime -> Tna- PP
    Tprime -> Tmay VP
    Tprime -> Ttaga VP
    CP -> Cprime TP
    Cprime -> C TP | C NP | Comma C TP | Comma C NP""" + '\n'

    rule_backup = """S ->  NP VP | VP NP
    DP -> Dprime QP | Dprime AP 
    Dprime -> D NP
    PP -> Pprime ADV | Pprime TP
    Pprime -> P DP
    AP -> Aprime ADV
    Aprime -> A DP
    VP -> Vprime ADV | Vprime DP
    Vprime -> V DP | V PRN | Vprime CP 
    NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP 
    Nprime -> N PP | PP N | N CP PP | PP CP N 
    TP -> Tprime DP | Tprime Q
    Tprime -> Tum VP | Tin VP
    Tprime -> Tma AP
    Tprime -> Tna- PP
    Tprime -> Tmay VP
    Tprime -> Ttaga VP
    CP -> Cprime TP
    Cprime -> C TP | C NP """ + '\n'

    i_tag = 0
    tag_rule = []
    sentence_word_tag = ''
    #print('tag length: ', len(tag))
    while i_tag < len(tag):
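        # map each POS tag to a grammar category; the order of these tests
        # matters: "CCU" must be checked before "CC", and "PMP"/"PMC"
        # before "PM"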
        if "NN" in tag[i_tag]:
            tag_rule.append('N')
        elif "PR" in tag[i_tag]:
            tag_rule.append('N')
        elif "DT" in tag[i_tag]:
            tag_rule.append('D')
        elif "LM" in tag[i_tag]:
            tag_rule.append('C')
        elif "CCU" in tag[i_tag]:
            tag_rule.append('P')
        elif "CC" in tag[i_tag]:
            tag_rule.append('C')
        elif "VB" in tag[i_tag]:
            tag_rule.append('V')
        elif "JJ" in tag[i_tag]:
            tag_rule.append('A')
        elif "RB" in tag[i_tag]:
            tag_rule.append('ADV')
        elif "CD" in tag[i_tag]:
            tag_rule.append('Q')
        elif "TS" in tag[i_tag]:
            tag_rule.append('D')
        elif "FW" in tag[i_tag]:
            tag_rule.append('N')
        elif "PMP" in tag[i_tag]:
            tag_rule.append('Period')
        elif "PMC" in tag[i_tag]:
            tag_rule.append('C')
        elif "PM" in tag[i_tag]:
            tag_rule.append('Sym')

        # emit a lexical rule only the first time a word is seen
        word_repeated = word[i_tag] in word[:i_tag]
        #print('i_tag: ', i_tag)
        if not word_repeated:
            sentence_word_tag += tag_rule[i_tag] + " -> " + "'" + word[i_tag] + "'" + '\n'
        i_tag += 1

    # DP = D' + QP | D' + AP
    # D' = D + NP
    #
    # PP = P' + ADV | P' + TP
    # P' = P + DP
    #
    # AP = A' + ADV
    # A' = A + DP
    #
    # VP = V' + ADV | V' + DP
    # V' = V + DP | V + PRN | V' + CP
    #
    # NP = N' + attribute phrase
    # N' = N + PP

    sentence_split = sentence.split()
    grammar = CFG.fromstring(rule_perphrase_c + sentence_word_tag)

    # #test uncomment to test english structure
    # grammar = CFG.fromstring("""
    # S -> NP VP
    # PP -> P NP
    # NP -> 'the' N | N PP | 'the' N PP
    # VP -> V NP | V PP | V NP PP
    # N -> 'cat'
    # N -> 'dog'
    # N -> 'rug'
    # V -> 'chased'
    # V -> 'sat'
    # P -> 'in'
    # P -> 'on'""")
    # sentence_split = 'the cat chased the dog on the rug'.split()

    rd = RecursiveDescentParser(grammar)
    sr = ShiftReduceParser(grammar)

    # the Earley chart parser handles the recursive X-bar rules; the other
    # parsers above are kept for experimentation
    chart_parser = nltk.EarleyChartParser(grammar)
    print(tag_rule)
    parse_tree = []
    print('Parse')
    for tree in chart_parser.parse(sentence_split):
        parse_tree.append(tree)

    if len(parse_tree) > 0:
        print(parse_tree[0])
    else:
        print('NO TREE')
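
# Hypothetical call (not in the original): the word and tag lists would
# normally come from a Tagalog POS tagger; these values are made up.
parse_word_tag(['kumain', 'ang', 'bata'],
               ['VB', 'DT', 'NN'],
               'kumain ang bata')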
Example #3
# NOTE: this snippet begins mid-grammar; the imports and the S/NP/VP rules
# below are a plausible reconstruction, not the original's exact rules.
import nltk
from nltk import CFG
from nltk.parse import ShiftReduceParser

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det Adj N | Det N PP | N PP | N | PropN
VP -> V NP | V PP | V NP PP | V
PP -> P NP

PropN -> 'Bill' | 'Bob' | 'He'
Det -> 'the' | 'a' | 'an' | 'An' | 'The' | 'A' | 'on'| 'some' 
N -> 'bear' | 'squirrel' | 'park' | 'block' | 'table' | 'river' | 'dog' | 'dogs'| 'pasta' | 'anchovies' | 'restaurant' | 'fork' 
Adj -> 'angry' | 'frightened' | 'furry' 
V -> 'chased' | 'saw' | 'eats' | 'eat' | 'chase' | 'Put' | 'have' 
P -> 'on' | 'in' | 'along' | 'with' 

""")

##sentence1 = "He eats pasta with a fork in the restaurant".split()
##parser1 = nltk.ChartParser(grammar)
##for tree1 in parser1.parse(sentence1):
##    # print(tree1)
##     print (tree1.draw())

sr = ShiftReduceParser(grammar)
sentence1 = "He eats pasta with some anchovies in the restaurant"
tokens = nltk.word_tokenize(sentence1)
for x in sr.parse(tokens):
    x.draw()  # draw() opens a Tk window and returns None, so don't print it

print("-------------------------------------------------------------------")

sentence1 = "He eats pasta with some anchovies in the restaurant".split()
parser1 = nltk.EarleyChartParser(grammar, trace=2)

for tree1 in parser1.parse(sentence1):
    print(tree1)
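
# Note that NLTK's ShiftReduceParser performs no backtracking, so it is not
# guaranteed to find a parse even when one exists; the Earley chart parser
# is complete and will find one if it exists. trace=2 makes the Earley
# parser print its chart operations as it works.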
Example #4
# NOTE: this snippet begins mid-grammar: the opening of the CFG string and
# the rules above P are elided, and `grammar_reduced` (used below) is
# defined earlier in the original source.
P -> 'في'|'الى'|'من'|'عن'|'على'
V0 -> 'تفتح'|'فاض'|'ثار'|'هبت'|'جلس'|'ضاع'|'خرج'|'نام'|'وقعد'|'سافر'|'صدق'
V1 -> 'طوى'|'أكل'|'بلل'|'زرع'|'أطفأ'|'يركب'|'يستجيب'|'حفظ'|'كتب'|'شاهد'|'قال'
V2 -> 'يسقي'|'كسا'|'أعطى'|'ظن'|'حسب'|'جعل'|'خال'|'منح'|'منع'|'ألبس'
""")

##### RecursiveDescentParser #####

from nltk.parse import (RecursiveDescentParser, ShiftReduceParser,
                        LeftCornerChartParser, EarleyChartParser)

tdParser = RecursiveDescentParser(grammar)
def rdp(s):
    for w in tdParser.parse(s.split()):
        print(w)

##### ShiftReduceParser #####

srParser = ShiftReduceParser(grammar_reduced, trace=2)
def srp(s):
    for w in srParser.parse(s.split()):
        print(w)

##### LeftCornerParser #####

lcParser = LeftCornerChartParser(grammar)
def lcp(s):
    for w in lcParser.parse(s.split()):
        print(w)

##### EarleyParser #####

eParser = EarleyChartParser(grammar)
def ep(s):
    for w in eParser.parse(s.split()):
        print(w)
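
# Hypothetical usage (not in the original); whether a parse is found
# depends on the grammar rules elided above this snippet:
rdp('نام في البيت')   # recursive descent
ep('نام في البيت')    # Earley chart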
Example #5
import os
import pickle
import time

import chainer
import nltk
from nltk import FreqDist, Nonterminal, induce_pcfg
from nltk.corpus import ptb
from nltk.parse import ShiftReduceParser, ViterbiParser

# preprocess_nt, is_number, and is_key are helper functions defined
# elsewhere in the original source.

def main(test=False):
    """
    Makes a big dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0 ## ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):

        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:

                    # remove all tags from the Nonterminals on the RHS
                    if isinstance(item, nltk.grammar.Nonterminal):
                        rhs.append(preprocess_nt(item))

                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')

                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')

                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')

                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())

                production._rhs = tuple(rhs)

                # deduplicate productions by their string form
                # (unicode_repr was removed from newer NLTK releases)
                if not is_key(add_dict, str(production)):
                    add_dict[str(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time

        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time

        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        # reload the serialized parsers: if the pickles already existed,
        # the build branches above were skipped and the names are unbound
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)

        for sample in [1, 23, 20330, 20332, 443]:

            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi      = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
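
# Example invocation (not in the original): the first run builds and
# pickles the grammar, which is slow; later runs reuse the pickles.
if __name__ == '__main__':
    main(test=True)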
# NOTE: the original resumes here mid-snippet; the imports and the opening
# grammar lines below are reconstructed from the identical grammar that the
# ShiftReduceParser version gives in full further down.
import nltk
from nltk import CFG

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP | V 
Det -> 'The' | 'a' |'an'
N -> 'bear'|'squirrel'|'dog'
NP -> N
V -> 'eat' | 'eats' """)
cp = nltk.ChartParser(grammar)
sentences = [s.split() for s in ['The bear eat an squirrel', 'The dog eats']]
for s in sentences:
    for node in cp.parse(s):
        print(' '.join(s))
        print(node)
        node.draw()  # opens a Tk window; draw() returns None
        print('\n------------\n')
from nltk.parse import ShiftReduceParser
grammar = CFG.fromstring("""
S -> NP VP 
NP -> Det N 
VP -> V NP | V 
Det -> 'The' | 'a' |'an'
N -> 'bear'|'squirrel'|'dog'
NP -> N
V -> 'eat' | 'eats' """)
# using the Shift Reduce Parser
sr = ShiftReduceParser(grammar)
sentences = [s.split() for s in ['The bear eat an squirrel', 'The dog eats']]
for s in sentences:
    for node in sr.parse(s):
        print(' '.join(s))
        print(node)
        node.draw()  # opens a Tk window; draw() returns None
        print('\n------------\n')