def check(G, tokens, nltk=False):
    assert type(tokens) == list
    # convert a list of (token, tag) tuples to a list of token strings
    if len(tokens) > 0:
        if type(tokens[0]) == tuple:
            tokens = [token[0] for token in tokens]
        assert type(tokens[0]) == str
    if not nltk:
        _, _, P, _ = G
        if len(P) == 0:
            return False
        grammar = convert2_nltk_CFG(G)
    else:
        grammar = G
    sr = ShiftReduceParser(grammar)
    #print(grammar.productions())
    # parse requires a sequence of tokens
    try:
        # check_coverage raises an exception if any token
        # is not a terminal of the grammar
        grammar.check_coverage(tokens)
    except Exception:
        return False
    # check that the token sequence has at least one parse tree
    #print(list(sr.parse(tokens)))
    return len(list(sr.parse(tokens))) > 0
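# A minimal usage sketch for check(), assuming ShiftReduceParser is imported
# as in the snippets below. With nltk=True, G is taken to be an nltk CFG
# directly (the tuple form and convert2_nltk_CFG are defined elsewhere in
# this module). The toy grammar is illustrative only.
from nltk import CFG

_toy = CFG.fromstring("""
S -> N V N
N -> 'he' | 'pasta'
V -> 'eats'
""")
print(check(_toy, ['he', 'eats', 'pasta'], nltk=True))  # True: sequence parses
print(check(_toy, ['eats', 'he'], nltk=True))           # False: no parse tree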
def parse_word_tag(word, tag, sentence):
    # X-bar-style CFG variants, from most permissive to most restrictive
    rule_perphrase_c = """
        S -> DP | PP | AP | VP | CP | ADVP
        DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
        Dprime -> D | NP | D NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP | Nprime ADVP
        Nprime -> N | N PP | PP N | N QP
        PP -> Pprime | Pprime ADVP | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADVP | Aprime AP | Aprime CP
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADVP | Vprime DP | Vprime CP
        Vprime -> V | V DP | V PRN
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime AP | Cprime QP | Cprime ADVP
        Cprime -> C | C Cprime
        QP -> Qprime | Qprime CP
        Qprime -> Q | Q NP
        ADVP -> ADVprime | ADVprime QP | ADVprime DP | ADVprime AP | ADVprime CP | ADVprime VP
        ADVprime -> ADV | ADV ADVP""" + '\n'

    rule_perphrase_b = """
        S -> DP | PP | AP | VP | CP | ADV
        DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
        Dprime -> D | D NP | NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
        Nprime -> N | N PP | PP N
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV | Aprime AP | Aprime CP
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP | Vprime CP
        Vprime -> V | V DP | V PRN
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP | Cprime ADV
        Cprime -> C
        QP -> Qprime | Qprime CP
        Qprime -> Q""" + '\n'

    rule_perphrase_a = """
        S -> DP | PP | AP | VP | CP | ADV
        DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
        Dprime -> D NP | NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
        Nprime -> N | N PP | PP N
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP
        Vprime -> V | V DP | V PRN | Vprime CP
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP | Cprime QP
        Cprime -> C""" + '\n'

    rule_test_c = """
        S -> DP Period | VP Period
        DP -> Dprime | Dprime QP | Dprime AP | Dprime CP
        Dprime -> D NP | NP | D CP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP | Nprime CP
        Nprime -> N | N PP | PP N
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP
        Vprime -> V | V DP | V PRN | Vprime CP
        CP -> Cprime | Cprime VP | Cprime DP | Cprime NP
        Cprime -> C""" + '\n'

    rule_test = """
        S -> DP Period | VP Period
        DP -> Dprime | Dprime QP | Dprime AP
        Dprime -> D NP | NP
        NP -> Nprime | Nprime DP | Nprime PP | Nprime AP | Nprime VP
        Nprime -> N | N PP | PP N | N CP PP | PP CP N
        PP -> Pprime | Pprime ADV | Pprime VP
        Pprime -> P | P DP
        AP -> Aprime | Aprime ADV
        Aprime -> A | A DP
        VP -> Vprime | Vprime ADV | Vprime DP
        Vprime -> V | V DP | V PRN | Vprime CP
        CP -> Cprime | Cprime VP
        Cprime -> C | C VP | C NP""" + '\n'

    rule_test_b = """
        S -> DP VP
        DP -> Dprime QP | Dprime AP
        Dprime -> D NP
        PP -> Pprime ADV | Pprime VP
        Pprime -> P DP
        AP -> Aprime ADV
        Aprime -> A DP
        VP -> Vprime ADV | Vprime DP
        Vprime -> V DP | V PRN | V CP
        NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
        Nprime -> N PP | PP N
        CP -> Cprime VP
        Cprime -> C VP | C NP""" + '\n'

    rule_abc = """
        S -> DP Period
        DP -> Dprime QP | Dprime AP
        Dprime -> D NP
        NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
        Nprime -> N PP | PP N | N CP PP | PP CP N
        PP -> Pprime ADV | Pprime VP
        Pprime -> P DP
        AP -> Aprime ADV
        Aprime -> A DP
        VP -> Vprime ADV | Vprime DP
        Vprime -> V DP | V PRN | Vprime CP
        CP -> Cprime VP
        Cprime -> C VP | C NP""" + '\n'

    rule = """
        S -> NP VP Sym | VP NP Sym | VP Comma NP | NP Comma VP
        DP -> Dprime QP | Dprime AP
        Dprime -> D NP
        PP -> Pprime ADV | Pprime TP
        Pprime -> P DP
        AP -> Aprime ADV
        Aprime -> A DP
        VP -> Vprime ADV | Vprime DP
        Vprime -> V DP | V PRN | Vprime CP | V comma DP | V comma PRN | comma Vprime CP
        NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
        Nprime -> N PP | PP N | N Comma PP | PP Comma N | N CP PP | PP CP N
        TP -> Tprime DP | Tprime Q
        Tprime -> Tum VP | Tin VP
        Tprime -> Tma AP
        Tprime -> Tna- PP
        Tprime -> Tmay VP
        Tprime -> Ttaga VP
        CP -> Cprime TP
        Cprime -> C TP | C NP | comma C TP | comma C NP""" + '\n'

    rule_backup = """
        S -> NP VP | VP NP
        DP -> Dprime QP | Dprime AP
        Dprime -> D NP
        PP -> Pprime ADV | Pprime TP
        Pprime -> P DP
        AP -> Aprime ADV
        Aprime -> A DP
        VP -> Vprime ADV | Vprime DP
        Vprime -> V DP | V PRN | Vprime CP
        NP -> Nprime DP | Nprime PP | Nprime AP | Nprime VP
        Nprime -> N PP | PP N | N CP PP | PP CP N
        TP -> Tprime DP | Tprime Q
        Tprime -> Tum VP | Tin VP
        Tprime -> Tma AP
        Tprime -> Tna- PP
        Tprime -> Tmay VP
        Tprime -> Ttaga VP
        CP -> Cprime TP
        Cprime -> C TP | C NP""" + '\n'

    # map each POS tag to one of the grammar's preterminal symbols and build
    # a lexical rule (e.g. N -> 'bata') for every distinct word
    i_tag = 0
    tag_rule = []
    sentence_word_tag = ''
    #print('tag length: ', len(tag))
    while i_tag < len(tag):
        if "NN" in tag[i_tag]:
            tag_rule.append('N')
        elif "PR" in tag[i_tag]:
            tag_rule.append('N')
        elif "DT" in tag[i_tag]:
            tag_rule.append('D')
        elif "LM" in tag[i_tag]:
            tag_rule.append('C')
        elif "CCU" in tag[i_tag]:
            tag_rule.append('P')
        elif "CC" in tag[i_tag]:
            tag_rule.append('C')
        elif "VB" in tag[i_tag]:
            tag_rule.append('V')
        elif "JJ" in tag[i_tag]:
            tag_rule.append('A')
        elif "RB" in tag[i_tag]:
            tag_rule.append('ADV')
        elif "CD" in tag[i_tag]:
            tag_rule.append('Q')
        elif "TS" in tag[i_tag]:
            tag_rule.append('D')
        elif "FW" in tag[i_tag]:
            tag_rule.append('N')
        elif "PMP" in tag[i_tag]:
            tag_rule.append('Period')
        elif "PMC" in tag[i_tag]:
            tag_rule.append('C')
        elif "PM" in tag[i_tag]:
            tag_rule.append('Sym')

        # emit a lexical rule only the first time a word occurs
        i_word = 0
        word_repeated = False
        while i_word < i_tag:
            if word[i_tag] == word[i_word]:
                word_repeated = True
            i_word += 1
        #print('i_tag: ', i_tag)
        if not word_repeated:
            sentence_word_tag += tag_rule[i_tag] + " -> " + "'" + word[i_tag] + "'" + '\n'
        i_tag += 1

    # DP = D' + QP | D' + AP
    # D' = D + NP
    #
    # PP = P' + ADV | P' + TP
    # P' = P + DP
    #
    # AP = A' + ADV
    # A' = A + DP
    #
    # VP = V' + ADV | V' + DP
    # V' = V + DP | V + PRN | V' + CP
    #
    # NP = N' + attribute phrase
    # N' = N + PP
    sentence_split = sentence.split()
    grammar = CFG.fromstring(rule_perphrase_c + sentence_word_tag)

    # # test: uncomment to try an English structure instead
    # grammar = CFG.fromstring("""
    # S -> NP VP
    # PP -> P NP
    # NP -> 'the' N | N PP | 'the' N PP
    # VP -> V NP | V PP | V NP PP
    # N -> 'cat'
    # N -> 'dog'
    # N -> 'rug'
    # V -> 'chased'
    # V -> 'sat'
    # P -> 'in'
    # P -> 'on'""")
    # sentence_split = 'the cat chased the dog on the rug'.split()

    rd = RecursiveDescentParser(grammar)
    sr = ShiftReduceParser(grammar)
    chart_parser = nltk.ChartParser(grammar)
    earley_chart_parser = nltk.EarleyChartParser(grammar)
    chart_parser = earley_chart_parser
    print(tag_rule)

    parse_tree = []
    print('Parse')
    for tree in chart_parser.parse(sentence_split):
        parse_tree.append(tree)
    if len(parse_tree) > 0:
        print(parse_tree[0])
    else:
        print('NO TREE')
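# A hypothetical call, assuming a POS tagger has already produced parallel
# word/tag lists for a short Tagalog sentence; the tag names here just need
# to hit the substring checks in the mapping above, and the words are
# illustrative only.
parse_word_tag(['kumain', 'ang', 'bata'],
               ['VB', 'DT', 'NN'],
               'kumain ang bata')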
PP -> P NP
PropN -> 'Bill' | 'Bob' | 'He'
Det -> 'the' | 'a' | 'an' | 'An' | 'The' | 'A' | 'on' | 'some'
N -> 'bear' | 'squirrel' | 'park' | 'block' | 'table' | 'river' | 'dog' | 'dogs' | 'pasta' | 'anchovies' | 'restaurant' | 'fork'
Adj -> 'angry' | 'frightened' | 'furry'
V -> 'chased' | 'saw' | 'eats' | 'eat' | 'chase' | 'Put' | 'have'
P -> 'on' | 'in' | 'along' | 'with'
""")

##sentence1 = "He eats pasta with a fork in the restaurant".split()
##parser1 = nltk.ChartParser(grammar)
##for tree1 in parser1.parse(sentence1):
##    # print(tree1)
##    print(tree1.draw())

sr = ShiftReduceParser(grammar)
sentence1 = "He eats pasta with some anchovies in the restaurant"
tokens = nltk.word_tokenize(sentence1)
for x in sr.parse(tokens):
    print(x.draw())

print("-------------------------------------------------------------------")

sentence1 = "He eats pasta with some anchovies in the restaurant".split()
parser1 = nltk.EarleyChartParser(grammar, trace=2)
for tree1 in parser1.parse(sentence1):
    print(tree1)
P -> 'في'|'الى'|'من'|'عن'|'على'
V0 -> 'تفتح'|'فاض'|'ثار'|'هبت'|'جلس'|'ضاع'|'خرج'|'نام'|'وقعد'|'سافر'|'صدق'
V1 -> 'طوى'|'أكل'|'بلل'|'زرع'|'أطفأ'|'يركب'|'يستجيب'|'حفظ'|'كتب'|'شاهد'|'قال'
V2 -> 'يسقي'|'كسا'|'أعطى'|'ظن'|'حسب'|'جعل'|'خال'|'منح'|'منع'|'ألبس'
""")

##### RecursiveDescentParser #####
tdParser = RecursiveDescentParser(grammar)

def rdp(s):
    for w in tdParser.parse(s.split()):
        print(w)

##### ShiftReduceParser #####
srParser = ShiftReduceParser(grammar_reduced, 2)

def srp(s):
    for w in srParser.parse(s.split()):
        print(w)

##### LeftCornerChartParser #####
lcParser = LeftCornerChartParser(grammar)

def lcp(s):
    for w in lcParser.parse(s.split()):
        print(w)

##### EarleyChartParser #####
eParser = EarleyChartParser(grammar)

def ep(s):
    for w in eParser.parse(s.split()):
        print(w)
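# Usage sketch: each helper prints every tree its parser finds for a
# whitespace-tokenized Arabic sentence. Whether a particular sentence parses
# depends on the grammar rules truncated above, e.g.:
#
#   ep('خرج الولد من البيت')    # Earley chart parser
#   srp('خرج الولد من البيت')   # shift-reduce (yields at most one tree)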
import os
import pickle
import time

import chainer
import nltk
from nltk import Nonterminal, induce_pcfg
from nltk.corpus import ptb
from nltk.parse import ShiftReduceParser, ViterbiParser
from nltk.probability import FreqDist

# preprocess_nt, is_number, and is_key are helper functions defined
# elsewhere in this module.


def main(test=False):
    """
    Makes a big dumb PTB CFG, and a ShiftReduceParser, and a ViterbiParser,
    and serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):
        productions = []
        add_dict = {}
        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):
            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:
                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)
                rhs = []
                for item in production._rhs:
                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))
                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')
                    # items not in dictionary replaced with <unk>;
                    # the dictionary requires lowercase keys
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')
                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')
                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())
                production._rhs = tuple(rhs)

                # deduplicate productions
                if not is_key(add_dict, production.unicode_repr()):
                    add_dict[production.unicode_repr()] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)
        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time
        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time
        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        # reload the serialized parsers so they are defined even when the
        # pickles already existed and the build steps above were skipped
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)
        for sample in [1, 23, 20330, 20332, 443]:
            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
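# Example invocation (a sketch; assumes the parsers/ and data/ directories
# used above exist). Once serialized, a parser can be reloaded directly:
if __name__ == '__main__':
    main(test=True)
    with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
        sr = pickle.load(f)
    # parse_one returns a single Tree, or None if the greedy shift-reduce
    # strategy dead-ends on the sentence
    print(sr.parse_one('the company said it expects higher sales'.split()))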
VP -> V NP | V
Det -> 'The' | 'a' | 'an'
N -> 'bear' | 'squirrel' | 'dog'
NP -> N
V -> 'eat' | 'eats'
""")

cp = nltk.ChartParser(grammar)
sentence = [s.split() for s in ['The bear eat an squirrel', 'The dog eats']]
for s in sentence:
    for node in cp.parse(s):
        print(' '.join(s))
        print(node)
        print(node.draw())
    print('\n------------\n')

from nltk.parse import ShiftReduceParser

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP | V
Det -> 'The' | 'a' | 'an'
N -> 'bear' | 'squirrel' | 'dog'
NP -> N
V -> 'eat' | 'eats'
""")

# using the ShiftReduceParser
sr = ShiftReduceParser(grammar)
sentence = [s.split() for s in ['The bear eat an squirrel', 'The dog eats']]
for s in sentence:
    for node in sr.parse(s):
        print(' '.join(s))
        print(node)
        print(node.draw())
    print('\n------------\n')
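# Quick comparison (a sketch): nltk.ChartParser enumerates every parse the
# grammar licenses, while ShiftReduceParser commits to greedy reductions and
# yields at most one tree (possibly none), as the counts below illustrate.
s = 'The dog eats'.split()
print(len(list(cp.parse(s))))  # all parses found by the chart parser
print(len(list(sr.parse(s))))  # 0 or 1 for the shift-reduce parser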