def demo():
    """
    A demonstration showing how C{Grammar}s can be created and used.
    """
    from nltk import cfg

    # Create some nonterminals
    S, NP, VP, PP = cfg.nonterminals('S, NP, VP, PP')
    N, V, P, Det = cfg.nonterminals('N, V, P, Det')
    VP_slash_NP = VP / NP

    print 'Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP_slash_NP]
    print '    S.symbol() =>', repr(S.symbol())
    print

    print cfg.Production(S, [NP])

    # Create some Grammar Productions
    grammar = cfg.parse_grammar("""
      S -> NP VP
      PP -> P NP
      NP -> Det N
      NP -> NP PP
      VP -> V NP
      VP -> VP PP
      Det -> 'a'
      Det -> 'the'
      N -> 'dog'
      N -> 'cat'
      V -> 'chased'
      V -> 'sat'
      P -> 'on'
      P -> 'in'
    """)

    print 'A Grammar:', repr(grammar)
    print '    grammar.start()       =>', repr(grammar.start())
    print '    grammar.productions() =>',
    # Use str.replace(...) to line-wrap the output.
    print repr(grammar.productions()).replace(',', ',\n' + ' ' * 25)
    print
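# A minimal sketch of building a production by hand (assumes the same
# pre-1.0 nltk.cfg API as the demo above; the lhs()/rhs() accessors are
# the ones the other demos in this collection rely on):
def _production_sketch():
    from nltk import cfg
    S, NP, VP = cfg.nonterminals('S, NP, VP')
    prod = cfg.Production(S, [NP, VP])
    # Print the left- and right-hand sides of the production.
    print prod.lhs(), '->', prod.rhs()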
def parse(self, p_string):
    """
    Parses a string and stores the resulting hierarchy of "domains",
    "hierarchies" and "tables".

    For the sake of NLP, the string is parsed with the nltk
    context-free grammar library.

    A query is a "sentence" and can be a domain, a hierarchy or a
    table.  A domain is simply a word.  A hierarchy is expressed as
    "domain/domain".  A table is expressed as
    "table(sentence, sentence, sentence)".

    Internally the query is represented as an nltk.parse.tree.

    Process:
    1. The string is tokenized.
    2. A context-free grammar is built.
    3. The token stream is parsed.
    4. The parse is converted to a tree representation.
    """
    self.nltktree = None

    # Store the query string
    self.string = p_string

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp(self.string, re_all)

    # Develop a context-free grammar
    # O = sentence, T = table, H = hierarchy, D = domain
    O, T, H, D = cfg.nonterminals('O, T, H, D')

    # Specify the grammar
    productions = [
        # A sentence can be either a table, a hierarchy or a domain
        cfg.Production(O, [D]),
        cfg.Production(O, [H]),
        cfg.Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        cfg.Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        cfg.Production(H, [D, '/', D]),
        # domain, forward slash, another sentence
        cfg.Production(H, [D, '/', O]),
        ]

    # Add domains to the CFG productions.
    # A domain is a token that consists entirely of word characters
    # (assumes ``import re`` at module level).
    re_domain = re.compile(r'^\w+$')
    # Try every token, and add a production for each one that matches.
    for tok in data_tokens:
        if re_domain.match(tok):
            productions.append(cfg.Production(D, [tok]))

    # Make a grammar out of our productions
    grammar = cfg.Grammar(O, productions)
    rd_parser = parse.RecursiveDescentParser(grammar)

    # tokenize.regexp() returns an iterator, which the loop above has
    # already exhausted, so the query string must be tokenized again.
    tokens = tokenize.regexp_tokenize(self.string, re_all)
    toklist = list(tokens)

    # Store the parse.  Only the first one is kept, as the grammar
    # should be completely unambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        print "Could not parse query."
        return

    # Flatten the parse into a bracketed string and rebuild it as the
    # nltk.parse.tree for this query.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk.parse.tree tree
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
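# For reference, the token stream produced by the regular expression above
# can be previewed with the standard re module alone (a sketch; the query
# string is a made-up example):
def _tokenizer_sketch():
    import re
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    # Non-matching characters (here, the spaces) are simply skipped.
    print re.findall(re_all, 'table(a/b, c, d)')
    # ['table(', 'a', '/', 'b', ',', 'c', ',', 'd', ')']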
def demo():
    """
    Create a shift-reduce parser demo, using a simple grammar and text.
    """
    from nltk import cfg

    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s)
                                           for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
        )

    grammar = cfg.Grammar(S, productions)

    # Tokenize the sentence.
    sent = 'my dog saw a man in the park with a statue'.split()

    ShiftReduceDemo(grammar, sent).mainloop()
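# A batch-mode sketch of the same grammar and sentence, assuming a
# parse.ShiftReduceParser class (the parser that ShiftReduceDemo wraps)
# with the get_parse_list convention used elsewhere in this collection;
# demo_batch is a hypothetical helper, not part of the original demo:
def demo_batch(grammar, sent):
    from nltk import parse
    sr_parser = parse.ShiftReduceParser(grammar)
    # Note: a shift-reduce parser can miss parses by reducing too early,
    # so an empty result does not prove the sentence is ungrammatical.
    for tree in sr_parser.get_parse_list(sent):
        print tree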
def demo():
    import time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg)),
        ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)),
        cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)),
        cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),
        cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),
        cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),
        cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)

    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk import tokenize
    tokens = list(tokenize.whitespace(sent))

    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree
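# The lexicon built above maps each upcased word to the list of categories
# it can bear.  A plain-Python sketch of the same setdefault() pattern
# ('NP' and 'Name' stand in for the GrammarCategory objects the real code
# stores):
def _lexicon_sketch():
    lex = {}
    # setdefault() inserts an empty list the first time a key is seen,
    # then keeps appending to that same list.
    lex.setdefault('JOHN', []).append('NP')
    lex.setdefault('JOHN', []).append('Name')
    print lex.get('John'.upper(), [])     # ['NP', 'Name']
    print lex.get('unknown'.upper(), [])  # [] for out-of-vocabulary words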
def demo():
    import time

    S = FeatStructNonterminal('S')
    VP = FeatStructNonterminal('VP')
    NP = FeatStructNonterminal('NP')
    PP = FeatStructNonterminal('PP')
    V = FeatStructNonterminal('V')
    N = FeatStructNonterminal('N')
    P = FeatStructNonterminal('P')
    Name = FeatStructNonterminal('Name')
    Det = FeatStructNonterminal('Det')
    DetSg = FeatStructNonterminal('Det[-pl]')
    DetPl = FeatStructNonterminal('Det[+pl]')
    NSg = FeatStructNonterminal('N[-pl]')
    NPl = FeatStructNonterminal('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg)),
        ]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)),
        cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)),
        cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),
        cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),
        cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),
        cfg.Production(P, ('under',)),
        ]

    earley_lexicon = defaultdict(list)
    for prod in lexical_productions:
        earley_lexicon[prod.rhs()[0]].append(prod.lhs())
    #print "Lexicon:"
    #print earley_lexicon

    earley_grammar = cfg.Grammar(S, grammatical_productions, earley_lexicon)
    print earley_grammar

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    tokens = sent.split()

    t = time.time()
    cp = FeatureEarleyChartParser(earley_grammar, trace=1)
    trees = cp.nbest_parse(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree
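# Unlike the previous demo, this lexicon is keyed by the literal word (no
# .upper() normalization), and defaultdict(list) replaces the setdefault()
# pattern.  A minimal sketch of the difference ('N[-pl]' stands in for the
# FeatStructNonterminal objects the real code stores):
def _defaultdict_sketch():
    from collections import defaultdict
    lex = defaultdict(list)
    lex['dog'].append('N[-pl]')  # a missing key is created as [] on access
    print lex['dog']             # ['N[-pl]']
    print lex['cat']             # [] -- note the lookup itself adds the key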