def demo2():
    """Launch the CFG demo window on a small toy English grammar."""
    from nltk import Nonterminal, Production, ContextFreeGrammar

    # Build one Nonterminal object per symbol name.
    symbol_names = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        Nonterminal(name) for name in symbol_names.split()]

    productions = (
        # Syntactic productions.
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),
        # Lexical productions.
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )

    grammar = ContextFreeGrammar(S, productions)
    text = 'I saw a man in the park'.split()
    demo_window = CFGDemo(grammar, text)
    demo_window.mainloop()
def if_then_else_demo():
    """
    Demo the recursive-descent parser on an LL(1)-style arithmetic
    expression grammar (E/T/F with +, * and parentheses).
    NOTE(review): despite the function name, the grammar here is an
    arithmetic-expression grammar, not if-then-else.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    symbol_names = 'E E1 PLUS T T1 TIMES F LPAREN RPAREN ID'
    (E, E1, PLUS, T, T1, TIMES, F, LPAREN, RPAREN, ID) = [
        Nonterminal(name) for name in symbol_names.split()]

    productions = (
        # Expression / term / factor structure with right-recursive tails.
        Production(E, [T, E1]),
        Production(E1, [PLUS, T, E1]),
        Production(E1, []),
        Production(T, [F, T1]),
        Production(T1, [TIMES, F, T1]),
        Production(T1, []),
        Production(F, [LPAREN, E, RPAREN]),
        Production(F, [ID]),
        # Terminals.
        Production(PLUS, ['+']),
        Production(TIMES, ['*']),
        Production(LPAREN, ['(']),
        Production(RPAREN, [')']),
        Production(ID, ['a']),
        Production(ID, ['b']),
        Production(ID, ['c']),
    )

    grammar = ContextFreeGrammar(E, productions)
    text = "a * b + c".split()
    RecursiveDescentApp(grammar, text).mainloop()
def add_new_vocab_rule(self, rule):
    """
    Append one vocabulary production (given as a (lhs, rhs) pair) to the
    rule set, then rebuild self.cfg and self.parser so the new rule
    takes effect immediately.
    """
    lhs, rhs = rule[0], rule[1]
    self.rules.append(Production(NT(lhs), rhs))
    # Recreate the grammar and parser from the full, updated rule list.
    self.cfg = ContextFreeGrammar(NT("S"), self.rules)
    self.parser = EarleyChartParser(self.cfg, trace=0)
def set_grammar(self, grammar):
    """
    Assign a new grammar to the parser.

    Args:
    - grammar: the productions used to build a new
      :class:`ContextFreeGrammar` rooted at the 'S' nonterminal.
    """
    start = Nonterminal('S')
    self._grammar = ContextFreeGrammar(start, grammar)
def parse_NP(self, sen): """ Parses a partial sentence (that is, usually a noun phrase. Returns the parse, or returns a tuple. """ try: cfg_temp = ContextFreeGrammar(NT("NP"), self.rules) parser_temp = EarleyChartParser(cfg_temp, trace=0) parse = parser_temp.nbest_parse(sen.strip().split(" "), trace=0) except: print traceback.format_exc() else: if parse: return parse[0] print "failure" return None
def fail_demo():
    """
    Demo a grammar on which backtracking parsing does not succeed for
    every input.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    S = Nonterminal('S')
    A = Nonterminal('A')
    productions = (
        Production(S, [A, S, A]),
        Production(S, [A, A]),
        Production(A, ['a']),
    )
    grammar = ContextFreeGrammar(S, productions)

    # An odd-length run of 'a's cannot be derived by this grammar.
    text = "a a a a a a".split()
    #text = "a a a a".split()
    RecursiveDescentApp(grammar, text).mainloop()
def __init__(self, parent, cfg=None, set_cfg_callback=None):
    # Build a CFG-editing window as a Toplevel child of `parent`.
    # `set_cfg_callback`, if given, is invoked later with the edited grammar.
    self._parent = parent
    # Fall back to an empty grammar rooted at S when none is supplied.
    if cfg is not None: self._cfg = cfg
    else: self._cfg = ContextFreeGrammar(Nonterminal('S'), [])
    self._set_cfg_callback = set_cfg_callback
    # Flag controlling highlighting of matching nonterminals in the editor.
    self._highlight_matching_nonterminals = 1
    # Create the top-level window.
    self._top = Toplevel(parent)
    self._init_bindings()
    # Pack order matters: start-symbol frame on top, production frame
    # filling the middle, button row pinned to the bottom.
    self._init_startframe()
    self._startframe.pack(side='top', fill='x', expand=0)
    self._init_prodframe()
    self._prodframe.pack(side='top', fill='both', expand=1)
    self._init_buttons()
    self._buttonframe.pack(side='bottom', fill='x', expand=0)
    # Give keyboard focus to the production text widget.
    self._textwidget.focus()
def app():
    """
    Create a shift reduce parser app, using a simple grammar and text.
    """
    from nltk.grammar import Nonterminal, Production, ContextFreeGrammar

    symbol_names = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [
        Nonterminal(name) for name in symbol_names.split()]

    productions = (
        # Syntactic productions.
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical productions.
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    grammar = ContextFreeGrammar(S, productions)

    # tokenize the sentence
    sent = 'my dog saw a man in the park with a statue'.split()

    ShiftReduceApp(grammar, sent).mainloop()
def __init__(self, rules_file="rules.gr", vocab_file="vocabulary.gr"):
    """
    Read grammar rules (from rules_file) and vocabulary rules (from
    vocab_file), then build self.cfg (a ContextFreeGrammar) and
    self.parser (an EarleyChartParser).

    Each non-blank, non-comment line has its first two characters
    skipped (presumably a fixed-width weight/number prefix -- confirm
    against the .gr files) and is then split at the first tab into an
    LHS symbol and an RHS sequence.
    """
    self.rules = []

    # Syntactic rules: the RHS is a sequence of nonterminals.
    # `with` guarantees the file is closed even if parsing raises;
    # iterating the file object replaces the manual readline() loop.
    with open(rules_file, "r") as grammar:
        for line in grammar:
            if line.strip() == "" or line.strip().startswith("#"):
                continue
            lhs, _, rhs = line[2:].partition("\t")
            symbols = [NT(x) for x in rhs.strip().split(" ")]
            self.rules.append(Production(NT(lhs.strip()), symbols))

    # Vocabulary rules: the RHS is a sequence of lower-cased terminals.
    with open(vocab_file, "r") as vocab:
        for line in vocab:
            if line.strip() == "" or line.strip().startswith("#"):
                continue
            lhs, _, rhs = line[2:].partition("\t")
            terminals = rhs.strip().lower().split(" ")
            self.rules.append(Production(NT(lhs.strip()), terminals))

    # Create the grammar and parser from the collected productions.
    self.cfg = ContextFreeGrammar(NT("S"), self.rules)
    self.parser = EarleyChartParser(self.cfg, trace=0)
def _apply(self, *e):
    # Build a grammar from the edited productions and the start-symbol
    # entry, then hand it to the registered callback (if any).
    productions = self._parse_productions()
    start_symbol = Nonterminal(self._start.get())
    new_cfg = ContextFreeGrammar(start_symbol, productions)
    if self._set_cfg_callback is not None:
        self._set_cfg_callback(new_cfg)
def parse(self, p_string): """ Parses a string and stores the resulting hierarchy of "domains" "hierarchies" and "tables" For the sake of NLP I've parsed the string using the nltk context free grammar library. A query is a "sentence" and can either be a domain, hierarchy or a table. A domain is simply a word. A hierarchy is expressed as "domain/domain" A table is exressed as "table(sentence, sentence, sentence)" Internally the query is represented as a nltk.parse.tree Process: 1. string is tokenized 2. develop a context free grammar 3. parse 4. convert to a tree representation """ self.nltktree = None # Store the query string self.string = p_string # Tokenize the query string, allowing only strings, parentheses, # forward slashes and commas. re_all = r'table[(]|\,|[)]|[/]|\w+' data_tokens = tokenize.regexp_tokenize(self.string, re_all) # Develop a context free grammar # S = sentence, T = table, H = hierarchy, D = domain O, T, H, D = nonterminals('O, T, H, D') # Specify the grammar productions = ( # A sentence can be either a table, hierarchy or domain Production(O, [D]), Production(O, [H]), Production(O, [T]), # A table must be the following sequence: # "table(", sentence, comma, sentence, comma, sentence, ")" Production(T, ['table(', O, ',', O, ',', O, ')']), # A hierarchy must be the following sequence: # domain, forward slash, domain Production(H, [D, '/', D]), # domain, forward slash, another operator Production(H, [D, '/', O])) # Add domains to the cfg productions # A domain is a token that is entirely word chars re_domain = compile(r'^\w+$') # Try every token and add if it matches the above regular expression for tok in data_tokens: if re_domain.match(tok): prod = Production(D, [tok]), productions = productions + prod # Make a grammar out of our productions grammar = ContextFreeGrammar(O, productions) rd_parser = parse.RecursiveDescentParser(grammar) # Tokens need to be redefined. # It disappears after first use, and I don't know why. 
tokens = tokenize.regexp_tokenize(self.string, re_all) toklist = list(tokens) # Store the parsing. # Only the first one, as the grammar should be completely nonambiguous. try: self.parseList = rd_parser.get_parse_list(toklist)[0] except IndexError: print "Could not parse query." return # Set the nltk.parse.tree tree for this query to the global sentence string = str(self.parseList) string2 = string.replace(":", "").replace("')'", "").replace( "table(", "").replace("','", "").replace("'", "").replace("/", "") self.nltktree = parse.tree.bracket_parse(string2) # Store the resulting nltk.parse.tree tree self.parseTree = QuerySentence(self.nltktree) self.xml = self.parseTree.toXML()
import nltk from nltk.corpus import treebank from nltk.grammar import ContextFreeGrammar, Nonterminal tbank_productions = set(production for sent in treebank.parsed_sents() for production in sent.productions()) tbank_grammar = ContextFreeGrammar(Nonterminal('S'), list(tbank_productions)) print tbank_grammar