def demo():
    """Demonstrate the recursive descent parser on a toy grammar.

    Builds a small English CFG, prints every production in it, then
    traces (trace=2) all parses of the sentence
    "I saw a man in the park".
    """
    from nltk import CFG, parse

    toy_grammar = CFG.fromstring(
        """
        S -> NP VP
        NP -> Det N | Det N PP
        VP -> V NP | V NP PP
        PP -> P NP
        NP -> 'I'
        N -> 'man' | 'park' | 'telescope' | 'dog'
        Det -> 'the' | 'a'
        P -> 'in' | 'with'
        V -> 'saw'
        """
    )

    # Show the grammar we are about to parse with.
    for production in toy_grammar.productions():
        print(production)

    tokens = "I saw a man in the park".split()
    rd_parser = parse.RecursiveDescentParser(toy_grammar, trace=2)
    for tree in rd_parser.parse(tokens):
        print(tree)
def parse(self, p_string):
    """
    Parse a query string and store the resulting hierarchy of
    "domains", "hierarchies" and "tables".

    The string is parsed with the nltk context free grammar library.

    A query is a "sentence" and can either be a domain, hierarchy or
    a table:
      * a domain is simply a word;
      * a hierarchy is expressed as "domain/domain";
      * a table is expressed as "table(sentence, sentence, sentence)".

    Internally the query is represented as an nltk.parse.tree.

    Process:
      1. string is tokenized
      2. develop a context free grammar
      3. parse
      4. convert to a tree representation

    Side effects: sets self.string, self.parseList, self.nltktree,
    self.parseTree and self.xml. On a parse failure it prints a
    message and returns early, leaving self.nltktree as None.
    """
    self.nltktree = None

    # Store the query string.
    self.string = p_string

    # Tokenize the query string, allowing only strings, parentheses,
    # forward slashes and commas.
    re_all = r'table[(]|\,|[)]|[/]|\w+'
    data_tokens = tokenize.regexp_tokenize(self.string, re_all)

    # Develop a context free grammar.
    # O = sentence/operator, T = table, H = hierarchy, D = domain
    O, T, H, D = nonterminals('O, T, H, D')

    # Specify the grammar.
    productions = (
        # A sentence can be either a table, hierarchy or domain.
        Production(O, [D]),
        Production(O, [H]),
        Production(O, [T]),
        # A table must be the following sequence:
        # "table(", sentence, comma, sentence, comma, sentence, ")"
        Production(T, ['table(', O, ',', O, ',', O, ')']),
        # A hierarchy must be the following sequence:
        # domain, forward slash, domain
        Production(H, [D, '/', D]),
        # domain, forward slash, another operator
        Production(H, [D, '/', O]),
    )

    # Add domains to the cfg productions.
    # A domain is a token that is entirely word chars.
    re_domain = compile(r'^\w+$')
    # Try every token and add a D -> token production if it matches
    # the above regular expression.  (productions is a tuple, so each
    # new production is appended by concatenating a 1-tuple.)
    for tok in data_tokens:
        if re_domain.match(tok):
            productions = productions + (Production(D, [tok]),)

    # Make a grammar out of our productions.
    grammar = ContextFreeGrammar(O, productions)
    rd_parser = parse.RecursiveDescentParser(grammar)

    # Tokens need to be redefined.
    # It disappears after first use, and I don't know why.
    # (regexp_tokenize presumably returns a one-shot iterator —
    # TODO confirm against the nltk version in use.)
    tokens = tokenize.regexp_tokenize(self.string, re_all)
    toklist = list(tokens)

    # Store the parsing.
    # Only the first one, as the grammar should be completely
    # nonambiguous.
    try:
        self.parseList = rd_parser.get_parse_list(toklist)[0]
    except IndexError:
        # FIX: was a Python 2 print statement ('print "..."'), which
        # is a SyntaxError under Python 3; the rest of the file uses
        # print() calls.
        print("Could not parse query.")
        return

    # Convert the parse into a bracketed string that nltk's
    # bracket_parse understands by stripping the punctuation tokens
    # the grammar introduced.
    string = str(self.parseList)
    string2 = string.replace(":", "").replace("')'", "").replace(
        "table(", "").replace("','", "").replace("'", "").replace("/", "")
    self.nltktree = parse.tree.bracket_parse(string2)

    # Store the resulting nltk.parse.tree tree.
    self.parseTree = QuerySentence(self.nltktree)
    self.xml = self.parseTree.toXML()
# min for the float is 7.981037055632809e-06 and scientific notation not allowed... split_rule[2] = "%.10f" % exp(float(split_rule[2])) # float_prob = exp(float(split_rule[2])) arabic_pcfg[i] = split_rule[0] + " -> " + split_rule[1] + " [" + split_rule[2] + "]" # update arabic_pcfg inplace distinct_words.symmetric_difference(distinct_words_from_pcfg) arabic_pcfg = "\n".join(arabic_pcfg) grammar = nltk.PCFG.fromstring(arabic_pcfg) test_sent = read_from_file("dev_sents") test_sent = [test[0].split(" ") for test in test_sent] from nltk import parse, pchart parser = parse.RecursiveDescentParser(grammar, trace=2) # for p in parser.parse(test_sent[0]): # print(p) random_parser = pchart.RandomChartParser(grammar) inside_parser = pchart.InsideChartParser(grammar) for p in inside_parser.parse(test_sent[0]): print(p) # # list(parser.parse(test)) # output = " ".join([str(p) for p in parser.parse(test)]) # # # Not working aon