def test_production_from_grammar(self):
    grammar_str = """
    S -> NP VP
    PP -> P NP
    NP -> Det N | NP PP
    VP -> V NP | VP PP
    Det -> 'a' | 'the'
    N -> 'dog' | 'cat'
    V -> 'chased' | 'sat'
    P -> 'on' | 'in'
    """
    grammar = parse_cfg(grammar_str)
    productions = grammar.productions()

    expect_production = Production(lhs=Nonterminal("S"),
                                   rhs=[Nonterminal("NP"), Nonterminal("VP")])
    error_msg = "Expected to find '{}', but it is not in\n{}".format(
        expect_production, grammar_str)
    self.assertIn(expect_production, productions, error_msg)

    expect_production = Production(lhs=Nonterminal("N"), rhs=['dog'])
    error_msg = "Expected to find '{}', but it is not in\n{}".format(
        expect_production, grammar_str)
    self.assertIn(expect_production, productions, error_msg)

    # Category symbols must be Nonterminal objects, not plain strings.
    expect_not_in = Production(lhs="S", rhs=["NP", "VP"])
    error_msg = "Did not expect to find '{}' in\n{}".format(
        expect_not_in, grammar_str)
    self.assertNotIn(expect_not_in, productions, error_msg)

    # Terminals are stored without their surrounding quotes.
    expect_not_in = Production(lhs=Nonterminal("N"), rhs=["'dog'"])
    error_msg = "Did not expect to find '{}' in\n{}".format(
        expect_not_in, grammar_str)
    self.assertNotIn(expect_not_in, productions, error_msg)
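# For reference, the asymmetry the test above checks: parse_cfg wraps category
# symbols in Nonterminal objects and strips the quotes from terminal strings.
# A minimal standalone sketch, assuming the same NLTK 2-era API as the test:
from nltk.grammar import parse_cfg, Nonterminal

g = parse_cfg("S -> 'a'")
prod = g.productions()[0]
assert prod.lhs() == Nonterminal("S")   # the LHS is a Nonterminal object...
assert prod.lhs() != "S"                # ...not the plain string "S"
assert prod.rhs() == ('a',)             # terminals are stored without quotes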
def demo(): """ A demonstration of the recursive descent parser. """ from nltk import parse, parse_cfg grammar = parse_cfg(""" S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """) for prod in grammar.productions(): print(prod) sent = 'I saw a man in the park'.split() parser = parse.RecursiveDescentParser(grammar, trace=2) for p in parser.nbest_parse(sent): print(p)
def app(): """ Create a recursive descent parser demo, using a simple grammar and text. """ from nltk.grammar import parse_cfg grammar = parse_cfg( """ # Grammatical productions. S -> NP VP NP -> Det N PP | Det N VP -> V NP PP | V NP | V PP -> P NP # Lexical productions. NP -> 'I' Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' | 'telescope' V -> 'ate' | 'saw' P -> 'in' | 'under' | 'with' """ ) sent = "the dog saw a man in the park".split() RecursiveDescentApp(grammar, sent).mainloop()
def demo():
    N = 42
    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))
def _generate_demo():
    g = parse_cfg("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    Det -> 'a'
    N -> 'man' | 'park' | 'dog' | 'telescope'
    V -> 'saw' | 'walked'
    P -> 'in' | 'with'
    """)
    for s in generate(g):
        print(' '.join(s))
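# Worked check for the demo above: the grammar is finite (no recursion), so
# generate() enumerates every sentence. Each sentence has the shape
# Det N V Det N, giving 2 * 4 * 2 * 2 * 4 = 128 sentences in total.
# (The P productions are unreachable from S and contribute nothing.)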
def generateRawTemplates(depth):
    gram = parse_cfg(grammarstring)
    rawTemplates = generate(gram, depth=depth)
    templatefiles = []
    for index, state in enumerate(rawTemplates):
        filename = os.path.join("./templates", "template" + str(index))
        with open(filename, 'w') as templatefile:
            templatefile.write(' '.join(state))
        templatefiles.append(filename)
    # Count the files actually written; this stays correct even if generate()
    # returns an iterator rather than a list.
    print "%d template files generated" % len(templatefiles)
    return templatefiles
def parseSentence(toks, grammarWoTerm, cfg):
    if cfg:
        gr = grammar.parse_cfg(grammarWoTerm)
        parser = parse.BottomUpChartParser(gr)
    else:
        termRules = []
        for kr in toks:
            termRules.append(kr2terminals.getRuleFromKr(kr))
        fullGrammar = '\n'.join(grammarWoTerm + termRules)
        gr = grammar.parse_fcfg(fullGrammar)
        parser = parse.FeatureBottomUpChartParser(gr)
    chart = parser.chart_parse(toks)
    return chart
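# A hedged usage sketch for the function above: chart_parse returns a Chart,
# and complete trees can be read off with Chart.parses(root) (NLTK 2-era API).
# The toy grammar below is illustrative, not from the original module.
from nltk import grammar, parse

gr = grammar.parse_cfg("S -> 'a' 'b'")
parser = parse.BottomUpChartParser(gr)
chart = parser.chart_parse(['a', 'b'])
for tree in chart.parses(gr.start()):
    print(tree)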
def __init__(self, grammar, length=1):
    """Convert the grammar to Chomsky Normal Form and do preprocessing.

    `grammar` can be:
        (1) an instance of nltk.grammar.ContextFreeGrammar,
        (2) a string giving the path to a .cfg file, or
        (3) a string that can be parsed into a grammar by parse_cfg.

    `length` is the maximum string length that should be preprocessed.
    """
    if length < 1:
        raise ValueError('length must be greater than 0.')

    # self.grammar must be an instance of nltk.grammar.ContextFreeGrammar.
    if isinstance(grammar, ContextFreeGrammar):
        self.grammar = grammar
    elif isinstance(grammar, str) and grammar.endswith('.cfg'):
        self.grammar = nltk.data.load('file:' + grammar)
    elif isinstance(grammar, str):
        self.grammar = parse_cfg(grammar)
    else:
        raise ValueError('Arg grammar must be nltk.grammar.ContextFreeGrammar or str.')

    if not self.grammar.is_chomsky_normal_form():
        #raise ValueError('Input grammar must be in CNF '
        #                 '(conversion method isn\'t implemented)')
        self.grammar = convert_to_cnf(self.grammar)
    assert self.grammar.is_chomsky_normal_form()

    self.productions = self.grammar.productions()
    # TODO: Is it OK to assume all nonterminals occur on a LHS?
    # Technically yes, but check whether nltk's is_cnf ensures it.
    self.nonterminals = set(p.lhs() for p in self.productions)
    self.terminals = set(token for prod in self.productions
                         for token in prod.rhs()
                         if not isinstance(token, Nonterminal))

    # Initialize self._counts, then populate it in _preprocess().
    # self.length is the string length that has been preprocessed.
    self._counts = {}
    self.length = 0
    self._preprocess(length)
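# A hedged usage sketch for the constructor above. The enclosing class name is
# not shown in this excerpt, so "StringCounter" is a hypothetical stand-in, as
# is the assumption that _preprocess() fills self._counts for lengths <= length.
# The toy grammar is already in CNF, so convert_to_cnf() is never invoked.
counter = StringCounter("""
    S -> A B
    A -> 'a'
    B -> 'b'
    """, length=2)
print(counter.terminals)      # the bare strings 'a' and 'b'
print(counter.nonterminals)   # the LHS Nonterminals: S, A, B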
def app(): """ Create a recursive descent parser demo, using a simple grammar and text. """ from nltk.grammar import parse_cfg grammar = parse_cfg(""" # Grammatical productions. S -> NP VP NP -> Det N PP | Det N VP -> V NP PP | V NP | V PP -> P NP # Lexical productions. NP -> 'I' Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' | 'telescope' V -> 'ate' | 'saw' P -> 'in' | 'under' | 'with' """) sent = 'the dog saw a man in the park'.split() RecursiveDescentApp(grammar, sent).mainloop()
def demo(): """ A demonstration of the shift-reduce parser. """ from nltk import parse, parse_cfg grammar = parse_cfg(""" S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """) sent = 'I saw a man in the park'.split() parser = parse.ShiftReduceParser(grammar, trace=2) for p in parser.nbest_parse(sent): print p
def test_from_cfg(self):
    grammar = parse_cfg("""S -> 's' | A B\n A -> 'a'\n B -> 'b'""")
def __init__(self, Tagfile = "dummy.crf", Grammar = "dummy.cfg", Promptings = "dummy.prmz", Defsfile = "dummy.best",\ Topic = "Potpourri", IM = False, TRON = 1): """Initialize the grammar by first loading in the file of acceptible Evidence Implies Inference (EII) format parses, then reading in and reformatting the input sentence, before testing its acceptibility and printing output accordingly.""" if (TRON > 2): print Tagfile, Grammar, Promptings, Defsfile, Topic, IM, TRON print Tagfile, Grammar, Promptings, Defsfile, Topic, IM, TRON ## Zerost, set the file and parameter defaults. self.tagfile = Tagfile self.grammar = Grammar self.promptings = Promptings self.defsfile = Defsfile self.old_topic = Topic self.for_IM = IM self.nolines = 0 self.N = 4 self.new_topic = self.old_topic self.noun_tags = ["NN", "NNS", "FW"] self.promptText = "\tUSER> " ## Flag for whether this is the initial run, used for resolving word definitions. initial = True altered = dict() words = dict() ## First, read in the grammar file. str = '' gramfile = open(self.grammar, 'r') for line in gramfile.readlines(): str += line gramfile.close() ## Then, initialize the tagger. # HMMT = hmmtagger.getHMMTagger(self.tagfile, self.nolines) # self.HMMTagger = HMMT.trainHMM() (self.SeqTagger, self.train_data) = self.getTagger(self.tagfile, self.nolines, self.N) if (TRON > 1): self.write("This tagger is %0.4f%% accurate." % self.score()) ## Next, we will read in the ambiguous sentence prompts and conversions. (self.messages, self.transforms) = self.init_messages(self.promptings) self.ideal = self.getBest(self.defsfile) received = "" playing = True self.write("Welcome to Revelator!\n") while (playing): ## After that, read in the play or commands from the user. For now, we will take only one at a time. ## TODO 1: Adapt for multiple sentences per input. Consider batch_parse and the like. ## We will keep taking input until the user wishes to quit. received = self.getPlay(self.promptText) ## Skip the rest of stuff if the user just types in a command. ternary = self.aCommand(received) if (ternary != 0): self.write("") if (ternary == -1): playing = False continue ## And this lets us self-define words. # (received, words, altered) = define.paran(received, words, altered, self.ideal) ## We can self-define words in this part of the code. ## N. B. that we will tokenize and tag the input twice, once before and once after, so that we can extract ## the paranthetical definitiions and not have them affect the final tagging. (words, altered, allNPs, received) = self.call_paranNP(received, words, altered, self.ideal, TRON > 2) ## Bail-out code: # playing = False # print received # continue ## Add the terminals to the grammar. We will parse the if- and then-clauses separately. (if_clause, then_clause, str, pos_if, pos_then) = self.listen(str, received) if (TRON > 2): print "IF: ", if_clause print "THEN: ", then_clause ## This gets rid of unnecessary punctuation. # if_clause = self.strip_punc(if_clause) # then_clause = self.strip_punc(then_clause) ## Now, we get to the grammar. EII_grammar = grammar.parse_cfg(str) self.EII_Earley = EarleyChartParser(EII_grammar, trace = TRON) ## Following that, we (re)define common nouns. (altered, self.ideal, words) = self.get_defs(if_clause.split(), pos_if.split(), altered, self.ideal, words, initial, allNPs) (altered, self.ideal, words) = self.get_defs(then_clause.split(), pos_then.split(), altered, self.ideal, words, initial, allNPs) ## Verify the defintions (debug only). 
# for NP in allNPs.values(): # print "NP", NP, "is defined as", words[NP], "." ## Then, test this sentence against our grammar. ## IF clause: valid_if = self.parse(if_clause, pos_if, "evidence") ## THEN clause: valid_then = self.parse(then_clause, pos_then, "inference") initial = False ## If valid, print a copy of the evidence and inference to STDOUT. Otherwise, tell the user what was wrong. if (valid_if and valid_then): self.printEI(if_clause, then_clause, altered, words, allNPs) elif (valid_if): self.write("I was able to understand your evidential statement, but your inferential statement did not parse. Could you please restate\nyour entire play?") else: self.write("I was able to understand your statement of inference, but your statement of evidence did not parse. Could you please restate\nyour entire play?") ## Reset defaults. self.promptText = "\tUSER> " self.write("Have a nice day! Come play again sometime soon!\n")
def load(resource_url, format='auto', cache=True, verbose=False,
         logic_parser=None, fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that fails,
    C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache when
        no more objects are using it.
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical
        expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
        feature structure of an fcfg.
    """
    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get(resource_url)
        if resource_val is not None:
            if verbose:
                print '<<Using cached copy of %s>>' % (resource_url,)
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print '<<Loading %s>>' % (resource_url,)

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)

    # Load the resource.
    if format == 'pickle':
        resource_val = pickle.load(_open(resource_url))
    elif format == 'yaml':
        resource_val = yaml.load(_open(resource_url))
    elif format == 'cfg':
        resource_val = cfg.parse_cfg(_open(resource_url).read())
    elif format == 'pcfg':
        resource_val = cfg.parse_pcfg(_open(resource_url).read())
    elif format == 'fcfg':
        resource_val = cfg.parse_fcfg(_open(resource_url).read(),
                                      logic_parser=logic_parser,
                                      fstruct_parser=fstruct_parser)
    elif format == 'fol':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=sem.logic.LogicParser())
    elif format == 'logic':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=logic_parser)
    elif format == 'val':
        resource_val = sem.parse_valuation(_open(resource_url).read())
    elif format == 'raw':
        resource_val = _open(resource_url).read()
    else:
        assert format not in FORMATS
        raise ValueError('Unknown format type!')

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[resource_url] = resource_val
        except TypeError:
            # We can't create weak references to some object types, like
            # strings and tuples.  For now, just don't cache them.
            pass

    return resource_val
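# A hedged usage sketch for load() above. The .cfg path is hypothetical; the
# "file:" protocol and extension-based format detection follow the docstring.
from nltk import data

g1 = data.load('file:my_grammar.cfg')   # format inferred from the extension
g2 = data.load('file:my_grammar.cfg')   # served from the weak-reference cache
assert g1 is g2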
def demo():
    print('Generating all sentences for demo grammar:')
    print(demo_grammar)
    grammar = parse_cfg(demo_grammar)
    for sent in generate(grammar):
        print(' '.join(sent))
def _multiply(frag1, frag2):
    frags = []
    if len(frag1) == 1:
        frag1 = [frag1]
    if len(frag2) == 1:
        frag2 = [frag2]
    for f1 in frag1:
        for f2 in frag2:
            frags.append(f1 + f2)
    return frags

grammar = parse_cfg("""
  S -> NP VP
  NP -> Det N | Pro
  NP_PP -> Det N_PP
  PP -> P NP_PP
  VP -> V PP
  Det -> 'the'
  Det -> 'a'
  Pro -> 'he' | 'she' | 'they' | 'we'
  N -> 'man' | 'boy' | 'person' | 'woman' | 'girl'
  N_PP -> 'store' | 'supermarket'
  V -> 'went' | 'walked' | 'drove' | 'ran'
  P -> 'to'
""")

for sent in generate(grammar):
    print ' '.join(sent)
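# Worked check for the grammar above: NP expands to Det N (2 * 5 = 10 ways) or
# Pro (3 ways), so 13 noun phrases; VP -> V PP with PP -> P Det N_PP gives
# 4 * 1 * 2 * 2 = 16 verb phrases; generate() should therefore enumerate
# 13 * 16 = 208 sentences.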
            # (Fragment: the tail of _generate_all; its opening lines are
            # elided in this excerpt.)
            for frag2 in _generate_all(grammar, items[1:]):
                for frag in _multiply(frag1, frag2):
                    frags.append(frag)
    return frags

def _multiply(frag1, frag2):
    frags = []
    if len(frag1) == 1:
        frag1 = [frag1]
    if len(frag2) == 1:
        frag2 = [frag2]
    for f1 in frag1:
        for f2 in frag2:
            frags.append(f1 + f2)
    return frags

grammar = parse_cfg("""
  S -> NP VP
  NP -> Det N
  VP -> V NP
  Det -> 'the'
  Det -> 'a'
  N -> 'man' | 'park' | 'dog' | 'telescope'
  V -> 'saw' | 'walked'
  P -> 'in' | 'with'
""")

for sent in generate(grammar):
    print ' '.join(sent)
def _generate_all(grammar, items, depth):
    if items:
        for frag1 in _generate_one(grammar, items[0], depth):
            for frag2 in _generate_all(grammar, items[1:], depth):
                yield frag1 + frag2
    else:
        yield []

def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth - 1):
                    yield frag
        else:
            yield [item]

radio_grammar = """
  S -> NP VP PP
  NP -> NNP Det Det NNP
  VP -> 'is' Det N N Det
  PP -> JJ JJ NNP
  NNP -> 'In Rainbows' | 'Basement' | 'Radiohead'
  JJ -> 'English' | 'alternative' | 'rock' | 'band'
  Det -> 'the' | 'a' | 'by' | 'from'
  N -> 'video' | 'album' | 'rock' | 'band'
"""

grammar = parse_cfg(radio_grammar)
for n, sent in enumerate(generate(grammar, n=100), 1):
    print('%3d. %s' % (n, ' '.join(sent)))
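# A hedged sketch of the depth cutoff implemented by _generate_one above: each
# step into a production's right-hand side decrements depth, and a nonterminal
# reached at depth 0 yields nothing, so even recursive grammars terminate.
# Assuming a generate() that threads depth through these helpers, as in NLTK's
# nltk.parse.generate:
recursive = parse_cfg("""
S -> 'a' S
S -> 'a'
""")
for sent in generate(recursive, depth=3):
    print(' '.join(sent))
# Should print only the depth-bounded strings, e.g.:
#   a a
#   a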