def parseSentence(toks, grammarWoTerm, cfg):
    """Parse one sentence's tokens and return the resulting chart.

    If *cfg* is true, *grammarWoTerm* is used directly as a plain
    context-free grammar.  Otherwise terminal rules are derived from the
    KR codes in *toks* (via kr2terminals.getRuleFromKr) and appended to
    *grammarWoTerm* before building a feature grammar.
    """
    if cfg:
        gr = grammar.parse_cfg(grammarWoTerm)
        parser = parse.BottomUpChartParser(gr)
    else:
        # One terminal rule per KR-coded token, appended to the base grammar.
        termRules = [kr2terminals.getRuleFromKr(kr) for kr in toks]
        gr = grammar.parse_fcfg('\n'.join(grammarWoTerm + termRules))
        parser = parse.FeatureBottomUpChartParser(gr)
    return parser.chart_parse(toks)
def main(grammarFile='', text='', serialize=None, uncrossing=False, countCrosses=False, max_maxNpTag=False, b_baseNpTag=False, findWrongChunks=False, useCfg=False): sys.stderr.write('loading nltk...') corp = splitCorp(text) cLength=len(corp) grammarText = [l.strip('\n') for l in file(grammarFile)] grammarText += getTerminals(corp) grammarObj = grammar.parse_fcfg(grammarText) parser = parse.FeatureBottomUpChartParser(grammarObj) sys.stderr.write('done!\nparsing...') if serialize: parseOut = open(serialize, 'w') allCrosses=0.0 allSens=0.0 for c,sen in enumerate(corp): allSens+=1 if c*100/cLength>(c-1)*100/cLength: sys.stderr.write(str(c*100/cLength)+'% ') toks = [l.strip('\n').split()[1] for l in sen] words = [l.strip('\n').split()[0] for l in sen] chart = parser.chart_parse(toks) if max_maxNpTag: tagMaxNPs(chart, sen, useCfg) print if b_baseNpTag: tagBaseNPs(chart, sen) print if findWrongChunks: getWrongChunks(chart, sen) if serialize: serializeParse(chart, toks, words, parseOut) if uncrossing: findUncrossingNPs(chart, words, useCfg) if countCrosses: allCrosses+=findUncrossingNPs(chart, words, useCfg, countCrosses) sys.stderr.write('done\n') if countCrosses: print 'No. of sentences:', str(allSens) print 'No. of crosses:', str(allCrosses) print 'Average no. of crosses per sentence:', print allCrosses/allSens
def demo_legacy_grammar(): """ Check that batch_interpret() is compatible with legacy grammars that use a lowercase 'sem' feature. Define 'test.fcfg' to be the following """ from nltk.grammar import parse_fcfg g = parse_fcfg(""" % start S S[sem=<hello>] -> 'hello' """) print "Reading grammar: %s" % g print "*" * 20 for reading in batch_interpret(['hello'], g, semkey='sem'): syn, sem = reading[0] print print "output: ", sem
def demo_legacy_grammar():
    """
    Check that batch_interpret() is compatible with legacy grammars that use
    a lowercase 'sem' feature.

    Define 'test.fcfg' to be the following
    """
    from nltk.grammar import parse_fcfg
    # Minimal grammar exercising the legacy lowercase 'sem' feature name.
    legacy_src = "\n% start S\nS[sem=<hello>] -> 'hello'\n"
    g = parse_fcfg(legacy_src)
    print("Reading grammar: %s" % g)
    print("*" * 20)
    for reading in batch_interpret(['hello'], g, semkey='sem'):
        # Each reading is a (syntax-tree, semantics) pair; show the semantics.
        tree, semantics = reading[0]
        print()
        print("output: ", semantics)
def demo_grammar():
    """Build and return the small feature grammar used by the demos."""
    from nltk.grammar import parse_fcfg
    # A toy English fragment; Det/Noun agree on the ?x plural feature.
    demo_src = """
S -> NP VP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
VP -> Verb NP
VP -> Verb
NP -> Det[pl=?x] Noun[pl=?x]
NP -> "John"
NP -> "I"
Det -> "the"
Det -> "my"
Det[-pl] -> "a"
Noun[-pl] -> "dog"
Noun[-pl] -> "cookie"
Verb -> "ate"
Verb -> "saw"
Prep -> "with"
Prep -> "under"
"""
    return parse_fcfg(demo_src)
def load(resource_url, format='auto', cache=True, verbose=False,
         logic_parser=None, fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that fails,
    C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache when
        no more objects are using it.
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical
        expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
        feature structure of an fcfg.
    """
    # Short-circuit through the (weak-reference) cache when possible.
    if cache:
        cached = _resource_cache.get(resource_url)
        if cached is not None:
            if verbose:
                print('<<Using cached copy of %s>>' % (resource_url,))
            return cached

    # Let the user know what's going on.
    if verbose:
        print('<<Loading %s>>' % (resource_url,))

    # Infer the format from the file extension ('.gz' is transparent).
    if format == 'auto':
        parts = resource_url.split('.')
        ext = parts[-1]
        if ext == 'gz':
            ext = parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.'
                             % resource_url)

    # Load the resource according to the (possibly inferred) format.
    if format == 'pickle':
        value = pickle.load(_open(resource_url))
    elif format == 'yaml':
        value = yaml.load(_open(resource_url))
    elif format == 'cfg':
        value = cfg.parse_cfg(_open(resource_url).read())
    elif format == 'pcfg':
        value = cfg.parse_pcfg(_open(resource_url).read())
    elif format == 'fcfg':
        value = cfg.parse_fcfg(_open(resource_url).read(),
                               logic_parser=logic_parser,
                               fstruct_parser=fstruct_parser)
    elif format == 'fol':
        value = sem.parse_logic(_open(resource_url).read(),
                                logic_parser=sem.logic.LogicParser())
    elif format == 'logic':
        value = sem.parse_logic(_open(resource_url).read(),
                                logic_parser=logic_parser)
    elif format == 'val':
        value = sem.parse_valuation(_open(resource_url).read())
    elif format == 'raw':
        value = _open(resource_url).read()
    else:
        assert format not in FORMATS
        raise ValueError('Unknown format type!')

    # If requested, remember the loaded value for next time.
    if cache:
        try:
            _resource_cache[resource_url] = value
        except TypeError:
            # Weak references can't be created for some object types,
            # like strings and tuples; just skip caching those.
            pass

    return value
def load_grammar(fn):
    """Load a feature CFG from the file *fn* and attach a chart parser.

    The returned grammar object gains two convenience attributes:
    ``parser`` (a FeatureChartParser built over the grammar) and
    ``parse`` (the parser's bound parse method).

    @param fn: path of the grammar file to read.
    @return: the parsed grammar object, with parser attached.
    """
    # The original passed open(fn) and never closed the handle; close it
    # explicitly once the grammar has been parsed.
    source = open(fn)
    try:
        g = parse_fcfg(source)
    finally:
        source.close()
    g.parser = FeatureChartParser(g)
    g.parse = g.parser.parse
    return g