def _chunk_parse(self, grammar=None, top_node='record', trace=0, **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    @type grammar: C{string}
    @param grammar: Contains the chunking rules used to parse the
        database.  See L{chunk.RegExp} for documentation.
    @type top_node: C{string}
    @param top_node: The node value that should be used for the
        top node of the chunk structure.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output;
        C{1} will generate normal tracing output; and C{2} or
        higher will generate verbose tracing output.
    @type kwargs: C{dictionary}
    @param kwargs: Keyword arguments passed to
        L{toolbox.StandardFormat.fields()}
    @rtype: C{ElementTree._ElementInterface}
    @return: Contents of toolbox data parsed according to the rules
        in grammar
    """
    # Deferred import keeps the toolbox module importable without nltk.chunk.
    # (The previous unused "Tree" import has been removed.)
    from nltk import chunk

    cp = chunk.RegexpParser(grammar, top_node=top_node, trace=trace)
    db = self.parse(**kwargs)
    tb_etree = Element('toolbox_data')
    header = db.find('header')
    tb_etree.append(header)
    for record in db.findall('record'):
        # Each field element becomes a (text, tag) token for the chunker.
        parsed = cp.parse([(elem.text, elem.tag) for elem in record])
        tb_etree.append(self._tree2etree(parsed))
    return tb_etree
def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    :type grammar: str
    :param grammar: Contains the chunking rules used to parse the
        database.  See ``chunk.RegExp`` for documentation.
    :type root_label: str
    :param root_label: The node value that should be used for the
        top node of the chunk structure.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.
    :type kwargs: dict
    :param kwargs: Keyword arguments passed to
        ``toolbox.StandardFormat.fields()``
    :rtype: ElementTree._ElementInterface
    :return: Contents of toolbox data parsed according to the rules
        in grammar
    """
    # Deferred import keeps the toolbox module importable without nltk.chunk.
    # (The previous unused "Tree" import has been removed.)
    from nltk import chunk

    cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
    db = self.parse(**kwargs)
    tb_etree = Element("toolbox_data")
    header = db.find("header")
    tb_etree.append(header)
    for record in db.findall("record"):
        # Each field element becomes a (text, tag) token for the chunker.
        parsed = cp.parse([(elem.text, elem.tag) for elem in record])
        tb_etree.append(self._tree2etree(parsed))
    return tb_etree
def chunk_parse(self, grammar, no_blanks=True, incomplete='record', **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    @type grammar: C{string}
    @param grammar: Contains the chunking rules used to parse the
        database.  See L{chunk.RegExp} for documentation.
    @type no_blanks: C{boolean}
    @param no_blanks: blank fields that are not important to the
        structure are deleted
    @type incomplete: C{string}
    @param incomplete: name of element used if parse doesn't result in
        one toplevel element
    @type kwargs: C{dictionary}
    @param kwargs: Keyword arguments passed to
        L{toolbox.StandardFormat.fields()}
    @rtype: C{ElementTree._ElementInterface}
    @return: Contents of toolbox data parsed according to the rules
        in grammar
    """
    from nltk import chunk
    # Fixed: Tree lives in nltk.tree, not nltk.parse (matches the
    # modernized _chunk_parse in this file); it is needed below for the
    # isinstance() check, so the old import path broke this method.
    from nltk.tree import Tree

    cp = chunk.RegexpParser(grammar)
    db = self.parse(**kwargs)
    tb_etree = Element('toolbox_data')
    header = db.find('header')
    tb_etree.append(header)
    for record in db.findall('record'):
        parsed = cp.parse([(elem.text, elem.tag) for elem in record])
        top = parsed[0]
        if not isinstance(top, Tree) or len(parsed) != 1:
            # didn't get a full parse: wrap everything under a node named
            # `incomplete`.
            # NOTE(review): Tree.node was renamed in NLTK 3 — under NLTK 3
            # this should be parsed.set_label(incomplete); confirm the
            # targeted NLTK version.
            parsed.node = incomplete
            top = parsed
        tb_etree.append(self._tree2etree(top, no_blanks))
    return tb_etree
def demo():
    """
    A demonstration for the C{RegexpChunkParser} class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """
    # Python 2 `print` statements converted to print() calls so this demo
    # runs under Python 3 (the rest of the file targets Python 3 / NLTK 3).
    # The unused `Tree` import was dropped.
    from nltk import chunk

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    print('*' * 75)
    print('Evaluation text:')
    print(text)
    print('*' * 75)
    print()

    # Parser 1: chunk-rule strategy.
    grammar = r"""
    NP:                   # NP stage
        {<DT>?<JJ>*<NN>}  # chunk determiners, adjectives and nouns
        {<NNP>+}          # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 2: chunk-everything-then-chink strategy.
    grammar = r"""
    NP:
        {<.*>}            # start by chunking each tag
        }<[\.VI].*>+{     # unchunk any verbs, prepositions or periods
        <DT|JJ>{}<NN.*>   # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 3: multiple chunk types (NP and VP).
    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}     # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Parser 4: cascaded chunking (NP, then PP, then VP).
    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # chink any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Evaluation
    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")
    cp = chunk.RegexpParser("")
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
                                                     chunk_types=('NP',))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")
    grammar = r"""
    NP:
        {<.*>}            # start by chunking each tag
        }<[\.VI].*>+{     # unchunk any verbs, prepositions or periods
        <DT|JJ>{}<NN.*>   # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")
    grammar = r"""
    NP: {<.*>*}           # start by chunking everything
        }<[\.VI].*>+{     # chink any verbs, prepositions or periods
        <.*>}{<DT>        # separate on determiners
    PP: {<IN><NP>}        # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}  # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print(cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
                    ("mat", "NN"), (".", ".")]))
# Quick CoNLL-2000 demonstration: evaluate a trivial NP grammar, then show a
# gold tree next to the parser's output for the same (flattened) sentence.
# Python 2 `print` statements converted to print() calls for Python 3.
from nltk import chunk
from nltk.corpus import conll2000

cp = chunk.RegexpParser("NP: {<DT><NN>}")
print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt',
                                                 chunk_types=('NP',))))
gold_tree = conll2000.chunked_sents('train.txt', chunk_types=('NP',))[1]
print(gold_tree)
# flatten() strips the gold chunk structure so the parser starts from the
# bare tagged tokens.
print(cp.parse(gold_tree.flatten()))