def chunked_sents(self, fileids=None, categories=None):
    return ChunkedCorpusReader.chunked_sents(
        self, self._resolve(fileids, categories))
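# The method above only makes sense inside a class that mixes NLTK's
# CategorizedCorpusReader (which supplies _resolve()) into ChunkedCorpusReader.
# A minimal sketch of such an enclosing class, following the pattern NLTK uses
# for its own categorized readers; the class name here is illustrative:
from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader

class CategorizedChunkedCorpusReader(CategorizedCorpusReader, ChunkedCorpusReader):
    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader consumes its own kwargs (e.g. cat_pattern)
        CategorizedCorpusReader.__init__(self, kwargs)
        ChunkedCorpusReader.__init__(self, *args, **kwargs)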
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = ChunkedCorpusReader(d, r'.*\.chunk')
print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.chunked_paras())
# reader.chunked_sents()[0].draw()
print(reader.chunked_sents()[0].leaves())
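# ChunkedCorpusReader's default parser (tagstr2tree) expects tagged words with
# chunks marked by square brackets, one sentence per line and paragraphs
# separated by blank lines. A hypothetical corpora/cookbook/*.chunk file might
# contain a line like:
#
#   [Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ,/, [the/DT spokesman/NN] said/VBD ./.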
########## CHUNKED CORPUS READER ###############

### Implementing ChunkedCorpusReader
from nltk.corpus.reader import ChunkedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
reader = ChunkedCorpusReader(root, r'.*\.chunk')
# Each chunk (marked with square brackets) is treated as a single word
print(reader.chunked_words())
# Each sentence is wrapped in a Tree()
print(reader.chunked_sents())
print(reader.chunked_paras())
# Getting tagged tokens for each chunk (each chunk is a word, but not every word is a chunk)
print(reader.chunked_words()[0].leaves())
print(reader.chunked_sents()[1].leaves())
# leaves() can't be applied directly to a paragraph, but we can access a sentence of a given paragraph
print(reader.chunked_paras()[0][0].leaves())

### Implementing ConllChunkCorpusReader
from nltk.corpus.reader import ConllChunkCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
reader = ConllChunkCorpusReader(root, r'.*\.iob', ('NP', 'VP', 'PP'))
print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.iob_words())
print(reader.iob_sents())
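# ConllChunkCorpusReader reads CoNLL-style IOB files: one token per line with
# word, part-of-speech tag, and IOB chunk tag, and a blank line between
# sentences. A hypothetical .iob file in the corpus might contain:
#
#   Mr. NNP B-NP
#   Meador NNP I-NP
#   had VBD B-VP
#   been VBN I-VP
#   executive JJ B-NP
#   vice NN I-NP
#   president NN I-NP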
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.tag import UnigramTagger, BigramTagger
# TagChunker is not part of NLTK itself; it is assumed to come from this
# project's own chunkers module (the tag-based chunker from the NLTK Cookbook).
from chunkers import TagChunker

def createChunker():
    # Train a tag-based chunker on locally stored chunked sentences
    chunks = ChunkedCorpusReader('data/chunks/', 'text_search.pos')
    tagger_classes = [UnigramTagger, BigramTagger]
    train_chunks = chunks.chunked_sents()
    chunker = TagChunker(train_chunks, tagger_classes)
    return chunker
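# Usage sketch, assuming TagChunker implements nltk.chunk.ChunkParserI, so
# parse() accepts a POS-tagged sentence and returns a chunk Tree:
chunker = createChunker()
tagged = [('the', 'DT'), ('quick', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ')]
print(chunker.parse(tagged))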