def readCorpusList(self):
    """Print the leaves of every chunk in the cookbook chunked corpus.

    ``chunked_words()`` yields a mix of ``Tree`` chunks and plain
    ``(word, tag)`` tuples; only trees have ``leaves()``, so bare
    tokens are printed directly.
    """
    reader = ChunkedCorpusReader('C:/nltk_data/corpora/cookbook', r'.*\.chunk')
    for chunk in reader.chunked_words():
        try:
            print(chunk.leaves())
        except AttributeError:
            # Not a Tree: a plain (word, tag) token outside any chunk.
            # (Original used a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and hid real bugs.)
            print(chunk)
def chunked_paras(self, fileids=None, categories=None):
    """Return chunked paragraphs, optionally filtered by category."""
    selected = self._resolve(fileids, categories)
    return ChunkedCorpusReader.chunked_paras(self, selected)
def tagged_sents(self, fileids=None, categories=None):
    """Return tagged sentences, optionally filtered by category."""
    selected = self._resolve(fileids, categories)
    return ChunkedCorpusReader.tagged_sents(self, selected)
def words(self, fileids=None, categories=None):
    """Return the corpus words, optionally filtered by category."""
    selected = self._resolve(fileids, categories)
    return ChunkedCorpusReader.words(self, selected)
def __init__(self, *args, **kwargs):
    # Initialize the categorized mixin FIRST: it consumes its
    # category-related keyword arguments (cat_pattern / cat_map /
    # cat_file) from ``kwargs`` before the remaining arguments are
    # forwarded to ChunkedCorpusReader. Do not reorder these calls.
    CategorizedCorpusReader.__init__(self, kwargs)
    ChunkedCorpusReader.__init__(self, *args, **kwargs)
def sents(self, fileids=None, categories=None):
    """Return sentences for the given file ids and/or categories.

    Bug fix: the original called the undefined name ``self_resolve``
    (a missing dot); the category filter is resolved with
    ``self._resolve``, as the sibling reader methods do.
    """
    return ChunkedCorpusReader.sents(self, self._resolve(fileids, categories))
# NOTE(review): this chunk opens mid-statement -- "tagset='en-brown')"
# closes a reader-constructor call whose opening lines are outside this
# view; the fragment is kept as-is rather than guessed at.
tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos',
                             word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print( reader.tagged_words(tagset='universal') )  ## Mapping tags to universal format; if tagset is not correct every TAG will be UNK

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data', r'treebank.chunk',
                             tagset='en-brown')
print(reader.chunked_words())   ## Word level structure
print(reader.chunked_sents())   ## Sentence level structure
print(reader.chunked_paras())   ## Paragraph level structure

## Reading categorized corpora ##################
## Classification is extracted using cat_pattern (from the file name), or cat_dict, or cat_file ######
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data', r'movie_.*\.txt',
    cat_pattern=r'movie_(\w+)\.txt' )
## Easiest is to read files per category
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
def tagged_sents(self, fileids=None, categories=None):
    """Return tagged sentences for the given file ids and/or categories.

    Bug fix: the original called the undefined name ``self_resolve``
    (a missing dot); the category filter is resolved with
    ``self._resolve``.
    """
    return ChunkedCorpusReader.tagged_sents(self, self._resolve(fileids, categories))
def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
    """Return tagged paragraphs, optionally filtered by category."""
    selected = self._resolve(fileids, categories)
    return ChunkedCorpusReader.tagged_paras(self, selected, simplify_tags)
def sents(self, fileids=None, categories=None):
    """Return sentences, optionally filtered by category."""
    selected = self._resolve(fileids, categories)
    return ChunkedCorpusReader.sents(self, selected)
########## CHUNKED CORPUS READER ###############
### Implementing ChunkedCorpusReader
from nltk.corpus.reader import ChunkedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
reader = ChunkedCorpusReader(root, r'.*\.chunk')

# Each chunk -- represented in braces -- is considered as a word
print(reader.chunked_words())
# Each sentence will be included in a Tree()
print(reader.chunked_sents())
print(reader.chunked_paras())

# Getting tagged tokens for each chunk
# (each chunk is a word, but each word is not a chunk)
print(reader.chunked_words()[0].leaves())
print(reader.chunked_sents()[1].leaves())
# Can't apply leaves() directly to a paragraph, but we can access a
# sentence of a given paragraph.
# Bug fix: the method is ``chunked_paras`` -- the original called the
# undefined ``chunked_para``.
print(reader.chunked_paras()[0][0].leaves())

### Implementing ConllChunkCorpusReader
from nltk.corpus.reader import ConllChunkCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
# Bug fix: the chunk-type tuple was written ('NP','VP'.'PP') -- a '.'
# instead of ',' -- which is a SyntaxError.
reader = ConllChunkCorpusReader(root, r'.*\.iob', ('NP', 'VP', 'PP'))
print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.iob_words())
print(reader.iob_sents())
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

# Locate the cookbook corpus inside the local nltk_data directory and
# build a reader over every .chunk file in it.
corpus_root = nltk.data.find('corpora/cookbook')
reader = ChunkedCorpusReader(corpus_root, r'.*\.chunk')

print(reader.chunked_words())
print(reader.chunked_sents())
print(reader.chunked_paras())
# reader.chunked_sents()[0].draw()
print(reader.chunked_sents()[0].leaves())
def createChunker():
    """Build a TagChunker trained on the chunked corpus under data/chunks/."""
    corpus = ChunkedCorpusReader('data/chunks/', 'text_search.pos')
    training_sents = corpus.chunked_sents()
    return TagChunker(training_sents, [UnigramTagger, BigramTagger])
def __init__(self, directory="", fileids=r"haaretz.bgu", myEncoding="utf-8"):
    # Reader for the BGU (Haaretz) chunked corpus: sentences are
    # separated by blank lines ("\n\n", gaps=True), and each raw
    # sentence string is parsed into a chunk tree by the private
    # __str2BguTree helper.
    ChunkedCorpusReader.__init__(self, directory, fileids,
                                 str2chunktree=self.__str2BguTree,
                                 sent_tokenizer=RegexpTokenizer('\n\n', gaps=True),
                                 encoding=myEncoding)
    # NOTE(review): ``format`` is not defined in this scope, so this
    # assigns the *builtin* ``format`` function. A ``format`` parameter
    # is probably missing from the signature -- confirm against callers
    # before relying on self._format.
    self._format = format