# Examples of reading a custom corpus with NLTK's corpus readers.
#
# The statements below were previously collapsed onto a single line behind a
# leading '#', which turned the whole script into one comment. They are
# restored here one statement per line.
#
# NOTE(review): `path` is not defined in this section; presumably it is set
# earlier in the file to the NLTK data directory -- confirm.
# NOTE(review): the results of the bare `reader.*()` calls are discarded when
# this runs as a script; they look like statements pasted from an interactive
# session.

# Re-enabled (was commented out) so that `nltk.data.load` below resolves
# `nltk` without relying on an import elsewhere in the file.
import nltk.data

from nltk.corpus.reader import (
    ChunkedCorpusReader,
    TaggedCorpusReader,
    WordListCorpusReader,
)
from nltk.tokenize import LineTokenizer, SpaceTokenizer

# Directory holding the cookbook corpus files; the same expression was
# repeated verbatim for every reader in the original.
cookbook_dir = path + '/corpora/cookbook/'

# --- raw file access through the NLTK data loader ---
# The original recorded the literal 'nltk\n' right after this call,
# presumably the value echoed by the interactive session.
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')

# --- word list corpus ---
reader = WordListCorpusReader(cookbook_dir, ['GL_Sequent.txt'])
reader.words()

# --- reading a tagged corpus ---
reader = TaggedCorpusReader(cookbook_dir, r'.*\.pos')
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

# --- different Tokenizer - works? ---
reader = TaggedCorpusReader(
    cookbook_dir,
    r'.*\.pos',
    word_tokenizer=SpaceTokenizer(),
)
reader.words()

# --- different Sentence Tokenizer ---
reader = TaggedCorpusReader(
    cookbook_dir,
    r'.*\.pos',
    sent_tokenizer=LineTokenizer(),
)
reader.sents()

# --- chunked Corpus Reader ---
reader = ChunkedCorpusReader(cookbook_dir, r'.*\.chunk')