# Corpus-reader experiments against the local "cookbook" corpora.
#
# NOTE: `path` must already be defined before this point; it should be a
# path in the Git_Workspace on the D: drive.
#
# The bare calls such as `reader.words()` are REPL-style probes: their
# results are not stored, so they only display something when the
# statements are run in an interactive session.

# Imported explicitly so that `nltk.data.load` below does not depend on
# `nltk` having been imported earlier in the file (re-importing is harmless).
import nltk.data
from nltk.corpus.reader import TaggedCorpusReader, WordListCorpusReader
from nltk.tokenize import LineTokenizer, SpaceTokenizer

# Directory shared by every reader below.
cookbook_root = path + '/corpora/cookbook/'
# File-id pattern shared by the tagged-corpus readers: any name ending in ".pos".
pos_fileids = r'.*\.pos'

# --- Load a sample word list ------------------------------------------------
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')
# A bare 'nltk\n' string literal used to follow this call -- presumably
# pasted REPL output. It had no effect, so it is kept only as this note.

reader = WordListCorpusReader(cookbook_root, ['GL_Sequent.txt'])
reader.words()

# --- Read a tagged corpus ---------------------------------------------------
reader = TaggedCorpusReader(cookbook_root, pos_fileids)
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

# --- Different word tokenizer -----------------------------------------------
# Original note: "works?" -- not yet confirmed to behave as intended.
reader = TaggedCorpusReader(
    cookbook_root,
    pos_fileids,
    word_tokenizer=SpaceTokenizer(),
)
reader.words()

# --- Different sentence tokenizer -------------------------------------------
reader = TaggedCorpusReader(
    cookbook_root,
    pos_fileids,
    sent_tokenizer=LineTokenizer(),
)
reader.sents()