# NOTE(review): this chunk begins mid-call — the next line is the TAIL of a
# corpus-reader constructor (presumably TaggedCorpusReader(corpus_root, ...))
# whose opening arguments lie outside this view; confirm against the full file.
r'brown.pos', word_tokenizer=SpaceTokenizer())

# Inspect the tagged corpus at increasing levels of structure.
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
## Mapping tags to the universal tagset; if the source tagset is not correct,
## every TAG will come back as UNK.
print( reader.tagged_words(tagset='universal') )

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data', r'treebank.chunk', tagset='en-brown')
print(reader.chunked_words())  ## Word level structure
print(reader.chunked_sents())  ## Sentence level structure
print(reader.chunked_paras())  ## Paragraph level structure

## Reading classified corpora ##################
## Category labels are extracted via cat_pattern (a regex over the file name),
## or alternatively via cat_dict or cat_file. ######
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    '/Users/atul/nltk_data',      # corpus root directory
    r'movie_.*\.txt',             # fileid pattern
    cat_pattern=r'movie_(\w+)\.txt'  # capture group becomes the category label
)
## Easiest is to read files per category.
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])
reader.fileids()