# --- Plain-text corpus: inspect the file listing produced earlier ---
# NOTE(review): `files` and `reader` are assumed to be defined earlier in this
# file (a PlaintextCorpusReader and its fileids) — confirm against the full script.
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

# --- Reading tagged corpora ---
# A TaggedCorpusReader over the Brown POS sample; `tagset` names the source
# tagset so tags can later be mapped (see the 'universal' call below).
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
# Same corpus, but tokenizing words on single spaces instead of the default.
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
# Map tags to the universal tagset. If the `tagset` given above is wrong,
# every tag comes back as UNK.
print(reader.tagged_words(tagset='universal'))

# --- Reading chunked corpora ---
reader = ChunkedCorpusReader('/Users/atul/nltk_data', r'treebank.chunk', tagset='en-brown')
print(reader.chunked_words())   # word-level structure
print(reader.chunked_sents())   # sentence-level structure
print(reader.chunked_paras())   # paragraph-level structure

# --- Reading classified corpora ---
# Category labels can be extracted via cat_pattern (from the file name),
# cat_dict, or cat_file.
# --- TreeTagger demo ---
# NOTE(review): the treetaggerwrapper package is normally imported as
# `import treetaggerwrapper`; verify that a local `TreeTaggerWrapper` module
# re-exporting it actually exists, otherwise this import fails.
from TreeTaggerWrapper import treetaggerwrapper

# 1) Build a TreeTagger wrapper (TAGDIR points at a local TreeTagger install).
tagger = treetaggerwrapper.TreeTagger(TAGLANG='en', TAGDIR='D:/Programme/TreeTagger')
# 2) Tag some text.
tags = tagger.TagText("This is a very short text to tag.")
# 3) Use the tags list (list of strings output by TreeTagger).
# Fixed: was a Python-2 `print tags` statement, a syntax error under Python 3
# and inconsistent with the print() calls used elsewhere in this file.
print(tags)

# Check whether the format of the tagged postings suits the tagged corpus
# reader (p.51, NLTK Cookbook).
# NOTE(review): `path` is assumed to be defined earlier in this file — confirm.
input_directory = path
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader(input_directory, r'.*\.txt')
reader.words()         # e.g. ['The', 'expense', 'and', 'time', 'involved', 'are', ...]
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

# --- Testing the import of the local myMath module ---
# import sys
import myMath

# Fixed: all five were Python-2 print statements; converted to print() calls.
print(myMath.add(4, 5))
print(myMath.division(4, 2))
print(myMath.multiply(10, 5))
print(myMath.fibonacci(8))
print(myMath.squareroot(48))