import nltk
from nltk.corpus.reader import WordListCorpusReader

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
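# The snippet above assumes a custom 'cookbook' corpus already exists under
# nltk_data. A minimal setup sketch, if you need to create it; the directory
# location and the sample words are illustrative assumptions, not part of
# any NLTK distribution:
import os

corpus_dir = os.path.expanduser('~/nltk_data/corpora/cookbook')
os.makedirs(corpus_dir, exist_ok=True)
with open(os.path.join(corpus_dir, 'wordlist.txt'), 'w') as f:
    f.write('nltk\ncorpus\ncorpora\nwordnet\n')  # one word per line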
from os import listdir
from os.path import isfile, join

from nltk.corpus.reader import WordListCorpusReader


# Method of an email-reader class; Cleaner, Email, and self.divide are
# defined elsewhere in the project.
def read_emails(self, path):
    # Get all files, skipping the macOS '.DS_Store' metadata file if present
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    cleaner = Cleaner()
    emails = []
    # Create an Email object from each email file and append it to the list
    for file_id in reader.fileids():
        with open(join(path, file_id), 'r') as current_file:
            cleaned_contents = cleaner.clean_file(current_file.read())
        split_email_header, split_email_body, split_email_file_id = self.divide(
            cleaned_contents, file_id)
        emails.append(
            Email(split_email_header, split_email_body, split_email_file_id))
    # Return the list of Email objects
    return emails
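# The method above relies on project helpers that are not shown. A rough,
# hypothetical sketch of what they might look like; the names and the
# header/body convention are assumptions, not the original code:
class Email:
    """Container for one parsed email."""
    def __init__(self, header, body, file_id):
        self.header = header
        self.body = body
        self.file_id = file_id


# (a method on the same class as read_emails)
def divide(self, contents, file_id):
    # Assume the header and body are separated by the first blank line,
    # as in RFC 822-style messages.
    header, _, body = contents.partition('\n\n')
    return header, body, file_id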
def read_emails(path):
    # Collect all files, skipping the macOS '.DS_Store' metadata file
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    # Clean the concatenated raw text, then split it back into per-file emails
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())
    return emails
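# Hypothetical stand-ins for the undefined clean() and split_emails()
# helpers above; the real cleaning and splitting rules are project-specific:
import re

def clean(text):
    # Collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', text).strip()

def split_emails(text, fileids):
    # Crude assumption: messages are separated by mbox-style 'From ' markers
    parts = [p for p in text.split('From ') if p]
    return list(zip(fileids, parts))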
import nltk.data
from nltk.corpus import names, treebank
from nltk.corpus.reader import TaggedCorpusReader, WordListCorpusReader
from nltk.tokenize import SpaceTokenizer

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
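# TaggedCorpusReader works just as well on a file of our own. A minimal
# sketch; the temporary file and its one tagged sentence are made up for
# illustration (the default separator is '/', one sentence per line):
import os
import tempfile
from nltk.corpus.reader import TaggedCorpusReader

tmpdir = tempfile.mkdtemp()
with open(os.path.join(tmpdir, 'demo.pos'), 'w') as f:
    f.write('The/AT cat/NN sat/VBD on/IN the/AT mat/NN ./.\n')

demo = TaggedCorpusReader(tmpdir, r'.*\.pos')
print(demo.tagged_words('demo.pos'))  # [('The', 'AT'), ('cat', 'NN'), ...]
print(demo.sents('demo.pos'))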
''' draw tree '''
reader.chunked_sents()[0].draw()

''' get leaves '''
reader.chunked_words()[0].leaves()
reader.chunked_sents()[0].leaves()
reader.chunked_paras()[0][0].leaves()

''' categorized corpus '''
from nltk.corpus import brown
brown.categories()

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# 'path' here is the nltk_data root directory (e.g. nltk.data.path[0])
reader = CategorizedPlaintextCorpusReader(path + '/corpora/cookbook/',
                                          r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])

''' using a categorized chunked corpus reader '''
import nltk.data
from catchunked import CategorizedChunkedCorpusReader
path = nltk.data.find('corpora/treebank/tagged')
reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos',
                                        cat_pattern=r'wsj_(.*)\.pos')
# len(reader.categories()) == len(reader.fileids())
len(reader.chunked_sents(categories=['0001']))

''' Lazy corpus loader '''
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import WordListCorpusReader
reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist'])
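# 'catchunked' is not part of NLTK itself; it is a helper module from the
# NLTK cookbook. A sketch of how such a reader can be built by combining
# the two stock reader classes (method names follow the NLTK reader API;
# only chunked_sents is shown, the other chunked_* methods follow the same
# pattern):
from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader

class CategorizedChunkedCorpusReader(CategorizedCorpusReader,
                                     ChunkedCorpusReader):
    def __init__(self, *args, **kwargs):
        # CategorizedCorpusReader pops cat_pattern/cat_map/cat_file
        # out of kwargs before ChunkedCorpusReader sees them
        CategorizedCorpusReader.__init__(self, kwargs)
        ChunkedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        return fileids

    def chunked_sents(self, fileids=None, categories=None):
        return ChunkedCorpusReader.chunked_sents(
            self, self._resolve(fileids, categories))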
########## WORDLIST CORPUS READER ###############
# Basic corpus reader
from nltk.corpus.reader import WordListCorpusReader
# List of a few thousand names organized by gender
from nltk.corpus import names
# List of English words
from nltk.corpus import words

nltkDir = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
# nltkFile = "mywords.txt"
# source = nltkDir + nltkFile

### One-file WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())

### Multi-file WordListCorpusReader
# To get the names of the files in the corpus, use fileids()
names.fileids()
print(len(names.words('male.txt')))
print(len(names.words('female.txt')))

words.fileids()
print(len(words.words('en-basic')))
print(len(words.words('en')))

### Chunked Corpus Reader
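# A short example for the chunked-corpus section above: NLTK ships a
# ready-made chunked reader over the treebank sample (requires the
# 'treebank' data package, installable via nltk.download('treebank')):
from nltk.corpus import treebank_chunk

print(treebank_chunk.chunked_sents()[0])  # first sentence as a chunk Tree
print(treebank_chunk.chunked_words()[:5])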