示例#1
1
from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

# Demo: load the "cookbook" word-list corpus from the local nltk_data tree
# and show its contents and file ids.
corpus_root = nltk.data.find('corpora/cookbook')
wordlist_reader = WordListCorpusReader(corpus_root, ['wordlist.txt'])
print(wordlist_reader.words())
print(wordlist_reader.fileids())
示例#2
0
    def read_emails(self, path):
        """Read every email file under *path* and return a list of Email objects.

        Each file's contents are cleaned with ``Cleaner`` and then split into
        header, body and file id via :meth:`divide`.
        """
        # Collect regular files only (subdirectories are skipped).
        files = [f for f in listdir(path) if isfile(join(path, f))]

        # Drop the macOS Finder metadata file if present.
        # BUG FIX: the original looked for 'DS_Store' (no leading dot), which
        # never matches the actual '.DS_Store' file name; it also used a bare
        # `except:` that would hide any unrelated error.
        try:
            files.remove('.DS_Store')
        except ValueError:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = []

        # Creates the Email object out of each email file and appends to list.
        for file_id in reader.fileids():
            # join() is safer than 'path + file_id', which silently produces a
            # wrong path when *path* lacks a trailing separator.
            with open(join(path, file_id), 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
示例#3
0
def read_emails(path):
    """Read all email files under *path*, clean the combined raw text, and
    return the per-file emails produced by ``split_emails``."""
    # Collect regular files only (subdirectories are skipped).
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # Remove the macOS Finder metadata file if it exists.
    # BUG FIX: narrowed the bare `except:` (which would swallow every error,
    # including KeyboardInterrupt) to the one exception .remove/.index raise
    # when the entry is absent.
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass

    reader = WordListCorpusReader(path, files)

    # Clean the concatenated raw corpus text, then split it back per file id.
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())

    return emails
示例#4
0
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# Demo of NLTK corpus readers: word lists, tagged, chunked, categorized,
# and lazily loaded corpora.

# --- Word-list corpus ------------------------------------------------------
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

# --- Names corpus (bundled with NLTK) --------------------------------------
print(names.fileids())
print(len(names.words('male.txt')))

# --- Tagged corpus reader: Treebank sample, space-tokenized, Brown tagset --
reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
# Map the corpus tags onto the universal tagset for this one file.
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())

''' draw tree '''
# NOTE(review): TaggedCorpusReader does not expose chunked_* methods; these
# calls look like they expect a ChunkedCorpusReader over .chunk files —
# confirm which reader was intended before relying on this section.
reader.chunked_sents()[0].draw()

''' get leaves '''
reader.chunked_words()[0].leaves()
reader.chunked_sents()[0].leaves()
reader.chunked_paras()[0][0].leaves()

''' categorized corpus '''
from nltk.corpus import brown
brown.categories()

from nltk.corpus.reader import CategorizedPlaintextCorpusReader
# BUG FIX: 'path' was used here before being defined anywhere (NameError);
# point it at the same nltk_data root the hard-coded paths above use.
path = "C:/nltk_data"
reader = CategorizedPlaintextCorpusReader(path + '/corpora/cookbook/',
                                          r'movie_.*\.txt',
                                          cat_pattern=r'movie_(\w+)\.txt')
reader.categories()
reader.fileids(categories=['neg'])
reader.fileids(categories=['pos'])


''' using a categorized chunked corpus reader '''
from catchunked import CategorizedChunkedCorpusReader
path = nltk.data.find('corpora/treebank/tagged')
reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos',
                                        cat_pattern=r'wsj_(.*)\.pos')
len(reader.chunked_sents(categories=['0001']))

''' Lazy corpus loader '''
# The corpus is not read until a reader method is first called.
from nltk.corpus.util import LazyCorpusLoader
reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist'])
示例#6
0
########## WORDLIST CORPUS READER ###############

#Basic Corpus Reader
from nltk.corpus.reader import WordListCorpusReader
#List of a few thousand names organized by gender
from nltk.corpus import names
#List of english words
from nltk.corpus import words

# Location of the local "cookbook" corpus inside the user's nltk_data tree.
nltkDir = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"

### One-file WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
# Single-argument print(...) is valid in both Python 2 and Python 3,
# unlike the original bare print statements.
print(reader.words())
print(reader.fileids())

### Multi-file WordListCorpusReader
# To get the names of the files in the corpus use the "fileids" command.
names.fileids()
# BUG FIX: the original printed the female.txt count twice; the names corpus
# is organized by gender, so report each file once.
print(len(names.words('male.txt')))
print(len(names.words('female.txt')))

words.fileids()
print(len(words.words('en-basic')))
print(len(words.words('en')))

### Chunked Corpus Reader