Пример #1
0
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():

    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(category, n_docs, n_words))
Пример #2
0
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():

    n_docs = len(reader.fileids(categories=[category]))
    n_words = sum(1 for word in reader.words(categories=[category]))

    print("- '{}' contains {:,} docs and {:,} words".format(
        category, n_docs, n_words))
Пример #3
0
from sklearn.model_selection import train_test_split as tts
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
docs = reader.fileids(categories=labels)
X = list(reader.docs(fileids=docs))
y = [reader.categories(fileids=[fileid])[0] for fileid in docs]