from reader import PickledCorpusReader

# Build the corpus reader over the relative path '../corpus'.
reader = PickledCorpusReader('../corpus')

# Print one summary line per category: its document count and word count.
for category in reader.categories():
    n_docs = len(reader.fileids(categories=[category]))
    # Tally the words by iterating; no intermediate list is built here.
    n_words = sum(1 for _ in reader.words(categories=[category]))
    print(
        "- '{}' contains {:,} docs and {:,} words".format(
            category, n_docs, n_words
        )
    )
from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

for category in reader.categories():
    # Number of file ids the reader lists under this single category.
    n_docs = len(reader.fileids(categories=[category]))

    # Number of items yielded by reader.words() for the same category;
    # each yielded item contributes one to the total.
    n_words = sum(1 for token in reader.words(categories=[category]))

    # The ',' format spec inserts thousands separators into both counts.
    print("- '{}' contains {:,} docs and {:,} words".format(
        category,
        n_docs,
        n_words,
    ))
from sklearn.model_selection import train_test_split as tts

from reader import PickledCorpusReader

reader = PickledCorpusReader('../corpus')

# Only documents filed under one of these categories are selected.
labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]

# NOTE: despite its name, `docs` holds the file ids of the selected
# documents, not the documents themselves.
docs = reader.fileids(categories=labels)

# X: the documents, in the order the reader yields them for these file ids.
X = [*reader.docs(fileids=docs)]

# y: one label per file id -- the first category the reader reports for it.
y = [reader.categories(fileids=[name])[0] for name in docs]