def test_category_to_corpus_separated(self):
    """
    Exercise CategoryToCorpus in "separated" mode.

    After every fixture document has been pushed through the pipeline,
    post_process() is expected to return a mapping with two categories:
    "stopwords" holding two entries and "parsing" holding one, i.e.
    documents sharing a category are kept apart rather than merged.
    """
    collector = CategoryToCorpus(None, None, "categories", None, "separated")

    # The pipeline is lazy, so it has to be exhausted before the module
    # has seen every document.
    # TODO: add a sink module or something similar to thread all docs/words
    # through a pipeline
    for _ in GeneratorPipeline([collector]).process(self.docs):
        pass

    # Fetch what the module accumulated while the documents streamed by.
    per_category = collector.post_process()

    self.assertEqual(len(per_category), 2)
    self.assertEqual(len(per_category["stopwords"]), 2)
    self.assertEqual(len(per_category["parsing"]), 1)
def test_category_to_corpus_combined(self):
    """
    Exercise CategoryToCorpus built with no arguments (combined mode).

    In this mode the documents of one category are concatenated into a
    single document, so the word stream of each category is checked
    token by token against the expected sequence.
    """
    collector = CategoryToCorpus()

    # The pipeline is lazy, so it has to be exhausted before the module
    # has seen every document.
    # TODO: add a sink module or something similar to thread all docs/words
    # through a pipeline
    for _ in GeneratorPipeline([collector]).process(self.docs):
        pass

    # Fetch what the module accumulated while the documents streamed by.
    merged = collector.post_process()
    self.assertEqual(len(merged.categories()), 2)

    # Both categories are looked up before any words are read.
    stopwords_docs = merged["stopwords"]
    parsing_docs = merged["parsing"]

    expectations = (
        (
            stopwords_docs,
            [
                "a", "stopword", "test", ".",
                "a", "stopword", "test", "and",
                "a", "unique", "test", ".",
            ],
        ),
        (
            parsing_docs,
            ["a", "parsing", "test", "."],
        ),
    )

    for category_docs, expected_words in expectations:
        actual_words = list(category_docs.words())
        # Length first, then each position in order.
        self.assertEqual(len(actual_words), len(expected_words))
        for position, expected in enumerate(expected_words):
            self.assertEqual(actual_words[position], expected)