def test_keep_only_these_categories(self):
    """Check that ``keep_only_these_categories`` narrows a corpus to the named categories.

    NOTE(review): this test name is defined twice back to back in this file;
    if both definitions live in the same class, only the later one is
    collected -- confirm and drop the duplicate.
    """
    # The helper's output is transposed before being labelled as the
    # 'category' and 'text' columns (presumably one row per document).
    # `np` is used directly: the `pd.np` alias was deprecated in pandas 1.0
    # and removed in pandas 2.0.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()

    hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
    self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
    # The filtered corpus must expose strictly fewer terms than the full one.
    self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))

    # Asking for a category the corpus does not contain must fail ...
    with self.assertRaises(AssertionError):
        corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
    # ... while the same request with the second positional argument set to
    # True must not raise.  This call sits outside the ``assertRaises`` block
    # on purpose: inside it, it could never be reached.
    # NOTE(review): the flag presumably tolerates absent categories -- confirm
    # against the method's signature.
    corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
def test_keep_only_these_categories(self):
    """Check that ``keep_only_these_categories`` narrows a corpus to the named categories.

    NOTE(review): this test name is defined twice back to back in this file;
    if both definitions live in the same class, only the later one is
    collected -- confirm and drop the duplicate.
    """
    # The helper's output is transposed before being labelled as the
    # 'category' and 'text' columns (presumably one row per document).
    # `np` is used directly: the `pd.np` alias was deprecated in pandas 1.0
    # and removed in pandas 2.0.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()

    hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
    self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
    # The filtered corpus must expose strictly fewer terms than the full one.
    self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))

    # Asking for a category the corpus does not contain must fail ...
    with self.assertRaises(AssertionError):
        corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
    # ... while the same request with the second positional argument set to
    # True must not raise.  This call sits outside the ``assertRaises`` block
    # on purpose: inside it, it could never be reached.
    # NOTE(review): the flag presumably tolerates absent categories -- confirm
    # against the method's signature.
    corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
def test_build(self):
    """Build a term-document matrix from scikit-learn count features and verify it.

    Checks the first two category names and the five terms ranked highest
    by the scaled F-score computed for the 'hamlet' category.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    categories, docs = get_docs_categories_semiotic()

    # Look up an IndexStore index for every entry in ``categories``.
    category_index = IndexStore()
    y = np.array([category_index.getidx(name) for name in categories])

    # Fit the vectorizer on the raw documents to obtain the count matrix.
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(docs)

    builder = TermDocMatrixFromScikit(
        X=counts,
        y=y,
        feature_vocabulary=vectorizer.vocabulary_,
        category_names=category_index.values(),
    )
    tdm = builder.build()

    self.assertEqual(tdm.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])

    # Attach the 'hamlet' scores to the term-frequency table, rank terms by
    # descending score, and keep the first five index labels.
    term_freq_df = tdm.get_term_freq_df()
    hamlet_scores = tdm.get_scaled_f_scores('hamlet')
    ranked = term_freq_df.assign(score=hamlet_scores).sort_values(by='score', ascending=False)
    top_five_terms = ranked.index.tolist()[:5]
    self.assertEqual(top_five_terms, ['that', 'march', 'did', 'majesty', 'sometimes'])
def test_remove_categories(self):
    """Check that ``remove_categories`` agrees with a corpus built without that category.

    The corpus with 'swift' removed is compared against one constructed
    from a data frame whose 'swift' rows were filtered out beforehand.

    NOTE(review): this test name is defined twice back to back in this file;
    if both definitions live in the same class, only the later one is
    collected -- confirm and drop the duplicate.
    """
    # `np` is used directly: the `pd.np` alias was deprecated in pandas 1.0
    # and removed in pandas 2.0.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
    swiftless = corpus.remove_categories(['swift'])
    swiftless_constructed = CorpusFromPandas(
        df[df['category'] != 'swift'], 'category', 'text', nlp=whitespace_nlp
    ).build()

    # Resolve the label of the removed category once instead of on every
    # iteration of the comprehension below.
    swift_idx = corpus.get_categories().index('swift')
    # The remaining labels must equal the original labels minus 'swift'.
    np.testing.assert_equal(
        [label for label in corpus._y if label != swift_idx],
        swiftless._y,
    )

    # Labels and matrix rows must stay aligned after the removal.
    self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
    # Matrix shape, vocabulary and texts must match the corpus that never
    # contained 'swift'.
    self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
    self.assertEqual(set(swiftless_constructed.get_terms()), set(swiftless.get_terms()))
    pd.testing.assert_series_equal(swiftless_constructed.get_texts(), swiftless.get_texts())
def test_remove_categories(self):
    """Check that ``remove_categories`` agrees with a corpus built without that category.

    The corpus with 'swift' removed is compared against one constructed
    from a data frame whose 'swift' rows were filtered out beforehand.

    NOTE(review): this test name is defined twice back to back in this file;
    if both definitions live in the same class, only the later one is
    collected -- confirm and drop the duplicate.
    """
    # `np` is used directly: the `pd.np` alias was deprecated in pandas 1.0
    # and removed in pandas 2.0.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
    swiftless = corpus.remove_categories(['swift'])
    swiftless_constructed = CorpusFromPandas(
        df[df['category'] != 'swift'], 'category', 'text', nlp=whitespace_nlp
    ).build()

    # Resolve the label of the removed category once instead of on every
    # iteration of the comprehension below.
    swift_idx = corpus.get_categories().index('swift')
    # The remaining labels must equal the original labels minus 'swift'.
    np.testing.assert_equal(
        [label for label in corpus._y if label != swift_idx],
        swiftless._y,
    )

    # Labels and matrix rows must stay aligned after the removal.
    self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
    # Matrix shape, vocabulary and texts must match the corpus that never
    # contained 'swift'.
    self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
    self.assertEqual(set(swiftless_constructed.get_terms()), set(swiftless.get_terms()))
    pd.testing.assert_series_equal(swiftless_constructed.get_texts(), swiftless.get_texts())