예제 #1
0
	def test_keep_only_these_categories(self):
		"""Check that ``keep_only_these_categories`` restricts a corpus to the given categories.

		Asserts that:
		  * the filtered corpus reports exactly ``['hamlet', 'swift']`` as categories;
		  * the filtered corpus has strictly fewer terms than the full corpus;
		  * asking for a category that is not in the corpus raises ``AssertionError``;
		  * the same request with ``True`` as the second positional argument
		    does not raise (presumably an "ignore missing categories" flag --
		    confirm against the method signature).
		"""
		# ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so
		# numpy is imported directly instead of being reached through pandas.
		import numpy as np

		# The helper's result is transposed so that each row supplies one
		# value for the 'category' column and one for the 'text' column.
		categories_and_docs = np.array(get_docs_categories_semiotic()).T
		df = pd.DataFrame(data=categories_and_docs, columns=['category', 'text'])
		corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
		hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
		self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
		self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
		with self.assertRaises(AssertionError):
			corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
		# Must complete without raising; the return value is intentionally unused.
		corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
예제 #2
0
 def test_keep_only_these_categories(self):
     """Check that ``keep_only_these_categories`` restricts a corpus to the given categories.

     Asserts that:
       * the filtered corpus reports exactly ``['hamlet', 'swift']`` as categories;
       * the filtered corpus has strictly fewer terms than the full corpus;
       * asking for a category that is not in the corpus raises ``AssertionError``;
       * the same request with ``True`` as the second positional argument
         does not raise (presumably an "ignore missing categories" flag --
         confirm against the method signature).
     """
     # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so
     # numpy is imported directly instead of being reached through pandas.
     import numpy as np

     # The helper's result is transposed so that each row supplies one
     # value for the 'category' column and one for the 'text' column.
     categories_and_docs = np.array(get_docs_categories_semiotic()).T
     df = pd.DataFrame(data=categories_and_docs, columns=['category', 'text'])
     corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
     hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
     self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
     self.assertGreater(len(corpus.get_terms()), len(hamlet_swift_corpus.get_terms()))
     with self.assertRaises(AssertionError):
         corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])
     # Must complete without raising; the return value is intentionally unused.
     corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
	def test_build(self):
		"""Build a term-document matrix from scikit-learn count features and check it.

		Checks the first two category names of the built matrix and the five
		terms that rank highest by scaled F-score for the 'hamlet' category.
		"""
		from sklearn.feature_extraction.text import CountVectorizer

		categories, docs = get_docs_categories_semiotic()

		# Map every category name to an integer index; the same store later
		# supplies the category names handed to the matrix factory.
		category_index = IndexStore()
		labels = np.array([category_index.getidx(name) for name in categories])

		vectorizer = CountVectorizer()
		doc_term_counts = vectorizer.fit_transform(docs)

		factory = TermDocMatrixFromScikit(
			X=doc_term_counts,
			y=labels,
			feature_vocabulary=vectorizer.vocabulary_,
			category_names=category_index.values(),
		)
		tdm = factory.build()

		leading_categories = tdm.get_categories()[:2]
		self.assertEqual(leading_categories, ['hamlet', 'jay-z/r. kelly'])

		# Attach the 'hamlet' scores to the term frequency table and rank
		# terms from highest to lowest score.
		term_freqs = tdm.get_term_freq_df()
		scored = term_freqs.assign(score=tdm.get_scaled_f_scores('hamlet'))
		ranked = scored.sort_values(by='score', ascending=False)
		top_five_terms = ranked.index.tolist()[:5]
		self.assertEqual(top_five_terms,
		                 ['that', 'march', 'did', 'majesty', 'sometimes'])
예제 #4
0
    def test_remove_categories(self):
        """Check that ``remove_categories`` matches a corpus built without that category.

        A corpus with 'swift' removed via ``remove_categories`` is compared
        against one constructed from the data frame with the 'swift' rows
        filtered out beforehand. The two must agree on matrix shape, term set
        and texts, and the remaining ``_y`` entries must equal the original
        ones with the 'swift' index dropped.
        """
        # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0;
        # use the ``np`` name this test already relies on.
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text',
                                  nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()

        # Look the index up once instead of once per element of ``corpus._y``.
        swift_idx = corpus.get_categories().index('swift')
        expected_y = [i for i in corpus._y if i != swift_idx]
        np.testing.assert_equal(expected_y, swiftless._y)

        # One ``_y`` entry per row of ``_X``.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        # Terms are compared as sets, so their order is not checked.
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())
예제 #5
0
    def test_remove_categories(self):
        """Check that ``remove_categories`` matches a corpus built without that category.

        A corpus with 'swift' removed via ``remove_categories`` is compared
        against one constructed from the data frame with the 'swift' rows
        filtered out beforehand. The two must agree on matrix shape, term set
        and texts, and the remaining ``_y`` entries must equal the original
        ones with the 'swift' index dropped.
        """
        # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0;
        # use the ``np`` name this test already relies on.
        df = pd.DataFrame(data=np.array(get_docs_categories_semiotic()).T,
                          columns=['category', 'text'])
        corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
        swiftless = corpus.remove_categories(['swift'])

        swiftless_constructed = CorpusFromPandas(df[df['category'] != 'swift'],
                                                 'category',
                                                 'text',
                                                 nlp=whitespace_nlp).build()

        # Look the index up once instead of once per element of ``corpus._y``.
        swift_idx = corpus.get_categories().index('swift')
        np.testing.assert_equal(
            [i for i in corpus._y if i != swift_idx],
            swiftless._y
        )

        # One ``_y`` entry per row of ``_X``.
        self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
        self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
        # Terms are compared as sets, so their order is not checked.
        self.assertEqual(set(swiftless_constructed.get_terms()),
                         set(swiftless.get_terms()))
        pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                       swiftless.get_texts())