def test_idf_not_tokenized_yet(self): s = pd.Series("a") s_true = pd.Series([[1]]) with warnings.catch_warnings(): # avoid print warning warnings.simplefilter("ignore") self.assertEqual(representation.tfidf(s), s_true) with self.assertWarns(DeprecationWarning): # check raise warning representation.tfidf(s)
def test_idf_single_not_lowercase(self): tfidf_single_smooth = 0.7071067811865475 # TODO s = pd.Series("ONE one") s = preprocessing.tokenize(s) s_true = pd.Series([[tfidf_single_smooth, tfidf_single_smooth]]) self.assertEqual(representation.tfidf(s), s_true)
def test_tfidf_formula(self): s = pd.Series(["Hi Bye", "Test Bye Bye"]) s = preprocessing.tokenize(s) s_true_index = pd.MultiIndex.from_tuples([(0, "Bye"), (0, "Hi"), (1, "Bye"), (1, "Test")], ) s_true = pd.Series([_tfidf(x[1], s, x[0]) for x in s_true_index], index=s_true_index).astype("Sparse") self.assertEqual(representation.tfidf(s), s_true)
def test_tfidf_formula(self): s = pd.Series(["Hi Bye", "Test Bye Bye"]) s = preprocessing.tokenize(s) s_true = pd.Series([ [ 1.0 * (math.log(3 / 3) + 1), 1.0 * (math.log(3 / 2) + 1), 0.0 * (math.log(3 / 2) + 1), ], [ 2.0 * (math.log(3 / 3) + 1), 0.0 * (math.log(3 / 2) + 1), 1.0 * (math.log(3 / 2) + 1), ], ]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s), s_true)
def test_idf_single_lowercase(self): s = pd.Series("ONE one") s_true = pd.Series([[1.0]]) self.assertEqual(representation.tfidf(s, lowercase=True), s_true)
def test_idf_single_document(self): s = pd.Series("a") s_true = pd.Series([[1]]) self.assertEqual(representation.tfidf(s), s_true)
def test_tfidf_single_document(self): s = pd.Series("a", index=["yo"]) s = preprocessing.tokenize(s) s_true = pd.Series([[1]], index=["yo"]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s), s_true)
def test_tfidf_max_df(self): s = pd.Series([["one"], ["one", "two"]]) s_true = pd.Series([[0.0], [1.4054651081081644]]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s, max_df=1), s_true)
def test_tfidf_min_df(self): s = pd.Series([["one"], ["one", "two"]]) s_true = pd.Series([[1.0], [1.0]]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s, min_df=2), s_true)
def test_tfidf_max_features(self): s = pd.Series("one one two") s = preprocessing.tokenize(s) s_true = pd.Series([[2.0]]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s, max_features=1), s_true)
def test_tfidf_single_not_lowercase(self): s = pd.Series("ONE one") s = preprocessing.tokenize(s) s_true = pd.Series([[1.0, 1.0]]) s_true.rename_axis("document", inplace=True) self.assertEqual(representation.tfidf(s), s_true)
def test_idf_not_tokenized_yet(self): s = pd.Series("a") s_true = pd.Series([[1]]) self.assertEqual(representation.tfidf(s), s_true)
# Closes a test-case list begun above this chunk — presumably
# test_cases_preprocessing; verify against the full file.
]

# Representation test cases: [name, callable under test, args tuple].
# Vector-producing functions are wrapped so their output is flattened
# before comparison.
test_cases_representation = [
    [
        "count",
        lambda x: representation.flatten(representation.count(x)),
        (s_tokenized_lists, ),
    ],
    [
        "term_frequency",
        lambda x: representation.flatten(representation.term_frequency(x)),
        (s_tokenized_lists, ),
    ],
    [
        "tfidf",
        lambda x: representation.flatten(representation.tfidf(x)),
        (s_tokenized_lists, ),
    ],
    # Dimensionality reduction / clustering take numeric inputs; the extra
    # scalar (0, 1) is a positional argument to the function under test.
    ["pca", representation.pca, (s_numeric_lists, 0)],
    ["nmf", representation.nmf, (s_numeric_lists, )],
    ["tsne", representation.tsne, (s_numeric_lists, )],
    ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
    ["dbscan", representation.dbscan, (s_numeric_lists, )],
    ["meanshift", representation.meanshift, (s_numeric_lists, )],
]

# No visualization test cases yet.
test_cases_visualization = []

# Combined suite: nlp + preprocessing + representation + visualization.
test_cases = (
    test_cases_nlp
    + test_cases_preprocessing
    + test_cases_representation
    + test_cases_visualization
)