示例#1
0
    def test_count_not_tokenized_yet(self):
        """count on a not-yet-tokenized Series still works but warns."""
        series = pd.Series("a b c c")
        expected = pd.Series([[1, 1, 2]])

        # First call: silence warnings so only the return value is checked.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.assertEqual(representation.count(series), expected)

        # Second call: the deprecation warning itself must be raised.
        with self.assertWarns(DeprecationWarning):
            representation.count(series)
示例#2
0
    ],
    ["remove_angle_brackets", preprocessing.remove_angle_brackets, (s_text, )],
    ["remove_brackets", preprocessing.remove_brackets, (s_text, )],
    ["remove_html_tags", preprocessing.remove_html_tags, (s_text, )],
    ["tokenize", preprocessing.tokenize, (s_text, )],
    ["phrases", preprocessing.phrases, (s_tokenized_lists, )],
    ["replace_urls", preprocessing.replace_urls, (s_text, "")],
    ["remove_urls", preprocessing.remove_urls, (s_text, )],
    ["replace_tags", preprocessing.replace_tags, (s_text, "")],
    ["remove_tags", preprocessing.remove_tags, (s_text, )],
]

test_cases_representation = [
    [
        "count",
        lambda x: representation.flatten(representation.count(x)),
        (s_tokenized_lists, ),
    ],
    [
        "term_frequency",
        lambda x: representation.flatten(representation.term_frequency(x)),
        (s_tokenized_lists, ),
    ],
    [
        "tfidf",
        lambda x: representation.flatten(representation.tfidf(x)),
        (s_tokenized_lists, ),
    ],
    ["pca", representation.pca, (s_numeric_lists, 0)],
    ["nmf", representation.nmf, (s_numeric_lists, )],
    ["tsne", representation.tsne, (s_numeric_lists, )],
示例#3
0
 def test_count_punctuation_are_kept(self):
     """A punctuation token is counted as its own term."""
     expected = pd.Series([[1, 1]])
     tokens = preprocessing.tokenize(pd.Series(["one !"]))
     self.assertEqual(representation.count(tokens), expected)
示例#4
0
 def test_count_not_lowercase(self):
     """count is case-sensitive: 'one' and 'ONE' are distinct terms."""
     expected = pd.Series([[1, 1]])
     tokens = preprocessing.tokenize(pd.Series(["one ONE"]))
     self.assertEqual(representation.count(tokens), expected)
示例#5
0
 def test_count_multiple_documents(self):
     """Each document gets a count row over the shared vocabulary."""
     expected = pd.Series([[1, 0], [0, 1]])
     tokens = preprocessing.tokenize(pd.Series(["doc_one", "doc_two"]))
     self.assertEqual(representation.count(tokens), expected)
示例#6
0
 def test_count_single_document(self):
     """Repeated tokens in one document accumulate in its count vector."""
     expected = pd.Series([[1, 1, 2]])
     tokens = preprocessing.tokenize(pd.Series("a b c c"))
     self.assertEqual(representation.count(tokens), expected)