def test_word_analyzer_unigrams_and_bigrams():
    wa = CountVectorizer(
        analyzer="word", strip_accents="unicode", ngram_range=(1, 2)
    ).build_analyzer()

    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = [
        "ai",
        "mange",
        "du",
        "kangourou",
        "ce",
        "midi",
        "etait",
        "pas",
        "tres",
        "bon",
        "ai mange",
        "mange du",
        "du kangourou",
        "kangourou ce",
        "ce midi",
        "midi etait",
        "etait pas",
        "pas tres",
        "tres bon",
    ]
    assert wa(text) == expected

def test_unicode_decode_error():
    # decode_error defaults to strict, so this should fail
    # First, encode (as bytes) a unicode string.
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    text_bytes = text.encode("utf-8")

    # Then let the Analyzer try to decode it as ascii. It should fail,
    # because we have given it an incorrect encoding.
    wa = CountVectorizer(ngram_range=(1, 2), encoding="ascii").build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        wa(text_bytes)

    ca = CountVectorizer(
        analyzer="char", ngram_range=(3, 6), encoding="ascii"
    ).build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        ca(text_bytes)

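# Illustrative sketch added for contrast (not part of the original suite; the
# test name is ours): decode_error="replace" substitutes undecodable bytes with
# the Unicode replacement character instead of raising, so the same
# ascii-encoded analyzer succeeds with lossy output.
def test_unicode_decode_error_replace_sketch():
    text_bytes = "J'ai mangé du kangourou ce midi".encode("utf-8")
    wa = CountVectorizer(encoding="ascii", decode_error="replace").build_analyzer()
    tokens = wa(text_bytes)
    # The pure-ascii tokens survive even though the accented byte was replaced.
    assert "kangourou" in tokens
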
def test_vectorizer_unicode():
    # tests that the count vectorizer works with Cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
    )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert X_counted.shape == (1, 12)

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert X_hashed.shape == (1, 2**20)

    # No collisions on such a small dataset
    assert X_counted.nnz == X_hashed.nnz

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))

def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=0.2, random_state=0
    )

    pipeline = Pipeline([("vect", CountVectorizer()), ("svc", LinearSVC())])

    parameters = {
        "vect__ngram_range": [(1, 1), (1, 2)],
        "svc__loss": ("hinge", "squared_hinge"),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, cv=3)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset all parameter combinations converge to 100% accuracy
    # models, so the first candidate (the unigram representation) is kept as
    # the best estimator
    assert grid_search.best_score_ == 1.0
    best_vectorizer = grid_search.best_estimator_.named_steps["vect"]
    assert best_vectorizer.ngram_range == (1, 1)

def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline(
        [
            ("count", CountVectorizer(vocabulary=what_we_like)),
            ("tfidf", TfidfTransformer()),
        ]
    )
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert set(pipe.named_steps["count"].vocabulary_) == set(what_we_like)
    assert X.shape[1] == len(what_we_like)

def test_char_wb_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="char_wb", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = [" th", "thi", "his", "is ", " thi"]
    assert cnga(text)[:5] == expected

    expected = ["yester", "esterd", "sterda", "terday", "erday "]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(
        input="file", analyzer="char_wb", ngram_range=(3, 6)
    ).build_analyzer()
    text = StringIO("A test with a file-like object!")
    expected = [" a ", " te", "tes", "est", "st ", " tes"]
    assert cnga(text)[:6] == expected

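# Illustrative sketch (an addition, not part of the original suite; the test
# name is ours): it makes the word-boundary behaviour above explicit. With
# analyzer="char_wb" each word is padded with a single space on both sides and
# n-grams never span across word boundaries.
def test_char_wb_padding_sketch():
    analyzer = CountVectorizer(analyzer="char_wb", ngram_range=(5, 5)).build_analyzer()
    assert analyzer("jumpy fox") == [" jump", "jumpy", "umpy ", " fox "]
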
def test_pickling_vectorizer():
    instances = [
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert type(copy) == orig.__class__
        assert copy.get_params() == orig.get_params()
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray(),
        )

def test_stop_words_removal():
    # Ensure that deleting the stop_words_ attribute doesn't affect transform
    fitted_vectorizers = (
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
    )

    for vect in fitted_vectorizers:
        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        vect.stop_words_ = None
        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        delattr(vect, "stop_words_")
        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        assert_array_equal(stop_None_transform, vect_transform)
        assert_array_equal(stop_del_transform, vect_transform)

@pytest.mark.parametrize(
    "factory",
    [
        CountVectorizer.build_analyzer,
        CountVectorizer.build_preprocessor,
        CountVectorizer.build_tokenizer,
    ],
)
def test_pickling_built_processors(factory):
    """Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    """
    vec = CountVectorizer()
    function = factory(vec)
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    roundtripped_function = pickle.loads(pickle.dumps(function))
    expected = function(text)
    result = roundtripped_function(text)
    assert result == expected

def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, "tocsr"):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, "tocsr"):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

def test_vectorizer_min_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", min_df=1)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
    assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
    assert len(vect.vocabulary_.keys()) == 1  # {a} remains
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 5

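# Minimal illustrative sketch (an addition, not part of the original suite; the
# test name and toy corpus are ours): a float min_df is a proportion of
# documents, and only terms whose document frequency is *strictly* lower than
# min_df * n_documents are pruned, so a term sitting exactly on the boundary
# survives.
def test_min_df_proportion_boundary_sketch():
    docs = ["apple banana", "apple cherry", "banana cherry", "apple fig"]
    vect = CountVectorizer(min_df=0.5)  # 0.5 * 4 documents -> cutoff of 2
    vect.fit(docs)
    assert "apple" in vect.vocabulary_  # df == 3, above the cutoff
    assert "banana" in vect.vocabulary_  # df == 2, exactly on the cutoff
    assert "fig" not in vect.vocabulary_  # df == 1, strictly below
    assert "fig" in vect.stop_words_
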
def test_count_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal(["a", "b", "c", "d", "e"], vect.get_feature_names())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer="char", max_df=1.0, binary=True)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer="char", max_df=1.0, binary=True, dtype=np.float32)
    X_sparse = vect.fit_transform(test_data)
    assert X_sparse.dtype == np.float32

def test_word_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="word", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["this is test", "is test really", "test really met"]
    assert cnga(text)[:3] == expected

    expected = [
        "test really met harry yesterday",
        "this is test really met harry",
        "is test really met harry yesterday",
    ]
    assert cnga(text)[-3:] == expected

    cnga_file = CountVectorizer(
        input="file", analyzer="word", ngram_range=(3, 6)
    ).build_analyzer()
    file = StringIO(text)
    assert cnga_file(file) == cnga(text)

def test_vectorizer_max_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2

    vect.max_df = 1
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2

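# Illustrative sketch (an addition, not part of the original suite; the test
# name and toy corpus are ours): it spells out the distinction implied above
# between max_df=1 and max_df=1.0. An integer max_df is an absolute document
# count while a float is a proportion of documents, so the two prune very
# different terms on the same corpus.
def test_max_df_int_vs_float_sketch():
    docs = ["apple banana", "apple cherry", "apple durian"]
    vocab_float = CountVectorizer(max_df=1.0).fit(docs).vocabulary_
    vocab_int = CountVectorizer(max_df=1).fit(docs).vocabulary_
    assert "apple" in vocab_float  # df proportion 1.0 is not strictly above 1.0
    assert "apple" not in vocab_int  # df count 3 exceeds the absolute cutoff of 1
    assert "banana" in vocab_int  # df count 1 is allowed
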
def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ]
    )
    for x in range(0, 100):
        vocab_set = set(rng.choice(vocab_words, size=5, replace=False))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert cv.get_feature_names() == unpickled_cv.get_feature_names()

def test_countvectorizer_empty_vocabulary():
    try:
        vect = CountVectorizer(vocabulary=[])
        vect.fit(["foo"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert "empty vocabulary" in str(e).lower()

    try:
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert "empty vocabulary" in str(e).lower()

def test_char_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="char", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "J'ai mangé du kangourou ce midi, c'était pas très bon"
    expected = ["j'a", "'ai", "ai ", "i m", " ma"]
    assert cnga(text)[:5] == expected
    expected = ["s tres", " tres ", "tres b", "res bo", "es bon"]
    assert cnga(text)[-5:] == expected

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected
    expected = [" yeste", "yester", "esterd", "sterda", "terday"]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(
        input="file", analyzer="char", ngram_range=(3, 6)
    ).build_analyzer()
    text = StringIO("This is a test with a file-like object!")
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected

def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ]
    )
    for x in range(0, 100):
        vocab_dict = dict()
        words = rng.choice(vocab_words, size=5, replace=False)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert cv.get_feature_names() == unpickled_cv.get_feature_names()

def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert vect.vocabulary_ == vocab
        else:
            assert set(vect.vocabulary_) == terms
        X = vect.transform(JUNK_FOOD_DOCS)
        assert X.shape[1] == len(terms)
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        inv = vect.inverse_transform(X)
        assert len(inv) == X.shape[0]

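# Illustrative sketch (an addition, not part of the original suite; the test
# name is ours): with a fixed vocabulary the column order follows the supplied
# indices and out-of-vocabulary tokens are simply ignored at transform time.
def test_fixed_vocabulary_column_order_sketch():
    vect = CountVectorizer(vocabulary={"pizza": 0, "beer": 1})
    X = vect.fit_transform(["beer beer pizza", "water"])
    # column 0 counts "pizza", column 1 counts "beer"; "water" is dropped
    assert X.toarray().tolist() == [[1, 2], [0, 0]]
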
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = (
        "Your stop_words may be inconsistent with your "
        "preprocessing. Tokenizing the stop words generated "
        "tokens %s not in stop_words." % lstr
    )
    for vec in [CountVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", "AND"])
        assert_warns_message(UserWarning, message, vec.fit_transform, ["hello world"])

        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

        # Only one warning per stop list
        assert_no_warnings(vec.fit_transform, ["hello world"])
        assert _check_stop_words_consistency(vec) is None

        # Test caching of inconsistency assessment
        vec.set_params(stop_words=["you've", "you", "you'll", "blah", "AND"])
        assert_warns_message(UserWarning, message, vec.fit_transform, ["hello world"])

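# Illustrative sketch (an addition, not part of the original suite; the test
# name is ours): it shows why the warning above lists "ve" and "ll". The
# default token pattern splits contractions, so raw stop words such as
# "you've" never match the tokens that the analyzer actually produces.
def test_stop_word_tokenization_sketch():
    tokenize = CountVectorizer().build_tokenizer()
    assert tokenize("you've") == ["you", "ve"]
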
def test_countvectorizer_sort_features_64bit_sparse_indices():
    """
    Check that CountVectorizer._sort_features preserves the dtype of its
    sparse feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    """
    X = sparse.csr_matrix((5, 5), dtype=np.int64)

    # force indices and indptr to int64.
    INDICES_DTYPE = np.int64
    X.indices = X.indices.astype(INDICES_DTYPE)
    X.indptr = X.indptr.astype(INDICES_DTYPE)

    vocabulary = {"scikit-learn": 0, "is": 1, "great!": 2}

    Xs = CountVectorizer()._sort_features(X, vocabulary)

    assert INDICES_DTYPE == Xs.indices.dtype

def test_non_unique_vocab():
    vocab = ["a", "b", "c", "a", "a"]
    vect = CountVectorizer(vocabulary=vocab)
    with pytest.raises(ValueError):
        vect.fit([])

def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert "vocabulary contains repeated indices" in str(e).lower()

def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert "doesn't contain index" in str(e).lower()

def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert cv.get_stop_words() == ENGLISH_STOP_WORDS

    cv.set_params(stop_words="_bad_str_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()

    cv.set_params(stop_words="_bad_unicode_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()

    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert cv.get_stop_words() == set(stoplist)

def test_fit_countvectorizer_twice():
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert X1.shape[1] != X2.shape[1]

def test_count_vectorizer_max_features():
    # Regression test: max_features didn't work correctly in 0.14.
    cv_1 = CountVectorizer(max_features=1)
    cv_3 = CountVectorizer(max_features=3)
    cv_None = CountVectorizer(max_features=None)

    counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)

    features_1 = cv_1.get_feature_names()
    features_3 = cv_3.get_feature_names()
    features_None = cv_None.get_feature_names()

    # The most common feature is "the", with frequency 7.
    assert 7 == counts_1.max()
    assert 7 == counts_3.max()
    assert 7 == counts_None.max()

    # The most common feature should be the same
    assert "the" == features_1[np.argmax(counts_1)]
    assert "the" == features_3[np.argmax(counts_3)]
    assert "the" == features_None[np.argmax(counts_None)]

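# Illustrative sketch (an addition, not part of the original suite; the test
# name and toy corpus are ours): max_features keeps the terms with the highest
# total count across the corpus, not the ones seen first.
def test_max_features_ordering_sketch():
    docs = ["cat cat cat dog", "dog bird"]
    vect = CountVectorizer(max_features=2)
    vect.fit(docs)
    # "cat" (3 occurrences) and "dog" (2) win over "bird" (1)
    assert set(vect.get_feature_names()) == {"cat", "dog"}
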
def test_feature_names():
    cv = CountVectorizer(max_df=0.5)

    # test for ValueError on unfitted/empty vocabulary
    with pytest.raises(ValueError):
        cv.get_feature_names()
    assert not cv.fixed_vocabulary_

    # test for vocabulary learned from data
    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert len(cv.vocabulary_) == n_features

    feature_names = cv.get_feature_names()
    assert len(feature_names) == n_features
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)

    # test for custom vocabulary
    vocab = [
        "beer",
        "burger",
        "celeri",
        "coke",
        "pizza",
        "salad",
        "sparkling",
        "tomato",
        "water",
    ]

    cv = CountVectorizer(vocabulary=vocab)
    feature_names = cv.get_feature_names()
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )
    assert cv.fixed_vocabulary_

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)

@pytest.mark.parametrize("Vectorizer", (CountVectorizer,))
def test_vectorizer_string_object_as_input(Vectorizer):
    message = "Iterable over raw text documents expected, string object received."
    vec = Vectorizer()
    assert_raise_message(ValueError, message, vec.fit_transform, "hello world!")
    assert_raise_message(ValueError, message, vec.fit, "hello world!")
    vec.fit(["some text", "some other text"])
    assert_raise_message(ValueError, message, vec.transform, "hello world!")


@pytest.mark.parametrize(
    "vec",
    [
        CountVectorizer(ngram_range=(2, 1)),
    ],
)
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers can be initialized with an invalid ngram range;
    # check that a helpful error message is raised at fit time
    invalid_range = vec.ngram_range
    message = (
        "Invalid value for ngram_range=%s "
        "lower boundary larger than the upper boundary." % str(invalid_range)
    )
    assert_raise_message(ValueError, message, vec.fit, ["good news everyone"])
    assert_raise_message(ValueError, message, vec.fit_transform, ["good news everyone"])


def _check_stop_words_consistency(estimator):