def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now, whereas sklearn treats them as a
    character. We might want to look into this more, but it should not be
    a concern for most pipelines.
    """
    corpus = ["a b "]

    # The trailing space produces an extra null token; we deliberately
    # diverge from sklearn by not treating it as a token.
    got = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    expected = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(got.todense(), expected.toarray())

    got = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    expected = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(got.todense().get(), expected.toarray())
def test_hashingvectorizer_norm(norm):
    """
    Valid norms ("l1", "l2", None) must match sklearn's output;
    any other value must raise ValueError.
    """
    if norm not in ["l1", "l2", None]:
        # Invalid norm: fit_transform must raise. The result is discarded,
        # so no assignment (the original bound an unused local here).
        with pytest.raises(ValueError):
            HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
    else:
        res = HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
        ref = SkHashVect(norm=norm).fit_transform(DOCS)
        assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
def test_hashingvectorizer_n_features():
    """The output matrix shape must match sklearn for a custom n_features."""
    n_features = 10
    ours = HashingVectorizer(n_features=n_features).fit_transform(DOCS_GPU)
    theirs = SkHashVect(n_features=n_features).fit_transform(DOCS)
    # Only the shape is compared: hashed column positions are not expected
    # to line up between the two implementations.
    assert ours.todense().get().shape == theirs.toarray().shape
def test_hashingvectorizer_lowercase(lowercase):
    """Lowercasing behaviour on mixed-case input must match sklearn."""
    docs = [
        "This Is DoC",
        "this DoC is the second DoC.",
        "And this document is the third one.",
        "and Is this the first document?",
    ]
    ours = HashingVectorizer(lowercase=lowercase).fit_transform(Series(docs))
    theirs = SkHashVect(lowercase=lowercase).fit_transform(docs)
    assert_almost_equal_hash_matrices(ours.todense().get(), theirs.toarray())
def test_hashingvectorizer():
    """Default-parameter output must match sklearn's HashingVectorizer."""
    docs = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    ours = HashingVectorizer().fit_transform(Series(docs))
    theirs = SkHashVect().fit_transform(docs)
    assert_almost_equal_hash_matrices(ours.todense().get(), theirs.toarray())
def test_hashingvectorizer_delimiter():
    """A custom delimiter must tokenize like sklearn splitting on it."""
    docs = ["a0b0c", "a 0 b0e", "c0d0f"]
    ours = HashingVectorizer(
        delimiter="0", norm=None, preprocessor=lambda s: s
    ).fit_transform(Series(docs))
    # Equivalent logic for sklearn: an explicit tokenizer splitting on "0".
    theirs = SkHashVect(
        tokenizer=lambda s: s.split("0"),
        norm=None,
        token_pattern=None,
        preprocessor=lambda s: s,
    ).fit_transform(docs)
    assert_almost_equal_hash_matrices(ours.todense().get(), theirs.toarray())
def test_hashingvectorizer_stop_word():
    """English stop-word filtering must match sklearn's built-in list."""
    theirs = SkHashVect(stop_words="english").fit_transform(DOCS)
    ours = HashingVectorizer(stop_words="english").fit_transform(DOCS_GPU)
    assert_almost_equal_hash_matrices(ours.todense().get(), theirs.toarray())