def test_hashingvectorizer_alternate_sign():
    """Check the sign of hashed values for both ``alternate_sign`` settings.

    With ``alternate_sign=True`` the transformed DOCS_GPU output must hold
    both positive and negative entries; with ``alternate_sign=False`` it
    must hold positive entries and no negative ones.
    """

    def _flat_hashed_values(alternate_sign):
        # Fit/transform the shared corpus and flatten the dense result.
        # NOTE(review): `.get()` presumably copies the dense array to the
        # host -- confirm against the sparse type returned by fit_transform.
        vectorizer = HashingVectorizer(alternate_sign=alternate_sign)
        hashed = vectorizer.fit_transform(DOCS_GPU)
        return hashed.todense().get().flatten()

    # alternate_sign=True: expect a mix of negative and positive values
    signed_values = _flat_hashed_values(True)
    assert np.sum(signed_values > 0, axis=0) > 0
    assert np.sum(signed_values < 0, axis=0) > 0

    # alternate_sign=False: expect some positive values and no negative ones
    unsigned_values = _flat_hashed_values(False)
    assert np.sum(unsigned_values > 0, axis=0) > 0
    assert np.sum(unsigned_values < 0, axis=0) == 0
def test_hashingvectorizer_norm(norm):
    """Check ``norm`` handling of HashingVectorizer.

    Values other than ``"l1"``, ``"l2"`` or ``None`` must make
    ``fit_transform`` raise ``ValueError``; the accepted values must yield
    a matrix matching the SkHashVect reference on the same documents.

    Parameters
    ----------
    norm
        Normalization option under test.
        NOTE(review): expected to be supplied by pytest parametrization or
        a fixture that is not visible in this part of the file.
    """
    if norm not in ("l1", "l2", None):
        # Only the exception matters here, so the result is not bound.
        with pytest.raises(ValueError):
            HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
        return

    res = HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
    ref = SkHashVect(norm=norm).fit_transform(DOCS)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
def test_hashingvectorizer_lowercase(lowercase):
    """Compare HashingVectorizer with SkHashVect on a mixed-case corpus.

    Both vectorizers receive the same ``lowercase`` setting and their
    dense outputs must agree.

    Parameters
    ----------
    lowercase
        Value forwarded to the ``lowercase`` argument of both vectorizers.
        NOTE(review): expected to come from pytest parametrization or a
        fixture that is not visible in this part of the file.
    """
    mixed_case_docs = [
        "This Is DoC",
        "this DoC is the second DoC.",
        "And this document is the third one.",
        "and Is this the first document?",
    ]

    gpu_vectorizer = HashingVectorizer(lowercase=lowercase)
    gpu_matrix = gpu_vectorizer.fit_transform(Series(mixed_case_docs))

    reference_vectorizer = SkHashVect(lowercase=lowercase)
    reference_matrix = reference_vectorizer.fit_transform(mixed_case_docs)

    assert_almost_equal_hash_matrices(
        gpu_matrix.todense().get(), reference_matrix.toarray()
    )
def test_hashingvectorizer():
    """Compare default HashingVectorizer output with the SkHashVect one.

    Both vectorizers are built with default arguments and fitted on the
    same four-sentence corpus; their dense outputs must agree.
    """
    sentences = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]

    gpu_matrix = HashingVectorizer().fit_transform(Series(sentences))
    reference_matrix = SkHashVect().fit_transform(sentences)

    actual = gpu_matrix.todense().get()
    expected = reference_matrix.toarray()
    assert_almost_equal_hash_matrices(actual, expected)
def test_hashingvectorizer_delimiter():
    """Check that a custom ``delimiter`` splits tokens as expected.

    HashingVectorizer is configured with ``delimiter="0"``; the reference
    is a SkHashVect whose tokenizer splits on ``"0"``. Normalization is
    disabled and preprocessing is the identity on both sides.
    """

    def _identity(text):
        # Pass documents through untouched so only the delimiter matters.
        return text

    def _split_on_zero(text):
        return text.split("0")

    samples = ["a0b0c", "a 0 b0e", "c0d0f"]

    gpu_vectorizer = HashingVectorizer(
        delimiter="0", norm=None, preprocessor=_identity
    )
    gpu_matrix = gpu_vectorizer.fit_transform(Series(samples))

    # equivalent logic for sklearn
    reference_vectorizer = SkHashVect(
        tokenizer=_split_on_zero,
        norm=None,
        token_pattern=None,
        preprocessor=_identity,
    )
    reference_matrix = reference_vectorizer.fit_transform(samples)

    assert_almost_equal_hash_matrices(
        gpu_matrix.todense().get(), reference_matrix.toarray()
    )
def test_hashingvectorizer_stop_word():
    """Compare both vectorizers when ``stop_words="english"`` is set.

    SkHashVect is fitted on DOCS and HashingVectorizer on DOCS_GPU; the
    dense outputs must agree.
    NOTE(review): DOCS_GPU is presumably the GPU-side copy of DOCS --
    confirm against the module-level definitions.
    """
    reference_vectorizer = SkHashVect(stop_words="english")
    expected = reference_vectorizer.fit_transform(DOCS)

    gpu_vectorizer = HashingVectorizer(stop_words="english")
    actual = gpu_vectorizer.fit_transform(DOCS_GPU)

    assert_almost_equal_hash_matrices(
        actual.todense().get(), expected.toarray()
    )