Example #1
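These snippets come from a cuML text feature-extraction test suite and rely on a shared harness the excerpts do not show. Below is a minimal sketch of the assumed imports and fixtures; the corpus contents, the configuration defaults, and the body of the comparison helper are assumptions reconstructed from how the examples use them, not the suite's actual code.

import cupy as cp
import numpy as np
import pytest
from cudf import Series
from sklearn.feature_extraction.text import CountVectorizer as SkCountVect
from sklearn.feature_extraction.text import HashingVectorizer as SkHashVect

from cuml.feature_extraction.text import CountVectorizer, HashingVectorizer

# Assumed module-level configuration consumed by gpu_hashing_vectorizer.
N_FEATURES = 2 ** 20      # sklearn's default number of hash buckets
alternate_sign = True
ngram_range = (1, 1)
norm = "l2"
preprocessor = None

# Assumed shared fixtures: a small corpus on the host and on the GPU.
DOCS = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]
DOCS_GPU = Series(DOCS)


def assert_almost_equal_hash_matrices(res, ref, tol=1e-3):
    # Hypothetical helper: hashed matrices may place features in different
    # buckets across implementations, so compare sorted absolute values
    # row by row rather than the raw matrices.
    np.testing.assert_allclose(
        np.sort(np.abs(res), axis=1),
        np.sort(np.abs(ref), axis=1),
        atol=tol,
    )
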
def gpu_hashing_vectorizer(x):
    # N_FEATURES, alternate_sign, ngram_range, norm and preprocessor are
    # module-level configuration (see the setup sketch above).
    vec = HashingVectorizer(n_features=N_FEATURES,
                            alternate_sign=alternate_sign,
                            ngram_range=ngram_range,
                            norm=norm,
                            preprocessor=preprocessor)
    return vec.fit_transform(x)
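Given the assumed fixtures above, a hypothetical invocation of the helper:

X = gpu_hashing_vectorizer(DOCS_GPU)
print(X.shape)  # (4, N_FEATURES): one row per document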
Example #2
def test_hashingvectorizer_norm(norm):
    if norm not in ["l1", "l2", None]:
        # Unsupported norms should raise at transform time.
        with pytest.raises(ValueError):
            HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
    else:
        res = HashingVectorizer(norm=norm).fit_transform(DOCS_GPU)
        ref = SkHashVect(norm=norm).fit_transform(DOCS)
        assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
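No decorator is shown, so test_hashingvectorizer_norm presumably receives norm through a pytest parametrization defined alongside it. A plausible sketch; the exact value list is an assumption, and the lowercase and dtype tests below would be fed the same way:

@pytest.mark.parametrize("norm", ["l1", "l2", None, "max"])
def test_hashingvectorizer_norm(norm):
    ...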
Example #3
def test_hashingvectorizer_lowercase(lowercase):
    corpus = [
        "This Is DoC",
        "this DoC is the second DoC.",
        "And this document is the third one.",
        "and Is this the first document?",
    ]
    res = HashingVectorizer(lowercase=lowercase).fit_transform(Series(corpus))
    ref = SkHashVect(lowercase=lowercase).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
Example #4
def test_hashingvectorizer():
    corpus = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]

    res = HashingVectorizer().fit_transform(Series(corpus))
    ref = SkHashVect().fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
Example #5
def test_hashingvectorizer_delimiter():
    corpus = ["a0b0c", "a 0 b0e", "c0d0f"]
    res = HashingVectorizer(
        delimiter="0", norm=None, preprocessor=lambda s: s
    ).fit_transform(Series(corpus))
    # Equivalent logic for sklearn: tokenize by splitting on the delimiter.
    ref = SkHashVect(
        tokenizer=lambda s: s.split("0"),
        norm=None,
        token_pattern=None,
        preprocessor=lambda s: s,
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
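For reference, splitting on the delimiter is exactly the token stream both vectorizers see; plain str.split shows it:

corpus = ["a0b0c", "a 0 b0e", "c0d0f"]
print([s.split("0") for s in corpus])
# [['a', 'b', 'c'], ['a ', ' b', 'e'], ['c', 'd', 'f']]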
Example #6
def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now but sklearn treats them as a character
    we might want to look into this more but
    this should not be a concern for most piplines
    """
    corpus = [
        "a b ",
    ]

    # The trailing space produces an extra empty ("null") token here; we
    # diverge slightly from sklearn by not treating it as a token.
    res = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())

    res = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
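The empty token in question comes straight from str.split, because the corpus string ends with a space:

print("a b ".split(" "))
# ['a', 'b', ''] -- the trailing empty string is the extra "null" token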
Example #7
def test_hashingvectorizer_n_features():
    n_features = 10
    res = (
        HashingVectorizer(n_features=n_features)
        .fit_transform(DOCS_GPU).todense().get()
    )
    ref = SkHashVect(n_features=n_features).fit_transform(DOCS).toarray()
    assert res.shape == ref.shape
Example #8
def test_hashingvectorizer_alternate_sign():
    # With alternate_sign=True the output should contain both
    # positive and negative values.
    res = HashingVectorizer(alternate_sign=True).fit_transform(DOCS_GPU)
    res_f_array = res.todense().get().flatten()
    assert np.sum(res_f_array > 0) > 0
    assert np.sum(res_f_array < 0) > 0

    # With alternate_sign=False the output should contain some positive
    # values and no negative values.
    res = HashingVectorizer(alternate_sign=False).fit_transform(DOCS_GPU)
    res_f_array = res.todense().get().flatten()
    assert np.sum(res_f_array > 0) > 0
    assert np.sum(res_f_array < 0) == 0
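The sign behavior reflects how the hashing trick resolves collisions: with alternate_sign=True each token contributes with a hash-derived ±1, so colliding tokens tend to cancel rather than pile up, approximately preserving inner products; with alternate_sign=False every contribution is non-negative. A simplified sketch of the per-token update rule, not cuML's actual kernel:

import hashlib

def hash_token(token, n_features, alternate_sign=True):
    # Stand-in hash for illustration; real implementations use MurmurHash3.
    h = int.from_bytes(hashlib.md5(token.encode()).digest()[:4], "little")
    index = h % n_features          # which bucket the token lands in
    # One hash bit decides the sign so that collisions tend to cancel.
    sign = -1 if (alternate_sign and h & 0x80000000) else 1
    return index, sign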
Example #9
def test_hashingvectorizer_dtype(dtype):
    res = HashingVectorizer(dtype=dtype).fit_transform(DOCS_GPU)
    assert res.dtype == dtype
Example #10
def test_hashingvectorizer_stop_word():
    ref = SkHashVect(stop_words="english").fit_transform(DOCS)
    res = HashingVectorizer(stop_words="english").fit_transform(DOCS_GPU)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())