Exemplo n.º 1
0
def test_countvectorizer_max_features_counts():
    """Check the top term and its count for several ``max_features`` values.

    Three vectorizers (``max_features`` of 1, 3 and None) are fitted on
    JUNK_FOOD_DOCS; each one must report "the" as its most frequent feature
    with a total count of 7.
    """
    docs_gpu = Series(JUNK_FOOD_DOCS)

    limits = (1, 3, None)
    vectorizers = [CountVectorizer(max_features=limit) for limit in limits]

    # Per-feature totals over all documents, one entry per vectorizer.
    totals = [
        vectorizer.fit_transform(docs_gpu).sum(axis=0)
        for vectorizer in vectorizers
    ]
    vocabularies = [
        vectorizer.get_feature_names() for vectorizer in vectorizers
    ]

    # The most common feature is "the", with frequency 7.
    for total in totals:
        assert 7 == total.max()

    # The most common feature should be the same
    for vocabulary, total in zip(vocabularies, totals):
        # argmax yields an array; turn it into a plain Python integer
        # before using it to index the feature names.
        top_position = cp.argmax(total).astype(cp.int32).item()
        assert "the" == vocabulary[top_position]
Exemplo n.º 2
0
def test_sngle_len():
    """Compare against SkCountVect on documents of mostly one-character tokens.

    The dense document-term matrix produced by CountVectorizer must equal
    the one produced by SkCountVect for the same two documents.
    """
    host_docs = ['S I N G L E T 0 K E N Example', '1 2 3 4 5 eg']
    gpu_docs = Series(host_docs)

    gpu_vectorizer = CountVectorizer()
    gpu_counts = gpu_vectorizer.fit_transform(gpu_docs)

    reference_counts = SkCountVect().fit_transform(host_docs)

    cp.testing.assert_array_equal(
        gpu_counts.todense(), reference_counts.toarray()
    )
def test_non_ascii():
    """Check that non-ASCII tokens are kept and counted like SkCountVect.

    The lower-cased Greek word must appear among the fitted feature names,
    and the dense count matrix must equal the SkCountVect result.
    """
    host_docs = ('This is ascii,', 'but not this Αγγλικά.')
    gpu_docs = Series(host_docs)

    vectorizer = CountVectorizer()
    gpu_counts = vectorizer.fit_transform(gpu_docs)
    reference_counts = SkCountVect().fit_transform(host_docs)

    # Bring the feature names back to the host as a plain Python collection.
    feature_names = vectorizer.get_feature_names().to_arrow().to_pylist()
    vocabulary = set(feature_names)

    assert 'αγγλικά' in vocabulary
    cp.testing.assert_array_equal(
        gpu_counts.todense(), reference_counts.toarray()
    )
Exemplo n.º 4
0
def test_count_binary_occurrences():
    """Exercise the ``binary`` and ``dtype`` options with a char analyzer.

    Covers three configurations on the same two documents: plain counting,
    binary presence/absence, and binary output with an explicit dtype.
    """
    corpus = Series(['aaabc', 'abbde'])

    # Default behaviour: every occurrence of a character is counted.
    counting = CountVectorizer(analyzer='char', max_df=1.0)
    dense_counts = cp.asnumpy(counting.fit_transform(corpus).todense())
    vocabulary = counting.get_feature_names().to_arrow().to_pylist()
    assert_array_equal(['a', 'b', 'c', 'd', 'e'], vocabulary)
    expected_counts = [[3, 1, 1, 0, 0],
                       [1, 2, 0, 1, 1]]
    assert_array_equal(expected_counts, dense_counts)

    # With binary=True each entry only records whether the character
    # occurs in the document at all.
    presence = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    dense_presence = cp.asnumpy(presence.fit_transform(corpus).todense())
    expected_presence = [[1, 1, 1, 0, 0],
                         [1, 1, 0, 1, 1]]
    assert_array_equal(expected_presence, dense_presence)

    # The dtype of the result can be chosen by the caller.
    typed = CountVectorizer(analyzer='char', max_df=1.0,
                            binary=True, dtype=cp.float32)
    typed_result = typed.fit_transform(corpus)
    assert typed_result.dtype == cp.float32
Exemplo n.º 5
0
def test_vectorizer_inverse_transform():
    """Check ``inverse_transform`` against SkCountVect, document by document.

    Each document's recovered terms are sorted on both sides before they
    are compared, so term order within a document does not matter.
    """
    gpu_vectorizer = CountVectorizer()
    gpu_matrix = gpu_vectorizer.fit_transform(DOCS_GPU)
    gpu_terms_per_doc = gpu_vectorizer.inverse_transform(gpu_matrix)

    sk_vectorizer = SkCountVect()
    sk_matrix = sk_vectorizer.fit_transform(DOCS)
    sk_terms_per_doc = sk_vectorizer.inverse_transform(sk_matrix)

    for gpu_terms, sk_terms in zip(gpu_terms_per_doc, sk_terms_per_doc):
        actual = np.sort(gpu_terms.to_arrow().to_pylist())
        expected = np.sort(sk_terms)
        # Nothing to compare when both sides recovered no terms.
        if len(actual) == 0 and len(expected) == 0:
            continue
        assert_array_equal(actual, expected)