def test_countvectorizer_max_features_counts():
    """max_features must cap the vocabulary without changing the top term.

    For every cap (1, 3, and uncapped) the most frequent feature should
    survive pruning and keep its raw count.
    """
    gpu_docs = Series(JUNK_FOOD_DOCS)

    def peak_and_term(cap):
        # Fit with the given cap; return (highest column sum, its feature).
        vec = CountVectorizer(max_features=cap)
        counts = vec.fit_transform(gpu_docs).sum(axis=0)
        names = vec.get_feature_names()
        top_idx = cp.argmax(counts).astype(cp.int32).item()
        return counts.max(), names[top_idx]

    for cap in (1, 3, None):
        peak, term = peak_and_term(cap)
        # "the" is the most common feature, with frequency 7.
        assert peak == 7
        assert term == "the"
def test_sngle_len():
    # NOTE(review): the name looks like a typo for "single"; kept unchanged
    # so the pytest test id stays stable.
    """Single-character tokens must match scikit-learn's tokenization."""
    docs = ['S I N G L E T 0 K E N Example', '1 2 3 4 5 eg']
    gpu_counts = CountVectorizer().fit_transform(Series(docs))
    cpu_counts = SkCountVect().fit_transform(docs)
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())
def test_non_ascii():
    """Non-ASCII text must tokenize and lowercase the same as scikit-learn."""
    docs = ('This is ascii,', 'but not this Αγγλικά.')
    vec = CountVectorizer()
    gpu_counts = vec.fit_transform(Series(docs))
    cpu_counts = SkCountVect().fit_transform(docs)
    # Lowercasing must also apply to non-ASCII (Greek) letters.
    vocab = set(vec.get_feature_names().to_arrow().to_pylist())
    assert 'αγγλικά' in vocab
    cp.testing.assert_array_equal(gpu_counts.todense(), cpu_counts.toarray())
def test_count_binary_occurrences():
    """binary=True must report presence/absence instead of raw counts."""
    docs = Series(['aaabc', 'abbde'])

    # Default behavior: every character occurrence is counted.
    vec = CountVectorizer(analyzer='char', max_df=1.0)
    counts = cp.asnumpy(vec.fit_transform(docs).todense())
    assert_array_equal(['a', 'b', 'c', 'd', 'e'],
                       vec.get_feature_names().to_arrow().to_pylist())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], counts)

    # binary=True clamps every non-zero count down to 1.
    vec = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    binary_counts = cp.asnumpy(vec.fit_transform(docs).todense())
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], binary_counts)

    # The output dtype is configurable via the dtype parameter.
    vec = CountVectorizer(analyzer='char', max_df=1.0, binary=True,
                          dtype=cp.float32)
    assert vec.fit_transform(docs).dtype == cp.float32
def test_vectorizer_inverse_transform():
    """inverse_transform must recover per-document terms like scikit-learn."""
    gpu_vec = CountVectorizer()
    gpu_inverse = gpu_vec.inverse_transform(gpu_vec.fit_transform(DOCS_GPU))
    sk_vec = SkCountVect()
    sk_inverse = sk_vec.inverse_transform(sk_vec.fit_transform(DOCS))
    for gpu_doc, sk_doc in zip(gpu_inverse, sk_inverse):
        # Sort both sides so the comparison is order-independent.
        gpu_terms = np.sort(gpu_doc.to_arrow().to_pylist())
        sk_terms = np.sort(sk_doc)
        # Both empty: skip — presumably to avoid comparing empty arrays
        # of mismatched dtypes; kept as in the original.
        if len(gpu_terms) + len(sk_terms) == 0:
            continue
        assert_array_equal(gpu_terms, sk_terms)