def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    vocab_gpu = Series(vocab.keys())
    ref = SkCountVect(vocabulary=vocab).fit_transform(DOCS)
    X = CountVectorizer(vocabulary=vocab_gpu).fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(X.todense(), ref.toarray())


def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now, whereas sklearn keeps them as tokens.
    We might want to look into this further, but it should not be a concern
    for most pipelines.
    """
    corpus = [
        "a b ",
    ]

    # We have an extra null token here; we slightly diverge from sklearn
    # by not treating it as a token.
    res = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())

    res = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus)
    )
    ref = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())


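# A minimal sketch (hypothetical helper, not part of the original suite) of the
# divergence described in the docstring above: a plain whitespace split on
# "a b " yields a trailing empty token, which sklearn's custom tokenizer keeps
# while cuML's tokenizer drops it.
def _empty_token_split_sketch():
    tokens = "a b ".split(" ")
    assert tokens == ["a", "b", ""]  # the trailing '' is the extra null token

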
# NOTE: parametrization values below are assumed for illustration; the
# original decorator is not shown in this excerpt.
@pytest.mark.parametrize('ngram_range', [(1, 1), (1, 2), (2, 3)])
def test_space_ngrams(ngram_range):
    data = ['abc def. 123 456 789']
    data_gpu = Series(data)
    vec = CountVectorizer(ngram_range=ngram_range).fit(data_gpu)
    ref = SkCountVect(ngram_range=ngram_range).fit(data)
    assert (
        ref.get_feature_names()
        == vec.get_feature_names().to_arrow().to_pylist()
    )


def test_only_delimiters():
    data = ['abc def. 123', ' ', '456 789']
    data_gpu = Series(data)
    res = CountVectorizer().fit_transform(data_gpu)
    ref = SkCountVect().fit_transform(data)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())


def test_empty_doc_after_limit_features():
    data = ['abc abc def', 'def abc', 'ghi']
    data_gpu = Series(data)
    count = CountVectorizer(min_df=2).fit_transform(data_gpu)
    ref = SkCountVect(min_df=2).fit_transform(data)
    cp.testing.assert_array_equal(count.todense(), ref.toarray())


def test_single_len():
    single_token_ser = ['S I N G L E T 0 K E N Example', '1 2 3 4 5 eg']
    single_token_gpu = Series(single_token_ser)

    cv = CountVectorizer()
    res = cv.fit_transform(single_token_gpu)
    ref = SkCountVect().fit_transform(single_token_ser)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())


# Parametrization values assumed (original decorators not shown in this
# excerpt).
@pytest.mark.parametrize('analyzer', ['char', 'char_wb'])
@pytest.mark.parametrize('ngram_range', [(1, 1), (1, 3)])
def test_character_ngrams(analyzer, ngram_range):
    # Note: the adjacent string literals concatenate, so this is
    # ['ab c', 'edf gh'].
    data = ['ab c', '' 'edf gh']
    res = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    res.fit(Series(data))
    ref = SkCountVect(analyzer=analyzer, ngram_range=ngram_range).fit(data)
    assert (
        ref.get_feature_names()
        == res.get_feature_names().to_arrow().to_pylist()
    )


def test_non_ascii():
    non_ascii = ('This is ascii,', 'but not this Αγγλικά.')
    non_ascii_gpu = Series(non_ascii)

    cv = CountVectorizer()
    res = cv.fit_transform(non_ascii_gpu)
    ref = SkCountVect().fit_transform(non_ascii)

    assert 'αγγλικά' in set(cv.get_feature_names().to_arrow().to_pylist())
    cp.testing.assert_array_equal(res.todense(), ref.toarray())


def test_count_vectorizer():
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    res = CountVectorizer().fit_transform(Series(corpus))
    ref = SkCountVect().fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())


def test_vectorizer_inverse_transform():
    vectorizer = CountVectorizer()
    transformed_data = vectorizer.fit_transform(DOCS_GPU)
    inversed_data = vectorizer.inverse_transform(transformed_data)

    sk_vectorizer = SkCountVect()
    sk_transformed_data = sk_vectorizer.fit_transform(DOCS)
    sk_inversed_data = sk_vectorizer.inverse_transform(sk_transformed_data)

    for doc, sk_doc in zip(inversed_data, sk_inversed_data):
        doc = np.sort(doc.to_arrow().to_pylist())
        sk_doc = np.sort(sk_doc)
        if len(doc) + len(sk_doc) == 0:
            continue
        assert_array_equal(doc, sk_doc)


def test_countvectorizer_stop_words():
    ref = SkCountVect(stop_words='english').fit_transform(DOCS)
    X = CountVectorizer(stop_words='english').fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(X.todense(), ref.toarray())


# Parametrization values assumed (original decorator not shown in this
# excerpt).
@pytest.mark.parametrize('ngram_range', [(1, 1), (1, 2), (2, 3)])
def test_word_analyzer(ngram_range):
    v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU)
    ref = SkCountVect(ngram_range=ngram_range).fit(DOCS)
    assert (
        ref.get_feature_names()
        == v.get_feature_names().to_arrow().to_pylist()
    )


def test_countvectorizer_separate_fit_transform():
    res = CountVectorizer().fit(DOCS_GPU).transform(DOCS_GPU)
    ref = SkCountVect().fit(DOCS).transform(DOCS)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())