Example #1
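These snippets are excerpted from a GPU text-vectorization test module, so the shared imports and fixtures are not shown on this page. Below is a minimal sketch of the preamble they assume; the DOCS/DOCS_GPU/JUNK_FOOD_DOCS corpora are illustrative stand-ins, not the originals.

# Hypothetical reconstruction of the test module's preamble.
import cupy as cp
import numpy as np
import pytest
from cudf import Series
from numpy.testing import assert_array_equal
from cuml.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer as SkCountVect
from sklearn.feature_extraction.text import HashingVectorizer as SkHashVect

# Illustrative corpora only -- the assertions in Examples #10 and #21
# depend on the exact documents used by the original module.
DOCS = ["the pizza pizza beer", "the burger beer beer"]
DOCS_GPU = Series(DOCS)
JUNK_FOOD_DOCS = ("the pizza pizza beer", "the the pizza beer beer")
# assert_almost_equal_hash_matrices is a helper defined in the original
# module; it is not reconstructed here.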
def test_vectorizer_empty_token_case():
    """
    We ignore empty tokens right now but sklearn treats them as a character
    we might want to look into this more but
    this should not be a concern for most piplines
    """
    corpus = [
        "a b ",
    ]

    # The trailing space produces an extra null token here; we diverge
    # slightly from sklearn by not treating it as a token.
    res = CountVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus))
    ref = SkCountVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())

    res = HashingVectorizer(preprocessor=lambda s: s).fit_transform(
        Series(corpus))
    ref = SkHashVect(
        preprocessor=lambda s: s, tokenizer=lambda s: s.split(" ")
    ).fit_transform(corpus)
    assert_almost_equal_hash_matrices(res.todense().get(), ref.toarray())
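The divergence described in the docstring comes from how Python's str.split behaves on a trailing delimiter; a quick standard-library illustration:

# "a b ".split(" ") keeps the empty string produced by the trailing space,
# so the sklearn tokenizer above sees three tokens (the last one empty),
# while the GPU vectorizer drops it.
tokens = "a b ".split(" ")
assert tokens == ["a", "b", ""]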
Example #2
def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    vocab_gpu = Series(vocab.keys())

    ref = SkCountVect(vocabulary=vocab).fit_transform(DOCS)
    X = CountVectorizer(vocabulary=vocab_gpu).fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(X.todense(), ref.toarray())
Example #3
def test_space_ngrams(ngram_range):
    data = ['abc      def. 123 456    789']
    data_gpu = Series(data)
    vec = CountVectorizer(ngram_range=ngram_range).fit(data_gpu)
    ref = SkCountVect(ngram_range=ngram_range).fit(data)
    assert (ref.get_feature_names() ==
            vec.get_feature_names().to_arrow().to_pylist())
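Several tests on this page (test_space_ngrams, test_character_ngrams, test_word_analyzer, test_transform_unsigned_categories) take arguments supplied by pytest parametrization whose decorators the listing stripped. A hypothetical reconstruction, with made-up value sets:

# Hypothetical parametrization -- the original value sets are not shown
# on this page; pytest injects each tuple as the ngram_range argument.
@pytest.mark.parametrize("ngram_range", [(1, 1), (1, 2), (2, 3)])
def test_space_ngrams(ngram_range):
    ...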
Example #4
def test_empty_doc_after_limit_features():
    data = ['abc abc def',
            'def abc',
            'ghi']
    data_gpu = Series(data)
    count = CountVectorizer(min_df=2).fit_transform(data_gpu)
    ref = SkCountVect(min_df=2).fit_transform(data)
    cp.testing.assert_array_equal(count.todense(), ref.toarray())
Example #5
def test_only_delimiters():
    data = ['abc def. 123',
            '   ',
            '456 789']
    data_gpu = Series(data)
    res = CountVectorizer().fit_transform(data_gpu)
    ref = SkCountVect().fit_transform(data)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #6
def test_countvectorizer_stop_words_ngrams():
    stop_words_doc = Series(["and me too andy andy too"])
    expected_vocabulary = ["andy andy"]

    v = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    v.fit(stop_words_doc)

    assert expected_vocabulary == v.get_feature_names().to_arrow().to_pylist()
Example #7
def test_single_len():
    single_token_ser = ['S I N G L E T 0 K E N Example', '1 2 3 4 5 eg']
    single_token_gpu = Series(single_token_ser)

    cv = CountVectorizer()
    res = cv.fit_transform(single_token_gpu)
    ref = SkCountVect().fit_transform(single_token_ser)

    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #8
def test_non_ascii():
    non_ascii = ('This is ascii,', 'but not this Αγγλικά.')
    non_ascii_gpu = Series(non_ascii)

    cv = CountVectorizer()
    res = cv.fit_transform(non_ascii_gpu)
    ref = SkCountVect().fit_transform(non_ascii)

    assert 'αγγλικά' in set(cv.get_feature_names().to_arrow().to_pylist())
    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #9
def test_character_ngrams(analyzer, ngram_range):
    data = ['ab c', '', 'edf gh']

    res = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    res.fit(Series(data))

    ref = SkCountVect(analyzer=analyzer, ngram_range=ngram_range).fit(data)

    assert (ref.get_feature_names() ==
            res.get_feature_names().to_arrow().to_pylist())
Example #10
def test_countvectorizer_max_features():
    expected_vocabulary = {'burger', 'beer', 'salad', 'pizza'}
    expected_stop_words = {'celeri', 'tomato', 'copyright', 'coke',
                           'sparkling', 'water', 'the'}

    # test bounded number of extracted features
    vec = CountVectorizer(max_df=0.6, max_features=4)
    vec.fit(DOCS_GPU)
    assert set(vec.get_feature_names().to_arrow().to_pylist()
               ) == expected_vocabulary
    assert set(vec.stop_words_.to_arrow().to_pylist()) == expected_stop_words
Example #11
def test_count_vectorizer():
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]

    res = CountVectorizer().fit_transform(Series(corpus))
    ref = SkCountVect().fit_transform(corpus)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #12
def test_transform_unsigned_categories(query):
    token = 'a'
    thousand_tokens = list()
    for i in range(1000):
        thousand_tokens.append(token)
        token += 'a'
    thousand_tokens[128] = 'science'

    vec = CountVectorizer().fit(Series(thousand_tokens))
    res = vec.transform(query)

    assert res.shape[0] == len(query)
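Here query is another parametrized argument with its decorator stripped; the assertion only checks the row count, so any small list of strings would do. A hypothetical parametrization:

# Hypothetical values -- the fitted vocabulary has 1,000 terms, so the
# transform presumably exercises category codes beyond the signed 8-bit
# range, which is what the test name refers to.
@pytest.mark.parametrize("query", [["science"], ["aaa"], ["science", "aaa"]])
def test_transform_unsigned_categories(query):
    ...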
Example #13
def test_vectorizer_inverse_transform():
    vectorizer = CountVectorizer()
    transformed_data = vectorizer.fit_transform(DOCS_GPU)
    inversed_data = vectorizer.inverse_transform(transformed_data)

    sk_vectorizer = SkCountVect()
    sk_transformed_data = sk_vectorizer.fit_transform(DOCS)
    sk_inversed_data = sk_vectorizer.inverse_transform(sk_transformed_data)

    for doc, sk_doc in zip(inversed_data, sk_inversed_data):
        doc = np.sort(doc.to_arrow().to_pylist())
        sk_doc = np.sort(sk_doc)
        if len(doc) + len(sk_doc) == 0:
            continue
        assert_array_equal(doc, sk_doc)
Example #14
def test_countvectorizer_max_df():
    test_data = Series(['abc', 'dea', 'eat'])
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    vect.fit(test_data)
    assert 'a' in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 6
    assert len(vect.stop_words_) == 0

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert 'a' not in vect.vocabulary_.to_arrow().to_pylist()  # {ae} ignored
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 4    # {bcdt} remain
    assert 'a' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 2

    vect.max_df = 1
    vect.fit(test_data)
    assert 'a' not in vect.vocabulary_.to_arrow().to_pylist()  # {ae} ignored
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 4    # {bcdt} remain
    assert 'a' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 2
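The inline comments above are doing document-frequency arithmetic; the same numbers can be checked in plain Python, independent of any vectorizer:

# Document frequency of each character over ['abc', 'dea', 'eat']:
# 'a' appears in 3 docs, 'e' in 2, and b/c/d/t in 1 each.
docs = ['abc', 'dea', 'eat']
df = {ch: sum(ch in d for d in docs) for ch in set(''.join(docs))}
max_doc_count = 0.5 * len(docs)  # 1.5
assert {ch for ch, n in df.items() if n > max_doc_count} == {'a', 'e'}
assert {ch for ch, n in df.items() if n <= max_doc_count} == {'b', 'c', 'd', 't'}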
Example #15
def test_vectorizer_min_df():
    test_data = Series(['abc', 'dea', 'eat'])
    vect = CountVectorizer(analyzer='char', min_df=1)
    vect.fit(test_data)
    assert 'a' in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()  # {bcdt} ignored
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 2    # {ae} remain
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    # {bcdet} ignored
    assert 'c' not in vect.vocabulary_.to_arrow().to_pylist()
    assert len(vect.vocabulary_.to_arrow().to_pylist()) == 1    # {a} remains
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 5
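min_df mirrors max_df: terms appearing in fewer documents than the threshold become stop words. The same plain-Python check of the counts asserted above:

docs = ['abc', 'dea', 'eat']
df = {ch: sum(ch in d for d in docs) for ch in set(''.join(docs))}
assert {ch for ch, n in df.items() if n >= 2} == {'a', 'e'}           # min_df=2
assert {ch for ch, n in df.items() if n >= 0.8 * len(docs)} == {'a'}  # 2.4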
Example #16
def test_count_binary_occurrences():
    # by default, multiple occurrences of a term are counted individually
    test_data = Series(['aaabc', 'abbde'])
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    assert_array_equal(['a', 'b', 'c', 'd', 'e'],
                       vect.get_feature_names().to_arrow().to_pylist())
    assert_array_equal([[3, 1, 1, 0, 0],
                        [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    X = cp.asnumpy(vect.fit_transform(test_data).todense())
    assert_array_equal([[1, 1, 1, 0, 0],
                        [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer='char', max_df=1.0,
                           binary=True, dtype=cp.float32)
    X = vect.fit_transform(test_data)
    assert X.dtype == cp.float32
Example #17
def test_countvectorizer_separate_fit_transform():
    res = CountVectorizer().fit(DOCS_GPU).transform(DOCS_GPU)
    ref = SkCountVect().fit(DOCS).transform(DOCS)
    cp.testing.assert_array_equal(res.todense(), ref.toarray())
Example #18
def test_countvectorizer_empty_vocabulary():
    v = CountVectorizer(max_df=1.0, stop_words="english")
    # fitting only on stopwords will result in an empty vocabulary
    with pytest.raises(ValueError):
        v.fit(Series(["to be or not to be", "and me too", "and so do you"]))
Example #19
def test_countvectorizer_stop_words():
    ref = SkCountVect(stop_words='english').fit_transform(DOCS)
    X = CountVectorizer(stop_words='english').fit_transform(DOCS_GPU)
    cp.testing.assert_array_equal(X.todense(), ref.toarray())
Example #20
def test_word_analyzer(ngram_range):
    v = CountVectorizer(ngram_range=ngram_range).fit(DOCS_GPU)
    ref = SkCountVect(ngram_range=ngram_range).fit(DOCS)
    assert (
        ref.get_feature_names() == v.get_feature_names().to_arrow().to_pylist()
    )
Example #21
def test_countvectorizer_max_features_counts():
    JUNK_FOOD_DOCS_GPU = Series(JUNK_FOOD_DOCS)

    cv_1 = CountVectorizer(max_features=1)
    cv_3 = CountVectorizer(max_features=3)
    cv_None = CountVectorizer(max_features=None)

    counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS_GPU).sum(axis=0)
    counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS_GPU).sum(axis=0)
    counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS_GPU).sum(axis=0)

    features_1 = cv_1.get_feature_names()
    features_3 = cv_3.get_feature_names()
    features_None = cv_None.get_feature_names()

    # The most common feature is "the", with frequency 7.
    assert 7 == counts_1.max()
    assert 7 == counts_3.max()
    assert 7 == counts_None.max()

    # The most common feature should be the same
    def as_index(x):
        return x.astype(cp.int32).item()
    assert "the" == features_1[as_index(cp.argmax(counts_1))]
    assert "the" == features_3[as_index(cp.argmax(counts_3))]
    assert "the" == features_None[as_index(cp.argmax(counts_None))]