def test_countvectorizer_stop_words_ngrams():
    """Bigrams containing English stop words must be excluded from the vocabulary."""
    docs = Series(["and me too andy andy too"])
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizer.fit(docs)
    # Only the bigram built entirely from non-stop-word tokens survives.
    assert vectorizer.get_feature_names().to_arrow().to_pylist() == ["andy andy"]
def test_character_ngrams(analyzer, ngram_range):
    """Character n-gram features must match scikit-learn's reference vectorizer.

    Parametrized (by the surrounding test suite) over analyzer and ngram_range.
    """
    # NOTE(review): the original literal was ['ab c', '' 'edf gh'] — implicit
    # string concatenation yields 'edf gh', i.e. a TWO-document corpus. Kept
    # identical here; confirm a third empty document was not intended.
    corpus = ['ab c', 'edf gh']
    gpu_vectorizer = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
    gpu_vectorizer.fit(Series(corpus))
    reference = SkCountVect(analyzer=analyzer, ngram_range=ngram_range).fit(corpus)
    gpu_features = gpu_vectorizer.get_feature_names().to_arrow().to_pylist()
    assert reference.get_feature_names() == gpu_features
def test_countvectorizer_max_features():
    """max_features keeps only the top terms; all others land in stop_words_."""
    kept_terms = {'burger', 'beer', 'salad', 'pizza'}
    dropped_terms = {'celeri', 'tomato', 'copyright', 'coke',
                     'sparkling', 'water', 'the'}
    # Bound the number of extracted features to four.
    vectorizer = CountVectorizer(max_df=0.6, max_features=4)
    vectorizer.fit(DOCS_GPU)
    assert set(vectorizer.get_feature_names().to_arrow().to_pylist()) == kept_terms
    assert set(vectorizer.stop_words_.to_arrow().to_pylist()) == dropped_terms
def test_countvectorizer_max_df():
    """Terms above max_df are moved from vocabulary_ into stop_words_.

    max_df may be a float (fraction of documents) or an int (absolute count).
    """
    corpus = Series(['abc', 'dea', 'eat'])

    # max_df=1.0 keeps everything: all six distinct characters are features.
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'a' in vocab
    assert len(vocab) == 6
    assert len(vect.stop_words_) == 0

    # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.max_df = 0.5
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'a' not in vocab  # {ae} ignored
    assert len(vocab) == 4   # {bcdt} remain
    assert 'a' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 2

    # An integer max_df is an absolute document count (here: at most 1 doc).
    vect.max_df = 1
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'a' not in vocab  # {ae} ignored
    assert len(vocab) == 4   # {bcdt} remain
    assert 'a' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 2
def test_vectorizer_min_df():
    """Terms below min_df are moved from vocabulary_ into stop_words_.

    min_df may be an int (absolute document count) or a float (fraction).
    """
    corpus = Series(['abc', 'dea', 'eat'])

    # min_df=1 keeps everything: all six distinct characters are features.
    vect = CountVectorizer(analyzer='char', min_df=1)
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'a' in vocab
    assert len(vocab) == 6
    assert len(vect.stop_words_) == 0

    # Require a term to appear in at least two documents.
    vect.min_df = 2
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'c' not in vocab  # {bcdt} ignored
    assert len(vocab) == 2   # {ae} remain
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 4

    # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.min_df = 0.8
    vect.fit(corpus)
    vocab = vect.vocabulary_.to_arrow().to_pylist()
    assert 'c' not in vocab  # {bcdet} ignored
    assert len(vocab) == 1   # {a} remains
    assert 'c' in vect.stop_words_.to_arrow().to_pylist()
    assert len(vect.stop_words_) == 5
def test_countvectorizer_empty_vocabulary():
    """Fitting only on stop words yields an empty vocabulary and must raise."""
    vectorizer = CountVectorizer(max_df=1.0, stop_words="english")
    stop_word_docs = Series(["to be or not to be", "and me too", "and so do you"])
    with pytest.raises(ValueError):
        vectorizer.fit(stop_word_docs)