def test_ngram_vectorizer_text():
    """Fit-transform on token data yields a sparse matrix whose empty
    document maps to an all-zero row."""
    counts = NgramVectorizer().fit_transform(text_token_data)
    assert scipy.sparse.issparse(counts)
    # Row 1 corresponds to the empty document: it must hold no stored
    # (nonzero) entries at all.
    empty_row = counts[1, :]
    assert empty_row.data.size == 0
def test_ngram_vectorizer_min_doc_freq():
    """min_document_frequency=0.6 prunes tokens appearing in fewer than
    60% of documents, leaving exactly two columns."""
    vec = NgramVectorizer(min_document_frequency=0.6)
    counts = vec.fit_transform(text_token_data_permutation)
    assert counts.shape == (3, 2)
    expected = np.array([[1, 1], [1, 0], [1, 1]])
    assert np.all(counts.toarray() == expected)
def test_ngram_vectorizer_max_doc():
    """max_document_occurrences=1 keeps only tokens occurring in at most
    one document, leaving exactly two columns."""
    vec = NgramVectorizer(max_document_occurrences=1)
    counts = vec.fit_transform(text_token_data_permutation)
    assert counts.shape == (3, 2)
    expected = np.array([[0, 0], [1, 0], [0, 1]])
    assert np.all(counts.toarray() == expected)