def test_docvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, fit_unique, test_text_info
):
    """Exercise DocVectorizer across the parametrized pipeline options.

    Verifies that transform on the training corpus reproduces fit_transform,
    that tokenization is per-document, and that output shape / learned
    vocabulary match expectations for the known example corpus.
    """
    corpus, vocabulary = test_text_info
    pipeline = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )
    fitted = pipeline.fit_transform(corpus)
    assert pipeline.tokenizer_.tokenize_by == "document"
    # Re-transforming the training corpus must agree with fit_transform.
    retransformed = pipeline.transform(corpus)
    assert np.allclose(fitted.toarray(), retransformed.toarray())
    if corpus == test_text_example:
        # Known example corpus: shapes are fixed per vectorizer flavour.
        if vectorizer == "bow":
            assert fitted.shape == (5, 7)
        if vectorizer == "bigram":
            assert fitted.shape == (5, 19)
    else:
        assert fitted.shape[0] == len(corpus)
    if token_contractor is None and vectorizer == "bow":
        # Without contraction, every learned column label must come from
        # the (lower-cased) fixture vocabulary.
        learned = set(pipeline.column_label_dictionary_.keys())
        expected = {term.lower() for term in vocabulary} | {" "}
        note(learned.difference(expected))
        assert learned.issubset(expected)
def test_featurebasisconverter_tokenized():
    """Change the basis of a document representation via a tokenized word vectorizer."""
    basis_converter = FeatureBasisConverter(word_vectorizer="tokenized", n_components=3)
    basis_converter.fit(test_text_token_data)
    doc_model = DocVectorizer(tokenizer=None, token_contractor=None)
    representation = doc_model.fit_transform(test_text_token_data)
    converted = basis_converter.change_basis(
        representation, doc_model.column_index_dictionary_
    )
    # Seven documents projected into the 3-component basis.
    assert converted.shape == (7, 3)
def test_docvectorizer_todataframe(test_text_info):
    """to_DataFrame yields one row per document and vocab-bounded columns."""
    corpus, vocabulary = test_text_info
    frame = DocVectorizer().fit(corpus).to_DataFrame()
    if corpus == test_text_example:
        # The example corpus has a fixed, known shape.
        assert frame.shape == (5, 7)
    else:
        assert frame.shape[0] == len(corpus)
        assert frame.shape[1] <= len(vocabulary)
def test_docvectorizer_vocabulary(test_text_info):
    """Fitting with a fixed token_dictionary restricts columns to that vocabulary."""
    corpus, _vocabulary = test_text_info
    if corpus == test_text_example:
        fixed_vocab = ["foo", "bar"]
    else:
        # Fall back to the first two tokens of the first document.
        fixed_vocab = corpus[0].split()[:2]
    note(fixed_vocab)
    model = DocVectorizer(token_dictionary=fixed_vocab)
    matrix = model.fit_transform(corpus)
    # Exactly one column per dictionary token, one row per document.
    assert matrix.shape == (len(corpus), 2)
    assert model.token_dictionary == fixed_vocab
def test_docvectorizer_unique():
    """fit_unique=True with min_score=25 must raise; fit_unique=False must not.

    With fit_unique=True, fitting this corpus under the given contractor
    settings is expected to raise ValueError.  With fit_unique=False the fit
    succeeds and the contracted token "foo_bar" appears in the learned columns.
    """
    # Bug fix: the original placed an assert inside the pytest.raises block,
    # *after* the statement expected to raise — it could never execute, so it
    # was dead code and has been removed.
    with pytest.raises(ValueError):
        DocVectorizer(
            token_contractor_kwds={"min_score": 25}, fit_unique=True
        ).fit(test_text_example)
    model_duplicates = DocVectorizer(
        token_contractor_kwds={"min_score": 25}, fit_unique=False
    ).fit(test_text_example)
    assert "foo_bar" in model_duplicates.column_label_dictionary_
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer, normalize):
    """Parametrized smoke test: transform must reproduce fit_transform exactly."""
    pipeline = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
    )
    fitted = pipeline.fit_transform(test_text)
    assert pipeline.tokenizer_.tokenize_by == "document"
    retransformed = pipeline.transform(test_text)
    # Sparse-matrix equality: no entries may differ between the two results.
    assert (fitted != retransformed).nnz == 0
    if vectorizer == "bow":
        assert fitted.shape == (5, 7)
    if vectorizer == "bigram":
        assert fitted.shape == (5, 19)
def test_docvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, fit_unique
):
    """Parametrized smoke test including the fit_unique option."""
    pipeline = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )
    fitted = pipeline.fit_transform(test_text)
    assert pipeline.tokenizer_.tokenize_by == "document"
    # transform on the training corpus must agree with fit_transform.
    retransformed = pipeline.transform(test_text)
    assert np.allclose(fitted.toarray(), retransformed.toarray())
    if vectorizer == "bow":
        assert fitted.shape == (5, 7)
    if vectorizer == "bigram":
        assert fitted.shape == (5, 19)
def test_docvectorizer_todataframe():
    """A default DocVectorizer on the test corpus yields a 5x7 DataFrame."""
    frame = DocVectorizer().fit(test_text).to_DataFrame()
    assert frame.shape == (5, 7)
def test_docvectorizer_tokenizer():
    """Passing a tokenizer *class* (not an instance) must raise TypeError."""
    # Should raise an error if the tokenizer is not instantiated
    with pytest.raises(TypeError):
        DocVectorizer(tokenizer=NLTKTokenizer)
def test_spacy_tokenizer():
    """Smoke test: DocVectorizer accepts an instantiated SpaCyTokenizer.

    Fit must complete without raising; the fitted model is not inspected.
    """
    tokenizer = SpaCyTokenizer()
    vectorizer = DocVectorizer(tokenizer=tokenizer)
    # Fix: dropped the unused `result` binding — the return value was never used.
    vectorizer.fit(test_text)
def test_docvectorizer_basic():
    """Smoke test: a default DocVectorizer fits the test corpus without error."""
    vectorizer = DocVectorizer()
    # Fix: dropped the unused `result` binding — the return value was never used.
    vectorizer.fit(test_text)
def test_docvectorizer_vocabulary():
    """A fixed two-token dictionary yields exactly two columns over five docs."""
    model = DocVectorizer(token_dictionary=["foo", "bar"])
    matrix = model.fit_transform(test_text)
    assert matrix.shape == (5, 2)