# NOTE: the import paths below are assumed from context and may need adjusting
# to the actual package layout. The sample corpora (test_text_example,
# test_text_token_data) and the parametrization supplying the test arguments
# (e.g. via @pytest.mark.parametrize or fixtures) are defined elsewhere in the
# suite and are not reproduced here.
import numpy as np

from hypothesis import note

from textmap.vectorizers import DocVectorizer
from textmap.transformers import FeatureBasisConverter


def test_docvectorizer_basic(
    tokenizer, token_contractor, vectorizer, normalize, fit_unique, test_text_info
):
    test_text, vocabulary = test_text_info
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )
    result = model.fit_transform(test_text)
    # Document-level tokenization should be selected automatically.
    assert model.tokenizer_.tokenize_by == "document"
    # transform on the training text must reproduce fit_transform's output.
    transform = model.transform(test_text)
    assert np.allclose(result.toarray(), transform.toarray())
    if test_text == test_text_example:
        if vectorizer == "bow":
            assert result.shape == (5, 7)
        if vectorizer == "bigram":
            assert result.shape == (5, 19)
    else:
        assert result.shape[0] == len(test_text)
    if (token_contractor is None) and (vectorizer == "bow"):
        # Without contraction, the learned vocabulary must be a subset of the
        # lower-cased source vocabulary (plus the space token).
        output_vocab = set(model.column_label_dictionary_.keys())
        lower_vocabulary = set([x.lower() for x in vocabulary] + [" "])
        note(output_vocab.difference(lower_vocabulary))
        assert output_vocab.issubset(lower_vocabulary)

def test_featurebasisconverter_tokenized():
    # Learn a 3-component word basis from pre-tokenized text, then project a
    # document representation into that basis.
    converter = FeatureBasisConverter(word_vectorizer="tokenized", n_components=3)
    converter.fit(test_text_token_data)
    doc_vectorizer = DocVectorizer(tokenizer=None, token_contractor=None)
    doc_rep = doc_vectorizer.fit_transform(test_text_token_data)
    new_rep = converter.change_basis(doc_rep, doc_vectorizer.column_index_dictionary_)
    assert new_rep.shape == (7, 3)

def test_docvectorizer_vocabulary(test_text_info):
    test_text, vocabulary = test_text_info
    # Use a fixed two-token dictionary: known tokens for the example corpus,
    # otherwise the first two tokens of the first document.
    if test_text == test_text_example:
        vocab = ["foo", "bar"]
    else:
        vocab = test_text[0].split()[:2]
    note(vocab)
    model = DocVectorizer(token_dictionary=vocab)
    results = model.fit_transform(test_text)
    # A fixed dictionary of two tokens yields exactly two columns.
    assert results.shape == (len(test_text), 2)
    assert model.token_dictionary == vocab

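
# A minimal standalone smoke test, added for illustration: the corpus here is
# hypothetical, and the check relies only on behaviour asserted above (a fixed
# token_dictionary of k tokens yields an (n_documents, k) matrix).
def test_docvectorizer_fixed_vocabulary_sketch():
    docs = ["foo bar baz", "bar bar foo", "baz qux"]
    model = DocVectorizer(token_dictionary=["foo", "bar"])
    result = model.fit_transform(docs)
    assert result.shape == (3, 2)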