示例#1
0
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize, fit_unique, test_text_info):
    """Fit a DocVectorizer and check transform consistency plus output shape.

    Also verifies, for bag-of-words with no token contractor, that every
    learned column label comes from the (lower-cased) reference vocabulary.
    """
    test_text, vocabulary = test_text_info
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )

    fitted = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    # transform() after fit must reproduce fit_transform() exactly
    transformed = model.transform(test_text)
    assert np.allclose(fitted.toarray(), transformed.toarray())
    if test_text == test_text_example:
        if vectorizer == "bow":
            assert fitted.shape == (5, 7)
        if vectorizer == "bigram":
            assert fitted.shape == (5, 19)
    else:
        assert fitted.shape[0] == len(test_text)
        if token_contractor is None and vectorizer == "bow":
            learned_tokens = set(model.column_label_dictionary_)
            # lower-cased reference vocabulary, plus the space token
            expected_tokens = {word.lower() for word in vocabulary} | {" "}
            note(learned_tokens.difference(expected_tokens))
            assert learned_tokens.issubset(expected_tokens)
示例#2
0
def test_featurebasisconverter_tokenized():
    """A tokenized FeatureBasisConverter projects documents into 3 components."""
    converter = FeatureBasisConverter(word_vectorizer="tokenized",
                                      n_components=3)
    converter.fit(test_text_token_data)
    # Build a document representation in the original token basis.
    vectorizer = DocVectorizer(tokenizer=None, token_contractor=None)
    base_rep = vectorizer.fit_transform(test_text_token_data)
    projected = converter.change_basis(base_rep,
                                       vectorizer.column_index_dictionary_)
    assert projected.shape == (7, 3)
示例#3
0
def test_docvectorizer_vocabulary(test_text_info):
    """A user-supplied token_dictionary restricts the output columns to it."""
    test_text, vocabulary = test_text_info
    if test_text == test_text_example:
        restricted_vocab = ["foo", "bar"]
    else:
        # First two whitespace-separated tokens of the first document.
        restricted_vocab = test_text[0].split()[:2]
        note(restricted_vocab)
    model = DocVectorizer(token_dictionary=restricted_vocab)
    matrix = model.fit_transform(test_text)
    assert matrix.shape == (len(test_text), 2)
    assert model.token_dictionary == restricted_vocab
示例#4
0
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize):
    """Fit/transform agreement and expected shapes on the example corpus."""
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
    )
    fitted = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    round_trip = model.transform(test_text)
    # Sparse equality: no entry may differ between fit_transform and transform.
    assert (fitted != round_trip).nnz == 0
    expected_shapes = {"bow": (5, 7), "bigram": (5, 19)}
    if vectorizer in expected_shapes:
        assert fitted.shape == expected_shapes[vectorizer]
示例#5
0
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize, fit_unique):
    """DocVectorizer fit_transform matches transform and has expected shapes."""
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )

    fit_output = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    transform_output = model.transform(test_text)
    # Dense comparison tolerates floating-point normalization differences.
    assert np.allclose(fit_output.toarray(), transform_output.toarray())
    shape_by_vectorizer = {"bow": (5, 7), "bigram": (5, 19)}
    if vectorizer in shape_by_vectorizer:
        assert fit_output.shape == shape_by_vectorizer[vectorizer]
示例#6
0
def test_docvectorizer_vocabulary():
    """A fixed two-token dictionary yields exactly two output columns."""
    two_word_model = DocVectorizer(token_dictionary=["foo", "bar"])
    vectors = two_word_model.fit_transform(test_text)
    assert vectors.shape == (5, 2)