Example #1
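A parametrized round trip over tokenizer, token contractor, vectorizer, normalization and fit_unique options: fit_transform must agree with a subsequent transform, tokenization must happen at the document level, and the output shape and learned vocabulary are checked against the input corpus.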
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize, fit_unique, test_text_info):
    test_text, vocabulary = test_text_info
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )

    result = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    transform = model.transform(test_text)
    assert np.allclose(result.toarray(), transform.toarray())
    if test_text == test_text_example:
        if vectorizer == "bow":
            assert result.shape == (5, 7)
        if vectorizer == "bigram":
            assert result.shape == (5, 19)
    else:
        assert result.shape[0] == len(test_text)
        if (token_contractor is None) and (vectorizer == "bow"):
            output_vocab = set(model.column_label_dictionary_.keys())
            lower_vocabulary = set([x.lower() for x in vocabulary] + [" "])
            note(output_vocab.difference(lower_vocabulary))
            assert output_vocab.issubset(lower_vocabulary)
Example #2
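Fits a FeatureBasisConverter on pre-tokenized data, then uses change_basis together with the DocVectorizer's column_index_dictionary_ to re-express the document matrix in a 3-component basis.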
def test_featurebasisconverter_tokenized():
    converter = FeatureBasisConverter(word_vectorizer="tokenized",
                                      n_components=3)
    converter.fit(test_text_token_data)
    doc_vectorizer = DocVectorizer(tokenizer=None, token_contractor=None)
    doc_rep = doc_vectorizer.fit_transform(test_text_token_data)
    new_rep = converter.change_basis(doc_rep,
                                     doc_vectorizer.column_index_dictionary_)
    assert new_rep.shape == (7, 3)
Example #3
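Checks that to_DataFrame produces one row per document and no more columns than there are vocabulary entries.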
def test_docvectorizer_todataframe(test_text_info):
    test_text, vocabulary = test_text_info
    model = DocVectorizer().fit(test_text)
    df = model.to_DataFrame()
    if test_text == test_text_example:
        assert df.shape == (5, 7)
    else:
        assert df.shape[0] == len(test_text)
        assert df.shape[1] <= len(vocabulary)
Example #4
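Fits with a fixed two-token dictionary and verifies that the output has exactly two columns and that the dictionary is stored on the model unchanged.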
def test_docvectorizer_vocabulary(test_text_info):
    test_text, vocabulary = test_text_info
    if test_text == test_text_example:
        vocab = ["foo", "bar"]
    else:
        vocab = test_text[0].split()[:2]
        note(vocab)
    model = DocVectorizer(token_dictionary=vocab)
    results = model.fit_transform(test_text)
    assert results.shape == (len(test_text), 2)
    assert model.token_dictionary == vocab
Example #5
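Contrasts fit_unique=True and fit_unique=False: deduplicating the corpus before fitting determines whether the token contractor merges the frequent pair into a single foo_bar token.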
def test_docvectorizer_unique():
    model_unique = DocVectorizer(
        token_contractor_kwds={"min_score": 25}, fit_unique=True
    ).fit(test_text_example)
    # With fit_unique=True the corpus is deduplicated before fitting, so the
    # pair never clears min_score and "foo_bar" is not contracted.
    assert "foo_bar" not in model_unique.column_label_dictionary_
    model_duplicates = DocVectorizer(
        token_contractor_kwds={"min_score": 25}, fit_unique=False
    ).fit(test_text_example)
    # With duplicates retained the pair is frequent enough to be merged.
    assert "foo_bar" in model_duplicates.column_label_dictionary_
Example #6
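A simpler variant of the parametrized round-trip test without the fit_unique option; equality of the two sparse results is checked via the nonzero count of their difference.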
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize):
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
    )
    result = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    transform = model.transform(test_text)
    assert (result != transform).nnz == 0
    if vectorizer == "bow":
        assert result.shape == (5, 7)
    if vectorizer == "bigram":
        assert result.shape == (5, 19)
Example #7
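The same round-trip test with fit_unique, run against the bundled five-document corpus and its expected bag-of-words and bigram shapes.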
def test_docvectorizer_basic(tokenizer, token_contractor, vectorizer,
                             normalize, fit_unique):
    model = DocVectorizer(
        tokenizer=tokenizer,
        token_contractor=token_contractor,
        vectorizer=vectorizer,
        normalize=normalize,
        fit_unique=fit_unique,
    )

    result = model.fit_transform(test_text)
    assert model.tokenizer_.tokenize_by == "document"
    transform = model.transform(test_text)
    assert np.allclose(result.toarray(), transform.toarray())
    if vectorizer == "bow":
        assert result.shape == (5, 7)
    if vectorizer == "bigram":
        assert result.shape == (5, 19)
Example #8
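A minimal to_DataFrame check against the expected (5, 7) shape.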
def test_docvectorizer_todataframe():
    model = DocVectorizer().fit(test_text)
    df = model.to_DataFrame()
    assert df.shape == (5, 7)
Example #9
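Passing a tokenizer class rather than an instance must raise a TypeError.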
def test_docvectorizer_tokenizer():
    # Should raise an error if the tokenizer is not instantiated
    with pytest.raises(TypeError):
        DocVectorizer(tokenizer=NLTKTokenizer)
Example #10
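A smoke test: DocVectorizer should accept an already instantiated SpaCyTokenizer and fit without raising.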
def test_spacy_tokenizer():
    tokenizer = SpaCyTokenizer()
    vectorizer = DocVectorizer(tokenizer=tokenizer)
    result = vectorizer.fit(test_text)
Example #11
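A smoke test: fitting with all defaults should complete without raising.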
def test_docvectorizer_basic():
    vectorizer = DocVectorizer()
    result = vectorizer.fit(test_text)
Example #12
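A fixed-vocabulary variant with a hard-coded two-token dictionary and corpus size.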
def test_docvectorizer_vocabulary():
    model = DocVectorizer(token_dictionary=["foo", "bar"])
    results = model.fit_transform(test_text)
    assert results.shape == (5, 2)
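All of the snippets above assume a shared test harness that the listing does not show. The sketch below is a minimal, hypothetical reconstruction of that setup: the import path, the fixture corpus and its token counts are assumptions chosen to satisfy the (5, 7) bag-of-words assertion, not part of the original suite.

# Hypothetical shared setup for the snippets above; the import path and
# fixture values are assumptions, not part of the original test suite.
import numpy as np
import pytest
from hypothesis import note  # used by the parametrized examples

from textmap import DocVectorizer  # assumed import path; adjust to your install
# FeatureBasisConverter, NLTKTokenizer and SpaCyTokenizer are assumed to be
# importable from the same package.

# Hypothetical five-document corpus with seven distinct lower-cased tokens,
# matching the (5, 7) bag-of-words shape asserted in the examples.
test_text = [
    "foo bar pok",
    "foo bar wer",
    "pok wer foo",
    "bar pok zil",
    "foo zil qux nib",
]
test_text_example = test_text  # the bundled corpus the tests compare against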