Example #1
def test_ngram_vectorizer_basic():
    vectorizer = NgramVectorizer()
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
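    # Transforming the same data after fitting should reproduce the fitted counts and column layout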
    transform_result = vectorizer.transform(token_data)
    assert np.all(transform_result.data == result.data)
    assert np.all(transform_result.tocoo().col == result.tocoo().col)
Example #2
def test_bpe_tokens_ngram_matches():
    bpe1 = BytePairEncodingVectorizer(return_type="matrix")
    bpe2 = BytePairEncodingVectorizer(return_type="tokens")

    result1 = bpe1.fit_transform(raw_string_data)
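    # Decode each BPE code into its unicode token string so the NgramVectorizer columns line up with bpe1's matrix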
    token_dictionary = {
        to_unicode(code, bpe1.tokens_, bpe1.max_char_code_): n
        for code, n in bpe1.column_label_dictionary_.items()
    }

    tokens = bpe2.fit_transform(raw_string_data)
    result2 = NgramVectorizer(token_dictionary=token_dictionary).fit_transform(tokens)

    assert np.allclose(result1.toarray(), result2.toarray())
Example #3
    def __init__(
            self,
            tokenizer=NLTKTokenizer(),
            ngram_vectorizer=NgramVectorizer(ngram_size=1),
            info_weight_transformer=InformationWeightTransformer(),
            remove_effects_transformer=RemoveEffectsTransformer(),
    ):
        """
        A class for converting documents into a fixed width representation.  Useful for
        comparing documents with each other.
        This is done via:
        1) Tokenization, which defaults to NLTK but can also use stanza, spacy, or a custom tokenizer.
        2) Converting this sequence of tokens into counts of n-grams (default 1-grams).
        3) Re-weighting counts based on how informative the presence of an n-gram is within a document.
        4) Building a low rank model of how often we'd expect a completely random n-gram to occur in your text
            and correcting for this effect.

        Parameters
        ----------
        tokenizer = textmap.tokenizers.BaseTokenizer (default NLTKTokenizer)
            Takes an instantiation of a class that inherits from BaseTokenizer.
            These are classes which take documents and parse them into individual tokens,
            then optionally contract frequently co-occurring tokens together into a single
            token.
            Examples of such tokenizers can be found in textmap.tokenizers and include:
            1) NLTKTokenizer
            2) NLTKTweetTokenizer
            3) SKLearnTokenizer
            4) StanzaTokenizer
            5) SpaCyTokenizer
            
        ngram_vectorizer = vectorizer.NgramVectorizer (default NgramVectorizer(ngram_size=1))
            Takes an instance of a class which turns sequences of sequences of tokens into a
            fixed width representation by counting the occurrence of n-grams.
            In the default case this simply counts the number of occurrences of each token.
            This class returns a document by n-gram sparse matrix of counts.
            
        info_weight_transformer = textmap.transformers.InformationWeightTransformer (default InformationWeightTransformer())
            Takes an instance of a class which re-weights the counts in a sparse matrix.
            It does this by building a low rank model of the probability of a word being contained
            in any document, converting that into information by applying a log and scaling our
            counts by this value.
            If this is set to None, this step is skipped in the pipeline.
            
        remove_effects_transformer = textmap.transformers.RemoveEffectsTransformer (default RemoveEffectsTransformer())
            Takes an instance of a class which builds a low rank model of how often we'd expect a completely
            random word to occur in your text and corrects for this effect.
            If this is set to None, this step is skipped in the pipeline.
        """
        self.tokenizer = tokenizer
        self.ngram_vectorizer = ngram_vectorizer
        # These are more minor.  I'd be willing to default them to a string to clean
        # up the docstring help.
        self.info_weight_transformer = info_weight_transformer
        self.remove_effects_transformer = remove_effects_transformer
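
The constructor above wires together the four pipeline stages described in its docstring: tokenization, n-gram counting, information weighting, and effect removal. Below is a minimal usage sketch, not taken from the examples on this page: the enclosing class name (DocVectorizer), the exact import paths, and the fit_transform call are assumptions inferred from the docstring and the usual sklearn-style fit/transform convention.

# Minimal usage sketch; DocVectorizer and the import paths below are assumed,
# following the module names mentioned in the docstring above.
from textmap import DocVectorizer                      # assumed export
from textmap.tokenizers import NLTKTokenizer
from textmap.transformers import InformationWeightTransformer, RemoveEffectsTransformer
from vectorizers import NgramVectorizer                # assumed import path

docs = [
    "the quick brown fox jumps over the lazy dog",
    "a quick brown dog",
]

# Pipeline: tokenize -> count 1-grams -> weight by informativeness -> remove random-occurrence effects.
vectorizer = DocVectorizer(
    tokenizer=NLTKTokenizer(),
    ngram_vectorizer=NgramVectorizer(ngram_size=1),
    info_weight_transformer=InformationWeightTransformer(),
    remove_effects_transformer=RemoveEffectsTransformer(),  # pass None to skip this stage
)

doc_matrix = vectorizer.fit_transform(docs)  # sparse (n_documents, n_ngrams) matrix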
Example #4
def test_summarize_embedding_list(dense, include_values):
    vect = NgramVectorizer()
    weight_matrix = vect.fit_transform(text_token_data)
    if dense:
        weight_matrix = weight_matrix.todense()
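    # Summarize each document row by its highest-weight tokens, optionally including the weights themselves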
    summary = summarize_embedding(
        weight_matrix, vect.column_index_dictionary_, include_values=include_values
    )
    expected_result = (
        [
            ["foo", "wer", "pok"],
            [],
            ["bar", "foo", "wer"],
            ["wer", "foo", "bar"],
            ["bar", "foo", "wer"],
            ["wer", "pok", "foo"],
            ["wer", "foo", "pok"],
        ],
        [
            [2.0, 1.0, 1.0],
            [],
            [4.0, 3.0, 2.0],
            [2.0, 2.0, 2.0],
            [4.0, 3.0, 2.0],
            [3.0, 3.0, 3.0],
            [4.0, 4.0, 2.0],
        ],
    )

    if include_values:
        if dense:
            assert summary[0][2:7] == expected_result[0][2:7]
            assert summary[1][2:7] == expected_result[1][2:7]
        else:
            assert summary == expected_result
    else:
        if dense:
            assert summary[2:7] == expected_result[0][2:7]
        else:
            assert summary == expected_result[0]
Example #5
def test_summarize_embedding_string(dense, include_values):
    vect = NgramVectorizer()
    weight_matrix = vect.fit_transform(text_token_data)
    if dense:
        weight_matrix = weight_matrix.todense()
    summary = summarize_embedding(
        weight_matrix,
        vect.column_index_dictionary_,
        k=2,
        return_type="string",
        include_values=include_values,
    )
    if include_values:
        expected_result = [
            "foo:2.0,wer:1.0",
            "",
            "bar:4.0,foo:3.0",
            "wer:2.0,foo:2.0",
            "bar:4.0,foo:3.0",
            "wer:3.0,pok:3.0",
            "wer:4.0,foo:4.0",
        ]
    else:
        expected_result = [
            "foo,wer",
            "",
            "bar,foo",
            "wer,foo",
            "bar,foo",
            "wer,pok",
            "wer,foo",
        ]
    if dense:
        assert summary[2:7] == expected_result[2:7]
    else:
        assert summary == expected_result
Example #6
def test_ngram_vectorizer_max_doc_freq():
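    # Tokens that appear in more than 40% of documents should be pruned from the vocabulary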
    vectorizer = NgramVectorizer(max_document_frequency=0.4)
    count_matrix = vectorizer.fit_transform(text_token_data_permutation)
    assert count_matrix.shape == (3, 2)
    assert np.all(count_matrix.toarray() == np.array([[0, 0], [1, 0], [0, 1]]))
Example #7
def test_ngram_vectorizer_min_doc():
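    # Only tokens that occur in at least 2 documents should be kept in the vocabulary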
    vectorizer = NgramVectorizer(min_document_occurrences=2)
    count_matrix = vectorizer.fit_transform(text_token_data_permutation)
    assert count_matrix.shape == (3, 2)
    assert np.all(count_matrix.toarray() == np.array([[1, 1], [1, 0], [1, 1]]))
Example #8
def test_ngram_vectorizer_text():
    vectorizer = NgramVectorizer(ngram_size=2, ngram_behaviour='subgrams')
    result = vectorizer.fit_transform(text_token_data)
    assert scipy.sparse.issparse(result)
    # Ensure that the empty document has an all zero row
    assert len((result[1, :]).data) == 0