def test_ngram_vectorizer_basic():
    vectorizer = NgramVectorizer()
    result = vectorizer.fit_transform(token_data)
    assert scipy.sparse.issparse(result)
    transform_result = vectorizer.transform(token_data)
    assert np.all(transform_result.data == result.data)
    assert np.all(transform_result.tocoo().col == result.tocoo().col)
def test_bpe_tokens_ngram_matches():
    bpe1 = BytePairEncodingVectorizer(return_type="matrix")
    bpe2 = BytePairEncodingVectorizer(return_type="tokens")
    result1 = bpe1.fit_transform(raw_string_data)
    token_dictionary = {
        to_unicode(code, bpe1.tokens_, bpe1.max_char_code_): n
        for code, n in bpe1.column_label_dictionary_.items()
    }
    tokens = bpe2.fit_transform(raw_string_data)
    result2 = NgramVectorizer(token_dictionary=token_dictionary).fit_transform(tokens)
    assert np.allclose(result1.toarray(), result2.toarray())
def __init__(
    self,
    tokenizer=NLTKTokenizer(),
    ngram_vectorizer=NgramVectorizer(ngram_size=1),
    info_weight_transformer=InformationWeightTransformer(),
    remove_effects_transformer=RemoveEffectsTransformer(),
):
    """
    A class for converting documents into a fixed width representation. Useful for
    comparing documents with each other.

    This is done via:
    1) Tokenization, which defaults to NLTK but can use stanza, spacy or a custom tokenizer.
    2) Converting this sequence of tokens into counts of n-grams (default 1-grams).
    3) Re-weighting counts based on how informative the presence of an n-gram is within a document.
    4) Building a low rank model of how often we'd expect a completely random n-gram to occur
       in your text and correcting for this effect.

    Parameters
    ----------
    tokenizer = textmap.tokenizers.BaseTokenizer (default NLTKTokenizer)
        Takes an instantiation of a class that inherits from BaseTokenizer.
        These are classes which take documents and parse them into individual tokens,
        then optionally contract frequently co-occurring tokens together into a single token.
        Examples of such tokenizers can be found in textmap.tokenizers and include:
        1) NLTKTokenizer
        2) NLTKTweetTokenizer
        3) SKLearnTokenizer
        4) StanzaTokenizer
        5) SpaCyTokenizer

    ngram_vectorizer = vectorizer.NgramVectorizer (default NgramVectorizer(ngram_size=1))
        Takes an instance of a class which turns sequences of sequences of tokens into a
        fixed width representation by counting the occurrences of n-grams.
        In the default case this simply counts the number of occurrences of each token.
        This class returns a documents by n-grams sparse matrix of counts.

    info_weight_transformer = textmap.transformers.InformationWeightTransformer (default InformationWeightTransformer())
        Takes an instance of a class which re-weights the counts in a sparse matrix.
        It does this by building a low rank model of the probability of a word being
        contained in any document, converting that into information by applying a log,
        and scaling our counts by this value.
        If this is set to None this step is skipped in the pipeline.

    remove_effects_transformer = textmap.transformers.RemoveEffectsTransformer (default RemoveEffectsTransformer())
        Takes an instance of a class which builds a low rank model of how often we'd
        expect a completely random word to occur in your text and corrects for this effect.
        If this is set to None this step is skipped in the pipeline.
    """
    self.tokenizer = tokenizer
    self.ngram_vectorizer = ngram_vectorizer
    # These are more minor. I'd be willing to default them to a string to clean
    # up the docstring help.
    self.info_weight_transformer = info_weight_transformer
    self.remove_effects_transformer = remove_effects_transformer
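# A minimal usage sketch of the pipeline described in the docstring above.
# "DocVectorizer" is a placeholder for whatever class this __init__ belongs to
# (the class name is not shown in this excerpt), "documents" is a hypothetical
# list of raw text strings, and the fit_transform call assumes the class follows
# the usual scikit-learn fit/transform convention.
#
# vectorizer = DocVectorizer(
#     tokenizer=NLTKTokenizer(),
#     ngram_vectorizer=NgramVectorizer(ngram_size=2),
#     info_weight_transformer=None,      # set to None to skip the re-weighting step
#     remove_effects_transformer=None,   # set to None to skip the effect-removal step
# )
# doc_matrix = vectorizer.fit_transform(documents)  # documents-by-n-grams representation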
def test_summarize_embedding_list(dense, include_values):
    vect = NgramVectorizer()
    weight_matrix = vect.fit_transform(text_token_data)
    if dense:
        weight_matrix = weight_matrix.todense()
    summary = summarize_embedding(
        weight_matrix, vect.column_index_dictionary_, include_values=include_values
    )
    expected_result = (
        [
            ["foo", "wer", "pok"],
            [],
            ["bar", "foo", "wer"],
            ["wer", "foo", "bar"],
            ["bar", "foo", "wer"],
            ["wer", "pok", "foo"],
            ["wer", "foo", "pok"],
        ],
        [
            [2.0, 1.0, 1.0],
            [],
            [4.0, 3.0, 2.0],
            [2.0, 2.0, 2.0],
            [4.0, 3.0, 2.0],
            [3.0, 3.0, 3.0],
            [4.0, 4.0, 2.0],
        ],
    )
    if include_values:
        if dense:
            assert summary[0][2:7] == expected_result[0][2:7]
            assert summary[1][2:7] == expected_result[1][2:7]
        else:
            assert summary == expected_result
    else:
        if dense:
            assert summary[2:7] == expected_result[0][2:7]
        else:
            assert summary == expected_result[0]
def test_summarize_embedding_string(dense, include_values):
    vect = NgramVectorizer()
    weight_matrix = vect.fit_transform(text_token_data)
    if dense:
        weight_matrix = weight_matrix.todense()
    summary = summarize_embedding(
        weight_matrix,
        vect.column_index_dictionary_,
        k=2,
        return_type="string",
        include_values=include_values,
    )
    if include_values:
        expected_result = [
            "foo:2.0,wer:1.0",
            "",
            "bar:4.0,foo:3.0",
            "wer:2.0,foo:2.0",
            "bar:4.0,foo:3.0",
            "wer:3.0,pok:3.0",
            "wer:4.0,foo:4.0",
        ]
    else:
        expected_result = [
            "foo,wer",
            "",
            "bar,foo",
            "wer,foo",
            "bar,foo",
            "wer,pok",
            "wer,foo",
        ]
    if dense:
        assert summary[2:7] == expected_result[2:7]
    else:
        assert summary == expected_result
def test_ngram_vectorizer_max_doc_freq():
    vectorizer = NgramVectorizer(max_document_frequency=0.4)
    count_matrix = vectorizer.fit_transform(text_token_data_permutation)
    assert count_matrix.shape == (3, 2)
    assert np.all(count_matrix.toarray() == np.array([[0, 0], [1, 0], [0, 1]]))
def test_ngram_vectorizer_min_doc():
    vectorizer = NgramVectorizer(min_document_occurrences=2)
    count_matrix = vectorizer.fit_transform(text_token_data_permutation)
    assert count_matrix.shape == (3, 2)
    assert np.all(count_matrix.toarray() == np.array([[1, 1], [1, 0], [1, 1]]))
def test_ngram_vectorizer_text():
    vectorizer = NgramVectorizer(ngram_size=2, ngram_behaviour="subgrams")
    result = vectorizer.fit_transform(text_token_data)
    assert scipy.sparse.issparse(result)
    # Ensure that the empty document has an all zero row
    assert len((result[1, :]).data) == 0