def test_token_cooccurrence_vectorizer_text():
    """Fit/transform round-trips on text token data yield identical sparse matrices.

    Also spot-checks two co-occurrence counts for a radius-1 "after" window.
    """
    vec = TokenCooccurrenceVectorizer()
    fitted = vec.fit_transform(text_token_data)
    assert scipy.sparse.issparse(fitted)
    refit = vec.transform(text_token_data)
    # Identical sparse matrices have no differing entries.
    assert (fitted != refit).nnz == 0

    vec = TokenCooccurrenceVectorizer(window_radius=1, window_orientation="after")
    fitted = vec.fit_transform(text_token_data)
    refit = vec.transform(text_token_data)
    assert (fitted != refit).nnz == 0
    # Expected counts for the radius-1 trailing window on this corpus.
    assert fitted[1, 2] == 8
    assert fitted[0, 1] == 6
def test_equality_of_CooccurrenceVectorizers(
    min_token_occurrences,
    max_token_occurrences,
    min_document_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
):
    """Tree-based and sequence-based vectorizers agree on equivalent inputs.

    Builds a LabelledTreeCooccurrenceVectorizer and a TokenCooccurrenceVectorizer
    with matching hyperparameters, then checks that fit_transform and transform
    produce numerically identical dense matrices across both models.
    """
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_tree_frequency=max_document_frequency,
        min_tree_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    # Call order matters: fit_transform refits the model each time, so the
    # sequence of fits/transforms below mirrors the original exactly.
    tree_fitted = tree_model.fit_transform(seq_tree_sequence).toarray()
    seq_fitted = seq_model.fit_transform(text_token_data_permutation).toarray()
    assert np.allclose(tree_fitted, seq_fitted)

    tree_refitted = tree_model.fit_transform(seq_tree_sequence).toarray()
    tree_transformed = tree_model.transform(seq_tree_sequence).toarray()
    assert np.allclose(tree_refitted, tree_transformed)

    seq_refitted = seq_model.fit_transform(text_token_data_permutation).toarray()
    seq_transformed = seq_model.transform(text_token_data_permutation).toarray()
    assert np.allclose(seq_refitted, seq_transformed)

    assert np.allclose(
        tree_model.transform(seq_tree_sequence).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
def test_equality_of_Timed_and_MultiSet_CooccurrenceVectorizers(
    n_iter,
    normalize_windows,
    kernel_function,
    n_threads,
):
    """Token, TimedToken, and MultiSet vectorizers agree on equivalent inputs.

    Renamed from ``test_equality_of_CooccurrenceVectorizers``: that name
    duplicated another test in this module, so the earlier definition was
    shadowed and silently never collected by pytest.

    Fits each of the three vectorizer variants with identical hyperparameters
    and asserts that fit_transform and transform all reproduce the same dense
    matrix as the plain TokenCooccurrenceVectorizer baseline.
    """
    window_radius = [1, 3]
    window_function = ["fixed", "variable"]
    model1 = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        n_iter=n_iter,
        kernel_functions=kernel_function,
        window_functions=window_function,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model2 = TimedTokenCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model3 = MultiSetCooccurrenceVectorizer(
        window_radii=window_radius,
        kernel_functions=kernel_function,
        window_functions=window_function,
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    # All comparisons are made against the plain token vectorizer's output.
    base_result = model1.fit_transform(tiny_token_data).toarray()
    assert np.allclose(
        base_result,
        model2.fit_transform(timed_tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model3.fit_transform(tiny_multi_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model1.transform(tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model2.transform(timed_tiny_token_data).toarray(),
    )
    assert np.allclose(
        base_result,
        model3.transform(tiny_multi_token_data).toarray(),
    )
def test_equality_of_Tree_and_Token_CooccurrenceVectorizers(
    min_token_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
    nullify_mask,
):
    """Tree- and token-based vectorizers agree, including mask nullification.

    Fix: replaced ``not mask_string is None`` with the idiomatic
    ``mask_string is not None`` (PEP 8 / E714); behavior is unchanged.

    Nullifying the mask only makes sense when a mask string exists, hence the
    ``nullify_mask and mask_string is not None`` guard on both models.
    """
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_tree_frequency=max_document_frequency,
        mask_string=mask_string,
        nullify_mask=nullify_mask and mask_string is not None,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        window_orientations=window_orientation,
        kernel_functions=kernel_function,
        min_occurrences=min_token_occurrences,
        max_document_frequency=max_document_frequency,
        mask_string=mask_string,
        normalize_windows=False,
        nullify_mask=nullify_mask and mask_string is not None,
    )
    # Tree and sequence models must agree on equivalent corpora.
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        seq_model.fit_transform(text_token_data_permutation).toarray(),
    )
    # Each model's transform must reproduce its own fit_transform output.
    assert np.allclose(
        tree_model.fit_transform(seq_tree_sequence).toarray(),
        tree_model.transform(seq_tree_sequence).toarray(),
    )
    assert np.allclose(
        seq_model.fit_transform(text_token_data_permutation).toarray(),
        seq_model.transform(text_token_data_permutation).toarray(),
    )
def test_token_cooccurrence_vectorizer_transform_new_vocab():
    """Transforming data containing unseen tokens leaves the matrix unchanged.

    Fix: renamed from ``test_token_cooccurence_vectorizer_transform_new_vocab``
    ("cooccurence" typo) for consistency with the other test names in this
    module; the test body is unchanged.
    """
    vectorizer = TokenCooccurrenceVectorizer()
    result = vectorizer.fit_transform(text_token_data_subset)
    # New tokens are outside the fitted vocabulary, so they contribute nothing.
    transform = vectorizer.transform(text_token_data_new_token)
    assert (result != transform).nnz == 0
def test_token_cooccurrence_vectorizer_transform_symmetric():
    """Symmetric-window transform of a superset corpus keeps the fitted shape.

    Renamed from ``test_token_cooccurrence_vectorizer_transform``: that name
    duplicated the following test in this module, so this definition was
    shadowed and silently never collected by pytest. The ``_symmetric`` suffix
    reflects the ``window_orientation='symmetric'`` configuration under test.
    """
    vectorizer = TokenCooccurrenceVectorizer(window_orientation='symmetric')
    result = vectorizer.fit_transform(text_token_data_subset)
    # The vocabulary is fixed at fit time, so transforming the larger corpus
    # must preserve the matrix shape while accumulating more counts.
    transform = vectorizer.transform(text_token_data)
    assert result.shape == transform.shape
    assert transform[0, 0] == 34
def test_token_cooccurrence_vectorizer_transform():
    """Transforming a superset corpus preserves the fitted vocabulary's shape."""
    model = TokenCooccurrenceVectorizer()
    fitted_matrix = model.fit_transform(text_token_data_subset)
    # Vocabulary is frozen at fit time; the full corpus only adds counts,
    # never new rows/columns.
    full_matrix = model.transform(text_token_data)
    assert fitted_matrix.shape == full_matrix.shape
    assert full_matrix[0, 0] == 34