예제 #1
0
def test_token_cooccurrence_vectorizer_text():
    """Basic sanity checks on TokenCooccurrenceVectorizer over text tokens.

    A default vectorizer must return a sparse matrix from fit_transform, and
    transform on the same data must reproduce it exactly. A radius-1,
    'after'-oriented vectorizer must additionally reproduce two known
    co-occurrence counts for this corpus.
    """
    model = TokenCooccurrenceVectorizer()
    fitted = model.fit_transform(text_token_data)
    assert scipy.sparse.issparse(fitted)
    reapplied = model.transform(text_token_data)
    # Sparse equality: no entries may differ between fit_transform and transform.
    assert (fitted != reapplied).nnz == 0

    model = TokenCooccurrenceVectorizer(
        window_radius=1, window_orientation="after"
    )
    fitted = model.fit_transform(text_token_data)
    reapplied = model.transform(text_token_data)
    assert (fitted != reapplied).nnz == 0
    # Spot-check two expected counts for the fixture corpus.
    assert fitted[1, 2] == 8
    assert fitted[0, 1] == 6
예제 #2
0
def test_equality_of_CooccurrenceVectorizers(
    min_token_occurrences,
    max_token_occurrences,
    min_document_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
):
    """Tree- and sequence-based co-occurrence vectorizers must agree.

    Builds a LabelledTreeCooccurrenceVectorizer and a TokenCooccurrenceVectorizer
    with matching hyperparameters, then asserts that fit_transform and transform
    produce identical dense matrices on the paired tree/sequence fixtures.
    """
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_tree_frequency=max_document_frequency,
        min_tree_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_occurrences=max_token_occurrences,
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
        mask_string=mask_string,
    )
    # Fit each model once and cache the four matrices: the original refit every
    # model per assertion, which is redundant since fitting on identical data is
    # deterministic (and that determinism is itself asserted below).
    tree_fit = tree_model.fit_transform(seq_tree_sequence).toarray()
    seq_fit = seq_model.fit_transform(text_token_data_permutation).toarray()
    tree_tx = tree_model.transform(seq_tree_sequence).toarray()
    seq_tx = seq_model.transform(text_token_data_permutation).toarray()
    # fit_transform agrees across model types ...
    assert np.allclose(tree_fit, seq_fit)
    # ... fit_transform agrees with transform on each fitted model ...
    assert np.allclose(tree_fit, tree_tx)
    assert np.allclose(seq_fit, seq_tx)
    # ... and transform agrees across model types.
    assert np.allclose(tree_tx, seq_tx)
예제 #3
0
def test_equality_of_CooccurrenceVectorizers(
    n_iter,
    normalize_windows,
    kernel_function,
    n_threads,
):
    """Three vectorizer variants with identical settings must agree.

    TokenCooccurrenceVectorizer, TimedTokenCooccurrenceVectorizer and
    MultiSetCooccurrenceVectorizer are configured identically; every
    fit_transform and transform result must match the baseline matrix
    produced by the plain token vectorizer.
    """
    # All three variants share the exact same hyperparameters.
    shared = dict(
        window_radii=[1, 3],
        kernel_functions=kernel_function,
        window_functions=["fixed", "variable"],
        n_iter=n_iter,
        normalize_windows=normalize_windows,
        n_threads=n_threads,
    )
    model1 = TokenCooccurrenceVectorizer(**shared)
    model2 = TimedTokenCooccurrenceVectorizer(**shared)
    model3 = MultiSetCooccurrenceVectorizer(**shared)

    base_result = model1.fit_transform(tiny_token_data).toarray()
    # Every other (model, method, data) combination must reproduce the baseline.
    assert np.allclose(
        base_result, model2.fit_transform(timed_tiny_token_data).toarray()
    )
    assert np.allclose(
        base_result, model3.fit_transform(tiny_multi_token_data).toarray()
    )
    assert np.allclose(base_result, model1.transform(tiny_token_data).toarray())
    assert np.allclose(
        base_result, model2.transform(timed_tiny_token_data).toarray()
    )
    assert np.allclose(
        base_result, model3.transform(tiny_multi_token_data).toarray()
    )
예제 #4
0
def test_equality_of_Tree_and_Token_CooccurrenceVectorizers(
    min_token_occurrences,
    max_document_frequency,
    window_radius,
    window_orientation,
    kernel_function,
    mask_string,
    nullify_mask,
):
    """Tree- and token-based vectorizers (multi-window API) must agree.

    Configures a LabelledTreeCooccurrenceVectorizer and an (unnormalized)
    TokenCooccurrenceVectorizer with matching hyperparameters and asserts
    their fit_transform/transform outputs are identical on the paired
    tree/sequence fixtures.
    """
    # Only nullify the mask token when a mask string actually exists.
    # (Fix: original used `not mask_string is None`, the E714 anti-idiom.)
    nullify = nullify_mask and mask_string is not None
    tree_model = LabelledTreeCooccurrenceVectorizer(
        window_radius=window_radius,
        window_orientation=window_orientation,
        kernel_function=kernel_function,
        min_occurrences=min_token_occurrences,
        max_tree_frequency=max_document_frequency,
        mask_string=mask_string,
        nullify_mask=nullify,
    )
    seq_model = TokenCooccurrenceVectorizer(
        window_radii=window_radius,
        window_orientations=window_orientation,
        kernel_functions=kernel_function,
        min_occurrences=min_token_occurrences,
        max_document_frequency=max_document_frequency,
        mask_string=mask_string,
        normalize_windows=False,
        nullify_mask=nullify,
    )
    # Fit each model once and reuse the matrices: refitting on identical data
    # is deterministic, and that determinism is asserted below.
    tree_fit = tree_model.fit_transform(seq_tree_sequence).toarray()
    seq_fit = seq_model.fit_transform(text_token_data_permutation).toarray()
    # Tree and sequence models agree on equivalent data ...
    assert np.allclose(tree_fit, seq_fit)
    # ... and fit_transform agrees with transform for each model.
    assert np.allclose(tree_fit, tree_model.transform(seq_tree_sequence).toarray())
    assert np.allclose(
        seq_fit, seq_model.transform(text_token_data_permutation).toarray()
    )
예제 #5
0
def test_token_cooccurence_vectorizer_transform_new_vocab():
    """Tokens unseen during fit must not change the transform output.

    Fit on the subset corpus, then transform a corpus containing a novel
    token: the resulting matrix must be entry-for-entry identical to the
    fitted one (the new token is simply ignored).
    """
    model = TokenCooccurrenceVectorizer()
    fitted = model.fit_transform(text_token_data_subset)
    projected = model.transform(text_token_data_new_token)
    assert (fitted != projected).nnz == 0
예제 #6
0
def test_token_cooccurrence_vectorizer_transform():
    """Transform with a symmetric window keeps the fitted vocabulary's shape.

    Fit on a vocabulary subset, transform the full corpus: the matrix shape
    must be unchanged, and a known entry must match its expected count.
    """
    model = TokenCooccurrenceVectorizer(window_orientation='symmetric')
    fitted = model.fit_transform(text_token_data_subset)
    projected = model.transform(text_token_data)
    assert fitted.shape == projected.shape
    # Spot-check a known co-occurrence count for the full corpus.
    assert projected[0, 0] == 34
예제 #7
0
def test_token_cooccurrence_vectorizer_transform():
    """Transform with default settings keeps the fitted vocabulary's shape.

    Fit on a vocabulary subset, transform the full corpus: the matrix shape
    must be unchanged, and a known entry must match its expected count.
    """
    model = TokenCooccurrenceVectorizer()
    fitted = model.fit_transform(text_token_data_subset)
    projected = model.transform(text_token_data)
    assert fitted.shape == projected.shape
    # Spot-check a known co-occurrence count for the full corpus.
    assert projected[0, 0] == 34