示例#1
0
def _expected_similarity_matrix(X_test, prototypes, similarity, similarity_f):
    """Compute the reference similarity matrix with explicit Python loops.

    Both ``X_test`` and ``prototypes`` are flattened first. Entry ``[i, j]``
    of the result is ``similarity_f`` applied to the i-th test string and the
    j-th prototype. When ``similarity == 'ngram'`` the n-gram size 3 is
    passed as a third positional argument.
    """
    test_strings = np.array(X_test).reshape(-1)
    reference_strings = np.array(prototypes).reshape(-1)
    expected = np.zeros((len(test_strings), len(reference_strings)))
    for i, x_t in enumerate(test_strings):
        for j, x in enumerate(reference_strings):
            if similarity == 'ngram':
                expected[i, j] = similarity_f(x_t, x, 3)
            else:
                expected[i, j] = similarity_f(x_t, x)
    return expected


def _test_similarity(similarity, similarity_f, hashing_dim=None, categories='auto', n_prototypes=None):
    """Check SimilarityEncoder output against a directly computed reference.

    Parameters
    ----------
    similarity : str
        Similarity name given to the encoder. For ``'ngram'`` the model's
        ``ngram_range`` is set to ``(3, 3)`` before fitting.
    similarity_f : callable
        Reference implementation, called as ``similarity_f(a, b)`` or, for
        ``'ngram'``, as ``similarity_f(a, b, 3)``.
    hashing_dim, categories
        Forwarded unchanged to the encoder constructor.
    n_prototypes : int or None
        When ``None``, the encoder is fitted on three distinct strings and
        the reference is computed against those training strings.
        Otherwise the encoder is built with ``random_state=42`` on a
        training set with repeated values, and the reference is computed
        against the fitted ``categories_``. For 1, 2 or 3 prototypes the
        selected categories are also checked. A ``ValueError`` raised by the
        constructor is accepted as long as it carries the expected message.
    """
    X_test = np.array([['Aa', 'aAa', 'aaa', 'aaab', ' aaa  c']]).reshape(-1, 1)

    if n_prototypes is None:
        X = np.array(['aa', 'aaa', 'aaab']).reshape(-1, 1)
        model = similarity_encoder.SimilarityEncoder(
            similarity=similarity, hashing_dim=hashing_dim,
            categories=categories, n_prototypes=n_prototypes)
    else:
        X = np.array(
            ['aac', 'aaa', 'aaab', 'aaa', 'aaab', 'aaa', 'aaab', 'aaa']
        ).reshape(-1, 1)
        try:
            model = similarity_encoder.SimilarityEncoder(
                similarity=similarity, hashing_dim=hashing_dim,
                categories=categories, n_prototypes=n_prototypes,
                random_state=42)
        except ValueError as e:
            # An invalid n_prototypes is allowed to be rejected up front;
            # only the error message is verified in that case.
            assert str(e) == 'n_prototypes expected None or a positive non null integer'
            return

    if similarity == 'ngram':
        model.ngram_range = (3, 3)

    encoded = model.fit(X).transform(X_test)

    if n_prototypes is None:
        prototypes = X
    else:
        expected_prototypes = None
        if n_prototypes == 1:
            expected_prototypes = ['aaa']
        elif n_prototypes == 2:
            expected_prototypes = ['aaa', 'aaab']
        elif n_prototypes == 3:
            expected_prototypes = ['aaa', 'aaab', 'aac']

        if expected_prototypes is not None:
            # categories_ holds one array per input column; compare it to
            # the expectation wrapped the same way, for every branch alike.
            wrapped = [np.array(expected_prototypes, dtype='<U4')]
            assert np.array_equal(wrapped, model.categories_)

        prototypes = model.categories_

    ans = _expected_similarity_matrix(X_test, prototypes, similarity,
                                      similarity_f)
    numpy.testing.assert_almost_equal(encoded, ans)
示例#2
0
def test_specifying_categories():
    """Explicit categories must encode the same as ``categories='auto'``.

    Per the original author's notes, ``categories='auto'`` uses the sorted,
    unique training observations of each column, while a list of lists
    supplies the categories of each column directly. Here the sorted unique
    values of the training column are written out by hand and passed
    explicitly; the resulting feature matrix must match the one produced by
    an encoder left on its default category setting.
    """
    observations = [['bar'], ['foo']]
    categories = [['bar', 'foo']]

    # Options shared by both encoders, so only `categories` differs.
    ngram_options = {'ngram_range': (2, 3), 'similarity': 'ngram'}

    explicit_encoder = similarity_encoder.SimilarityEncoder(
        categories=categories, **ngram_options)
    auto_encoder = similarity_encoder.SimilarityEncoder(**ngram_options)

    explicit_features = explicit_encoder.fit_transform(observations)
    auto_features = auto_encoder.fit_transform(observations)

    assert np.allclose(auto_features, explicit_features)
示例#3
0
def _test_missing_values_transform(input_type, missing):
    """Check how ``transform`` treats NaN entries after a NaN-free fit.

    ``input_type`` selects the container of the test data: ``'numpy'``
    converts it to an object array, ``'pandas'`` to a DataFrame (the test is
    skipped when pandas is unavailable); any other value keeps the nested
    list. ``missing`` is passed as ``handle_missing`` to the encoder:

    * ``'error'``: ``transform`` must raise ``ValueError`` with the
      documented message;
    * ``''``: ``transform`` must return the expected matrix, in which the
      rows containing NaN have all-zero entries for the affected column.

    Any other ``missing`` value only constructs the encoder.
    """
    nan = np.nan
    train_rows = [['a', 'b'], ['b', 'a'], ['b', 'b'], ['a', 'c'], ['c', 'a']]
    test_rows = [['a', 'b'], ['b', 'a'], ['b', nan], ['a', 'c'], [nan, 'a']]
    expected = np.array([
        [1., 0., 0., 0., 1., 0.],
        [0., 1., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 1.],
        [0., 0., 0., 1., 0., 0.],
    ])

    if input_type == 'numpy':
        test_rows = np.array(test_rows, dtype=object)
    elif input_type == 'pandas':
        pd = pytest.importorskip("pandas")
        test_rows = pd.DataFrame(test_rows)

    sim_enc = similarity_encoder.SimilarityEncoder(handle_missing=missing)

    if missing == 'error':
        sim_enc.fit_transform(train_rows)
        expected_message = (r"Found missing values in input "
                            "data; set handle_missing='' to encode "
                            "with missing values")
        with pytest.raises(ValueError, match=expected_message):
            sim_enc.transform(test_rows)
        return

    if missing == '':
        sim_enc.fit_transform(train_rows)
        transformed = sim_enc.transform(test_rows)
        assert np.allclose(expected, transformed)
示例#4
0
def test_reproducibility():
    """Refitting a seeded k-means encoder must give the same prototypes.

    The encoder is fitted once to obtain reference prototypes for the single
    input column, then refitted ten more times on the same data; every refit
    must reproduce the reference exactly.
    """
    # One training string per character code 32..126, padded with spaces.
    X = np.array(
        [' %s ' % chr(code) for code in range(32, 127)]
    ).reshape((-1, 1))

    sim_enc = similarity_encoder.SimilarityEncoder(
        categories='k-means', n_prototypes=10, random_state=435)

    reference = sim_enc.fit(X).categories_[0]
    for _ in range(10):
        refit = sim_enc.fit(X).categories_[0]
        assert np.array_equal(reference, refit)
示例#5
0
def test_fast_ngram_similarity():
    """``transform(fast=True)`` must agree with ``transform(fast=False)``.

    A bigram similarity encoder with a fixed vocabulary is fitted once and
    the same observations are transformed through both code paths.
    """
    vocabulary = [['bar', 'foo']]
    observations = [['foo'], ['baz']]

    sim_enc = similarity_encoder.SimilarityEncoder(
        similarity='ngram', ngram_range=(2, 2), categories=vocabulary)
    sim_enc.fit(observations)

    # Keyed by the `fast` flag; the slow path is evaluated first.
    features_by_mode = {
        use_fast: sim_enc.transform(observations, fast=use_fast)
        for use_fast in (False, True)
    }

    assert np.allclose(features_by_mode[False], features_by_mode[True])
示例#6
0
def test_get_features():
    """Feature names must be ``'x0_'`` followed by each fitted category.

    See https://github.com/dirty-cat/dirty_cat/issues/168

    The encoder is fitted on one single-character string per character code
    32..126, given in ascending code order. The reported feature names are
    expected in that same order, each prefixed with ``'x0_'``.
    """
    printable_chars = [chr(code) for code in range(32, 127)]
    X = np.array(['%s' % char for char in printable_chars]).reshape((-1, 1))

    sim_enc = similarity_encoder.SimilarityEncoder(random_state=435)
    sim_enc.fit(X)

    expected_names = ['x0_' + char for char in printable_chars]
    assert sim_enc.get_feature_names().tolist() == expected_names
示例#7
0
def _test_similarity(similarity,
                     similarity_f,
                     hashing_dim=None,
                     categories='auto'):
    """Check SimilarityEncoder output against a directly computed reference.

    Parameters
    ----------
    similarity : str
        Similarity name given to the encoder.
    similarity_f : callable
        Reference implementation, called as ``similarity_f(a, b)`` or, when
        ``similarity == 'ngram'``, as ``similarity_f(a, b, 3)``.
    hashing_dim, categories
        Forwarded unchanged to the encoder constructor.

    The encoder is fitted on three training strings and applied to five test
    strings; entry ``[i, j]`` of its output must match the reference
    similarity between test string ``i`` and training string ``j``.
    """
    X = np.array(['aa', 'aaa', 'aaab']).reshape(-1, 1)
    X_test = np.array([['Aa', 'aAa', 'aaa', 'aaab', ' aaa  c']]).reshape(-1, 1)

    model = similarity_encoder.SimilarityEncoder(similarity=similarity,
                                                 handle_unknown='ignore',
                                                 hashing_dim=hashing_dim,
                                                 categories=categories)

    encoder = model.fit(X).transform(X_test)

    # The n-gram reference takes the n-gram size as an extra argument.
    # NOTE(review): the encoder's ngram_range is left at its default here;
    # presumably that default corresponds to n = 3 -- confirm.
    extra_args = (3,) if similarity == 'ngram' else ()

    train_strings = X.reshape(-1)
    test_strings = X_test.reshape(-1)
    ans = np.array(
        [[similarity_f(x_t, x, *extra_args) for x in train_strings]
         for x_t in test_strings],
        dtype=float)

    # Similarity scores are computed floats: compare with a tolerance
    # instead of requiring bit-for-bit equality.
    assert np.allclose(encoder, ans)
示例#8
0
def test_similarity_encoder():
    """Check every supported similarity against its reference function.

    For each similarity name, an encoder is fitted on three training strings
    and applied to five test strings. Entry ``[i, j]`` of the output must
    match the corresponding ``string_distances`` function evaluated on test
    string ``i`` and training string ``j``. The n-gram reference is evaluated
    with n = 3.
    """
    X = np.array(['aa', 'aaa', 'aaab']).reshape(-1, 1)
    X_test = np.array([['Aa', 'aAa', 'aaa', 'aaab', ' aaa  c']]).reshape(-1, 1)

    train_strings = X.reshape(-1)
    test_strings = X_test.reshape(-1)

    # (similarity name, two-argument reference function), in test order.
    reference_functions = [
        ('levenshtein-ratio', string_distances.levenshtein_ratio),
        ('jaro-winkler', string_distances.jaro_winkler),
        ('jaro', string_distances.jaro),
        ('ngram',
         lambda a, b: string_distances.ngram_similarity(a, b, 3)),
    ]

    for similarity, reference in reference_functions:
        model = similarity_encoder.SimilarityEncoder(similarity=similarity,
                                                     handle_unknown='ignore')

        encoder = model.fit(X).transform(X_test)

        ans = np.array(
            [[reference(x_t, x) for x in train_strings]
             for x_t in test_strings],
            dtype=float)

        # Similarity scores are computed floats: compare with a tolerance
        # instead of requiring bit-for-bit equality. The similarity name is
        # attached so a failure identifies which measure disagreed.
        assert np.allclose(encoder, ans), similarity