def test_compare_implementations():
    """Check our string metrics against the python-Levenshtein ones.

    For every generated pair of strings, the values computed by
    ``string_distances`` must be exactly equal to those returned by the
    ``Levenshtein`` package. The test is skipped when the module-level
    name ``Levenshtein`` is ``False``.
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    def check_pair(first, second):
        # ``winkler=False`` is expected to reproduce ``Levenshtein.jaro``.
        ours = string_distances._jaro_winkler(first, second, winkler=False)
        assert ours == Levenshtein.jaro(first, second)

        # ``winkler=True`` is expected to reproduce
        # ``Levenshtein.jaro_winkler``.
        ours = string_distances._jaro_winkler(first, second, winkler=True)
        assert ours == Levenshtein.jaro_winkler(first, second)

        ours = string_distances.levenshtein_ratio(first, second)
        assert ours == Levenshtein.ratio(first, second)

    # First the strings sharing a randomly placed common character, then
    # the fully random strings. Each generator is only called once the
    # previous one has been exhausted.
    for make_pairs in (_random_common_char_pairs, _random_string_pairs):
        for first, second in make_pairs(n_pairs=50):
            check_pair(first, second)
Exemplo n.º 2
0
def test_compare_implementations():
    """Compare our string metrics with python-Levenshtein on random pairs.

    Ten pairs produced by ``_random_string_pairs`` are scored with both
    implementations and the results must match exactly. The test is
    skipped when the module-level name ``Levenshtein`` is ``False``.
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    for left, right in _random_string_pairs(n_pairs=10):
        jaro = string_distances._jaro_winkler(left, right, winkler=False)
        assert jaro == Levenshtein.jaro(left, right)

        # No ``winkler`` argument on purpose: the helper's default is what
        # gets compared against ``Levenshtein.jaro_winkler``.
        jaro_winkler = string_distances._jaro_winkler(left, right)
        assert jaro_winkler == Levenshtein.jaro_winkler(left, right)

        ratio = string_distances.levenshtein_ratio(left, right)
        assert ratio == Levenshtein.ratio(left, right)
Exemplo n.º 3
0
def test_similarity_encoder():
    """Check ``SimilarityEncoder`` output against direct metric calls.

    For each supported ``similarity`` name, an encoder is fitted on ``X``
    and applied to ``X_test``. The result must equal the matrix whose
    entry ``[i, j]`` is the matching ``string_distances`` function applied
    to the i-th test string and the j-th training string.
    """
    X = np.array(['aa', 'aaa', 'aaab']).reshape(-1, 1)
    X_test = np.array([['Aa', 'aAa', 'aaa', 'aaab', ' aaa  c']]).reshape(-1, 1)

    def ngram_3(a, b):
        # The n-gram reference is computed with 3 as the third argument.
        return string_distances.ngram_similarity(a, b, 3)

    # (similarity name given to the encoder, reference function), in the
    # order the similarities are tested.
    reference_metrics = [
        ('levenshtein-ratio', string_distances.levenshtein_ratio),
        ('jaro-winkler', string_distances.jaro_winkler),
        ('jaro', string_distances.jaro),
        ('ngram', ngram_3),
    ]

    train_values = X.reshape(-1)
    test_values = X_test.reshape(-1)

    for similarity, metric in reference_metrics:
        model = similarity_encoder.SimilarityEncoder(
            similarity=similarity, handle_unknown='ignore')
        encoded = model.fit(X).transform(X_test)

        # One row per test string, one column per training string.
        expected = np.array(
            [[metric(x_t, x) for x in train_values] for x_t in test_values],
            dtype=float)
        assert np.array_equal(encoded, expected)
def test_identical_strings():
    """Check that comparing a string with itself gives a similarity of 1.

    Only the first string of each generated pair is used; it is compared
    against itself with the Jaro, Jaro-Winkler and Levenshtein-ratio
    functions of ``string_distances``.
    """
    for text, _unused in _random_string_pairs(n_pairs=10):
        metrics = (
            string_distances.jaro,
            string_distances.jaro_winkler,
            string_distances.levenshtein_ratio,
        )
        for metric in metrics:
            assert metric(text, text) == 1