def test_basic(self):
    """Check get_levenshtein_distance against a table of known distances.

    Every case is evaluated in both argument orders. It is then evaluated
    again with each ``max_value`` from 1 up to one past the expected
    distance, where the result is expected to be
    ``min(expected_dist, max_value)``.
    """
    # Each entry is (expected distance, first string, second string).
    test_cases = (
        (0, "", ""),
        (0, "same string", "same string"),
        (1, "a", "b"),
        (1, "12345", "123 45"),
        (1, "the fall", "the falll"),
        (2, "rolling stones", "roling stone"),
        (5, "", "12345"),
        (2, "1234", "123456"),
        (2, "2345", "123456"),
        (2, "3456", "123456"),
        (3, "abc", "xyz"),
        (3, "kitten", "sitting"),
        (3, "saturday", "sunday"),
        (9, "VERY", "different"),
    )
    for expected_dist, string_1, string_2 in test_cases:
        dist = similarity.get_levenshtein_distance(string_1, string_2)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_1, string_2))

        # Same pair with the arguments swapped.
        dist = similarity.get_levenshtein_distance(string_2, string_1)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_2, string_1))

        # Test setting max_value for many different values.
        # range() rather than xrange() so the loop also runs on Python 3.
        for max_value in range(1, expected_dist + 2):
            clamped_dist = similarity.get_levenshtein_distance(
                string_1, string_2, max_value=max_value)
            self.assertEqual(
                min(expected_dist, max_value), clamped_dist,
                msg="(%s, %s, max_value=%d)" % (
                    string_1, string_2, max_value))
def test_basic(self):
    """Verify get_levenshtein_distance on known (distance, a, b) triples.

    Each triple is checked with the strings in both orders. It is then
    checked with ``max_value`` ranging from 1 to ``expected_dist + 1``,
    where the returned distance is expected to equal
    ``min(expected_dist, max_value)``.
    """
    # Rows are (expected distance, first string, second string).
    test_cases = (
        (0, "", ""),
        (0, "same string", "same string"),
        (1, "a", "b"),
        (1, "12345", "123 45"),
        (1, "the fall", "the falll"),
        (2, "rolling stones", "roling stone"),
        (5, "", "12345"),
        (2, "1234", "123456"),
        (2, "2345", "123456"),
        (2, "3456", "123456"),
        (3, "abc", "xyz"),
        (3, "kitten", "sitting"),
        (3, "saturday", "sunday"),
        (9, "VERY", "different"),
    )
    for expected_dist, string_1, string_2 in test_cases:
        dist = similarity.get_levenshtein_distance(string_1, string_2)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_1, string_2))

        # Swapped argument order must give the same distance.
        dist = similarity.get_levenshtein_distance(string_2, string_1)
        self.assertEqual(expected_dist, dist,
                         msg="(%s, %s)" % (string_2, string_1))

        # Test setting max_value for many different values.
        # range() is used instead of xrange() so this works on Python 3 too.
        for max_value in range(1, expected_dist + 2):
            clamped_dist = similarity.get_levenshtein_distance(
                string_1, string_2, max_value=max_value)
            self.assertEqual(
                min(expected_dist, max_value), clamped_dist,
                msg="(%s, %s, max_value=%d)" % (
                    string_1, string_2, max_value))
def suggest(name):
    """Return the whitelist value whose entry is closest to ``name``.

    ``name`` is canonicalized with ``similarity.canonicalize_string`` and
    compared, by Levenshtein distance, against every entry obtained by
    iterating ``_global_whitelist``. A candidate qualifies only if it is
    fewer than ``MAX_DIST`` edits away and its distance divided by the mean
    of the two string lengths is below ``MAX_NORM_DIST``. Among qualifying
    candidates the one with the smallest normalized distance wins, and the
    first one seen is kept on ties.

    Args:
        name: The name to find a suggestion for.

    Returns:
        ``_global_whitelist.get(best_guess)`` for the best candidate, or the
        result of ``_global_whitelist.get(None)`` when nothing qualifies.
    """
    canon_name = similarity.canonicalize_string(name)

    # Copy the whitelist entries while holding the lock so the scan below
    # works on a private snapshot.
    _global_lock.acquire()
    try:
        canon_whitelist = list(_global_whitelist)
    finally:
        _global_lock.release()

    # We ignore any items that are more than 10 edits away from our
    # original name.
    MAX_DIST = 10
    MAX_NORM_DIST = 0.25

    best_guess = None
    # Starting at the threshold means only candidates strictly below
    # MAX_NORM_DIST can ever be selected.
    best_dist = MAX_NORM_DIST
    for guess in canon_whitelist:
        # Mean length of the two strings; the parentheses matter, since
        # without them only len(canon_name) would be halved.
        normalizer = (len(guess) + len(canon_name)) / 2.0
        # Cap the edit-distance computation: anything at or beyond this
        # bound would fail the checks below anyway.
        max_value = min(MAX_DIST, int(1 + normalizer * MAX_NORM_DIST))
        lev_dist = similarity.get_levenshtein_distance(
            canon_name, guess, max_value=max_value)
        if lev_dist >= MAX_DIST:
            continue
        # Both strings empty gives a zero normalizer; treat that as an
        # exact match rather than dividing by zero.
        normalized_lev_dist = lev_dist / normalizer if normalizer else 0.0
        # Keep only a strictly better candidate than the best so far.
        if normalized_lev_dist < best_dist:
            best_guess = guess
            best_dist = normalized_lev_dist
    return _global_whitelist.get(best_guess)