def hybrid_jaccard_similarity(set1, set2, threshold=0.5, function=jaro_winkler_similarity, parameters={}):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    matching_score = []
    for s1 in set1:
        inner = []
        for s2 in set2:
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            inner.append(1.0 - score)  # munkres finds out the smallest element
        matching_score.append(inner)

    indexes = munkres.Munkres().compute(matching_score)

    score_sum, matching_count = 0.0, 0
    for r, c in indexes:
        matching_count += 1
        score_sum += 1.0 - matching_score[r][c]  # go back to similarity

    if len(set1) + len(set2) - matching_count == 0:
        return 1.0
    return float(score_sum) / float(len(set1) + len(set2) - matching_count)
def hamming_distance(s1, s2):
    """
    Hamming distance measures the minimum number of substitutions required to change one sequence into the other.

    Args:
        s1 (str or list): Sequence 1.
        s2 (str or list): Sequence 2.

    Returns:
        int: Hamming distance between two sequences.

    Examples:
        >>> rltk.hamming_distance('ab','cd')
        2
        >>> rltk.hamming_distance([1,2,3],[3,2,3])
        1
    """
    utils.check_for_none(s1, s2)
    # utils.check_for_type(str, s1, s2)

    if len(s1) != len(s2):
        raise ValueError('Unequal length')

    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None):
    """
    Needleman-Wunsch score.
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    score_table = score_table if isinstance(score_table, dict) else {}

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # construct matrix to get max score of all possible alignments
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]

    for i in range(n1 + 1):
        for j in range(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                dp[i][j] = gap + dp[i][j - 1]
            elif j == 0:  # most left column
                dp[i][j] = gap + dp[i - 1][j]
            else:
                dp[i][j] = max(dp[i][j - 1] + gap,
                               dp[i - 1][j] + gap,
                               dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1], match, mismatch, score_table))

    return dp[n1][n2]
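# Illustrative usage sketch (not part of the library). It assumes `needleman_wunsch_score`
# above is in scope and that the private helper `_get_score` (not shown here) returns
# `match` for equal characters and `mismatch` otherwise when no `score_table` entry applies.
def _needleman_wunsch_score_demo():
    # 'ab' vs 'abc': two matches (2 + 2) plus one gap (-0.5) under the default costs
    print(needleman_wunsch_score('ab', 'abc'))  # expected 3.5 under the assumption above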
def string_cosine_similarity(bag1, bag2):
    """
    The similarity between the two strings is the cosine of the angle between their two vector representations.

    Args:
        bag1 (list): Bag1, tokenized string sequence.
        bag2 (list): Bag2, tokenized string sequence.

    Returns:
        float: Cosine similarity.
    """
    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    d1 = collections.Counter(bag1)
    d2 = collections.Counter(bag2)

    intersection = set(d1.keys()) & set(d2.keys())
    v_x_y = sum([d1[x] * d2[x] for x in intersection])
    v_x_2 = sum([v * v for k, v in d1.items()])
    v_y_2 = sum([v * v for k, v in d2.items()])

    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
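# Worked example (illustrative, not from the library's test suite). Token bags are turned
# into count vectors: ['a', 'b', 'a'] -> {a: 2, b: 1} and ['a', 'c'] -> {a: 1, c: 1}; the dot
# product over shared tokens is 2 and the norms are sqrt(5) and sqrt(2), so the result is
# 2 / sqrt(10), roughly 0.632.
def _string_cosine_similarity_demo():
    print(string_cosine_similarity(['a', 'b', 'a'], ['a', 'c']))  # ~0.632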
def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters={}):
    """
    Monge Elkan similarity.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        function (function, optional): The reference of a similarity measure function. \
            It should return the value in range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to empty dict.

    Returns:
        float: Monge Elkan similarity.
    """
    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for ele1 in bag1:
        max_score = MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

    return float(score_sum) / float(len(bag1))
def metric_longest_common_subsequence(s1, s2):
    """
    The Metric LCS distance between two strings is based on the Longest Common Subsequence (LCS) between them;
    it is computed as 1 - |LCS(s1, s2)| / max(|s1|, |s2|).

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Metric Longest Common Subsequence Distance.

    Examples:
        >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
        0.4
        # LCS: ABCDEF => length = 6
        # longest = s2 => length = 10
        # => 1 - 6 / 10 = 0.4

        >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
        0.2
        # LCS: ABDF => length = 4
        # longest = ABDEF => length = 5
        # => 1 - 4 / 5 = 0.2
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    lcs = _lcs(s1, s2)
    return 1 - float(lcs) / max(len(s1), len(s2), 1)
def longest_common_subsequence_distance(s1, s2):
    """
    The LCS distance between strings X (of length n) and Y (of length m) is n + m - 2 |LCS(X, Y)|,
    where min = 0 and max = n + m.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Longest Common Subsequence Distance.

    Examples:
        >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
        2
        >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    m, n = len(s1), len(s2)

    lcs = _lcs(s1, s2)
    return n + m - 2 * lcs
def cosine_similarity(vec1, vec2):
    """
    The cosine similarity between two vectors.

    Args:
        vec1 (list): Vector 1. List of integers or floats.
        vec2 (list): Vector 2. List of integers or floats. It should have the same length as vec1.

    Returns:
        float: Cosine similarity.

    Examples:
        >>> rltk.cosine_similarity([1, 2, 1, 3], [2, 5, 2, 3])
        0.91634193
    """
    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    for v1, v2 in zip(vec1, vec2):  # list of int / float
        v_x_y += v1 * v2
        v_x_2 += v1 * v1
        v_y_2 += v2 * v2
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
def soundex(s):
    """
    The standard used for this implementation is provided by
    `U.S. Census Bureau <https://www.archives.gov/research/census/soundex.html>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.soundex('ashcraft')
        'A261'
        >>> rltk.soundex('pineapple')
        'P514'
    """
    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()

    CODES = (
        ('BFPV', '1'),
        ('CGJKQSXZ', '2'),
        ('DT', '3'),
        ('L', '4'),
        ('MN', '5'),
        ('R', '6'),
        ('AEIOUHWY', '.')  # placeholder
    )
    CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)

    sdx = s[0]
    for i in range(1, len(s)):
        if s[i] not in CODE_DICT:
            continue
        code = CODE_DICT[s[i]]
        if code == '.':
            continue
        if s[i] == s[i - 1]:  # ignore same letter
            continue
        if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code:  # 'side-by-side' rule
            continue
        if s[i - 1] in ('H', 'W') and i - 2 > 0 and \
                s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] != '.':  # consonant separators
            continue
        sdx += code

    sdx = sdx[0:4].ljust(4, '0')

    return sdx
def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):
    """
    Computes the TF/IDF measure. This measure employs the notion of TF/IDF score commonly used in
    information retrieval (IR) to find documents that are relevant to keyword queries.
    The intuition underlying the TF/IDF measure is that two strings are similar if they share
    distinguishing terms.

    Note:
        If you will call this function many times, :meth:`TF_IDF` is more efficient.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        df_corpus (dict): The pre-calculated document frequency of the corpus.
        doc_size (int): Total number of documents in the corpus.
        math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. \
            Defaults to False.

    Returns:
        float: TF/IDF cosine similarity.

    Examples:
        >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a': 3, 'b': 1, 'c': 1}, 3)
        0.17541160386140586
        >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a': 3, 'b': 2, 'c': 1}, 4, True)
        0.12977804138
        >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a'], {'a': 3, 'b': 1, 'c': 1}, 3)
        0.5547001962252291
    """
    # http://www.tfidf.com/
    utils.check_for_none(bag1, bag2, df_corpus)
    utils.check_for_type(list, bag1, bag2)

    # term frequency for input strings
    t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
    tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
    tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}

    # unique elements
    total_unique_elements = set()
    total_unique_elements.update(bag1)
    total_unique_elements.update(bag2)

    idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

    # tf-idf calculation
    for element in total_unique_elements:
        if element not in df_corpus:
            continue
        idf_element = doc_size * 1.0 / df_corpus[element]

        v_x = 0 if element not in tf_x else (math.log(idf_element) * tf_x[element]) if math_log \
            else (idf_element * tf_x[element])
        v_y = 0 if element not in tf_y else (math.log(idf_element) * tf_y[element]) if math_log \
            else (idf_element * tf_y[element])

        v_x_y += v_x * v_y
        v_x_2 += v_x * v_x
        v_y_2 += v_y * v_y

    # cosine similarity
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
def _jaccard_index(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return float(len(set1 & set2)) / float(len(set1 | set2))
def dice_similarity(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
def damerau_levenshtein_distance(s1, s2):
    """
    Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform \
    one string into the other, where an operation is defined as an insertion, deletion, or substitution of \
    a single character, or a transposition of two adjacent characters.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Damerau Levenshtein Distance.

    Examples:
        >>> rltk.damerau_levenshtein_distance('abcd', 'acbd')
        1
        >>> rltk.damerau_levenshtein_distance('abbd', 'acad')
        2
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    infinite = n1 + n2

    char_arr = defaultdict(int)
    dp = [[0] * (n2 + 2) for _ in range(n1 + 2)]

    dp[0][0] = infinite
    for i in range(0, n1 + 1):
        dp[i + 1][0] = infinite
        dp[i + 1][1] = i
    for i in range(0, n2 + 1):
        dp[0][i + 1] = infinite
        dp[1][i + 1] = i

    for i in range(1, n1 + 1):
        db = 0
        for j in range(1, n2 + 1):
            i1 = char_arr[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            dp[i + 1][j + 1] = min(dp[i][j] + cost,
                                   dp[i + 1][j] + 1,
                                   dp[i][j + 1] + 1,
                                   dp[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1))
        char_arr[s1[i - 1]] = i

    return dp[n1 + 1][n2 + 1]
def _jaccard_index(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    # return float(len(set1 & set2)) / float(len(set1 | set2))
    inter_len = len(set1 & set2)
    return float(inter_len) / (len(set1) + len(set2) - inter_len)
def hybrid_jaccard_similarity(set1, set2, threshold=0.5, function=jaro_winkler_similarity, parameters=None):
    """
    Generalized Jaccard Measure.

    Args:
        set1 (set): Set 1.
        set2 (set): Set 2.
        threshold (float, optional): The threshold to keep the score of similarity function. \
            Defaults to 0.5.
        function (function, optional): The reference of a similarity measure function. \
            It should return the value in range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to None.

    Returns:
        float: Hybrid Jaccard similarity.

    Examples:
        >>> def hybrid_test_similarity(m, n):
        ...     ...
        >>> rltk.hybrid_jaccard_similarity(set(['a','b','c']), set(['p', 'q']), function=hybrid_test_similarity)
        0.533333333333
    """
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    parameters = parameters if isinstance(parameters, dict) else {}

    matching_score = []
    for s1 in set1:
        inner = []
        for s2 in set2:
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            inner.append(1.0 - score)  # the assignment solver minimizes cost, so store 1 - score
        matching_score.append(inner)

    row_idx, col_idx = linear_sum_assignment(matching_score)

    score_sum, matching_count = 0.0, 0
    for r, c in zip(row_idx, col_idx):
        matching_count += 1
        score_sum += 1.0 - matching_score[r][c]  # go back to similarity

    if len(set1) + len(set2) - matching_count == 0:
        return 1.0
    return float(score_sum) / float(len(set1) + len(set2) - matching_count)
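# Usage sketch (illustrative; the plugged-in inner similarity below is a hypothetical toy,
# not a library function). With an exact-match scorer only 'a' pairs up, so the optimal
# assignment keeps one match out of |set1| + |set2| - matches = 3 + 2 - 2 = 3 candidates,
# giving roughly 0.333.
def _hybrid_jaccard_demo():
    exact = lambda x, y: 1.0 if x == y else 0.0  # toy inner similarity in [0, 1]
    print(hybrid_jaccard_similarity({'a', 'b', 'c'}, {'a', 'd'}, function=exact))  # ~0.333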
def string_equal(str1, str2):
    """
    Args:
        str1 (str): String 1.
        str2 (str): String 2.

    Returns:
        int: 0 for unequal and 1 for equal.
    """
    utils.check_for_none(str1, str2)
    utils.check_for_type(str, str1, str2)
    return int(str1 == str2)
def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters=None, lower_bound=None):
    """
    Monge Elkan similarity.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        function (function, optional): The reference of a similarity measure function. \
            It should return the value in range [0,1]. If it is set to None, \
            `jaro_winkler_similarity` will be used.
        parameters (dict, optional): Other parameters of function. Defaults to None.
        lower_bound (float): This is for early exit. If the similarity is not possible to satisfy this value, \
            the function returns immediately with the return value 0.0. Defaults to None.

    Returns:
        float: Monge Elkan similarity.

    Note:
        The order of bag1 and bag2 matters. \
        Alternatively, `symmetric_monge_elkan_similarity` is not sensitive to the order.

        If `lower_bound` is set, the early exit condition is easier to trigger when bag1 is the larger bag.
    """
    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    parameters = parameters if isinstance(parameters, dict) else {}

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for idx, ele1 in enumerate(bag1):
        max_score = utils.MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

        # if it satisfies early exit condition
        if lower_bound:
            rest_max = len(bag1) - 1 - idx  # assume the rest scores are all 1
            if float(score_sum + rest_max) / float(len(bag1)) < lower_bound:
                return 0.0

    sim = float(score_sum) / float(len(bag1))
    if lower_bound and sim < lower_bound:
        return 0.0
    return sim
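# Early-exit sketch (illustrative; the inner similarity below is a hypothetical toy).
# Scores are averaged over bag1, so once the remaining elements cannot lift the average
# above `lower_bound`, the function returns 0.0 without scoring the rest of the bag.
def _monge_elkan_lower_bound_demo():
    exact = lambda x, y: 1.0 if x == y else 0.0
    print(monge_elkan_similarity(['a', 'b', 'c'], ['a', 'c'], function=exact))                   # 2/3
    print(monge_elkan_similarity(['a', 'b', 'c'], ['a', 'c'], function=exact, lower_bound=0.9))  # 0.0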
def string_cosine_similarity(bag1, bag2):
    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    d1 = collections.Counter(bag1)
    d2 = collections.Counter(bag2)

    intersection = set(d1.keys()) & set(d2.keys())
    v_x_y = sum([d1[x] * d2[x] for x in intersection])
    v_x_2 = sum([v * v for k, v in d1.items()])
    v_y_2 = sum([v * v for k, v in d2.items()])

    return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
def optimal_string_alignment_distance(s1, s2):
    """
    This is a variation of the Damerau-Levenshtein distance that returns the strings' edit distance
    taking into account deletion, insertion, substitution, and transposition, under the condition
    that no substring is edited more than once.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        int: Optimal String Alignment Distance.

    Examples:
        >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
        1
        >>> rltk.optimal_string_alignment_distance('ca', 'abc')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]

    for i in range(0, n1 + 1):
        dp[i][0] = i
    for j in range(0, n2 + 1):
        dp[0][j] = j

    for i in range(1, n1 + 1):
        for j in range(1, n2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1

            dp[i][j] = min(dp[i][j - 1] + 1,
                           dp[i - 1][j] + 1,
                           dp[i - 1][j - 1] + cost)

            if i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1]:
                dp[i][j] = min(dp[i][j], dp[i - 2][j - 2] + cost)

    return dp[n1][n2]
def hamming_distance(s1, s2):
    utils.check_for_none(s1, s2)
    # utils.check_for_type(str, s1, s2)
    if type(s1) != type(s2):
        raise TypeError('Different type')

    if isinstance(s1, str) and isinstance(s2, str):
        s1 = utils.unicode_normalize(s1)
        s2 = utils.unicode_normalize(s2)

    if len(s1) != len(s2):
        raise ValueError('Unequal length')

    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters={}):
    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for ele1 in bag1:
        max_score = MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

    return float(score_sum) / float(len(bag1))
def cosine_similarity(vec1, vec2):
    """
    vec1 and vec2 should have the same length, and the elements of each vector should be int / float.
    """
    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
    for v1, v2 in zip(vec1, vec2):  # list of int / float
        v_x_y += v1 * v2
        v_x_2 += v1 * v1
        v_y_2 += v2 * v2
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
def manhattan_distance(vec1, vec2, weights=None):
    """
    Manhattan distance.

    Args:
        vec1 (list): Vector 1. List of integers or floats.
        vec2 (list): Vector 2. List of integers or floats. It should have the same length as vec1.
        weights (list): Weights for each value in the vectors. If it is None, all weights will be 1.0. \
            Defaults to None.

    Returns:
        float: Manhattan distance.
    """
    utils.check_for_none(vec1, vec2)
    utils.check_for_type(list, vec1, vec2)
    if weights:
        utils.check_for_type(list, weights)
    if len(vec1) != len(vec2):
        raise ValueError('vec1 and vec2 should have same length')

    return cityblock(vec1, vec2, weights)
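# Worked example (illustrative). scipy's `cityblock` sums the absolute per-dimension
# differences, optionally scaled by `weights`: |1-4| + |2-6| = 7, and with weights
# [1, 0.5] it becomes 3*1 + 4*0.5 = 5.
def _manhattan_distance_demo():
    print(manhattan_distance([1, 2], [4, 6]))                    # 7
    print(manhattan_distance([1, 2], [4, 6], weights=[1, 0.5]))  # 5.0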
def dice_similarity(set1, set2):
    """
    The Dice similarity score is defined as twice the size of the intersection of two sets divided by
    the sum of their sizes.

    Args:
        set1 (set): Set 1.
        set2 (set): Set 2.

    Returns:
        float: Dice similarity.

    Examples:
        >>> rltk.dice_similarity(set(['a', 'b']), set(['c', 'b']))
        0.5
    """
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
def _jaro_distance(s1, s2):
    # code from https://github.com/nap/jaro-winkler-distance
    # Copyright Jean-Bernard Ratte

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    shorter, longer = s1.lower(), s2.lower()

    if len(s1) > len(s2):
        longer, shorter = shorter, longer

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    if len(m1) == 0 or len(m2) == 0:
        return 0.0

    return (float(len(m1)) / len(shorter) +
            float(len(m2)) / len(longer) +
            float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0
def levenshtein_distance(s1, s2, insert=None, delete=None, substitute=None,
                         insert_default=1, delete_default=1, substitute_default=1):
    """
    The Levenshtein distance between two words is the minimum number of single-character edits (insertions,
    deletions or substitutions) required to change one word into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        insert (dict(str, int), optional): Insert cost of characters. Defaults to None.
        delete (dict(str, int), optional): Delete cost of characters. Defaults to None.
        substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to None.
        insert_default (int, optional): Default value of insert cost. Defaults to 1.
        delete_default (int, optional): Default value of delete cost. Defaults to 1.
        substitute_default (int, optional): Default value of substitute cost. Defaults to 1.

    Returns:
        int: Levenshtein Distance.

    Examples:
        >>> rltk.levenshtein_distance('ab', 'abc')
        1
        >>> rltk.levenshtein_distance('a', 'abc', insert = {'c':50},
        ... insert_default=100, delete_default=100, substitute_default=100)
        150
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    insert = insert if isinstance(insert, dict) else {}
    delete = delete if isinstance(delete, dict) else {}
    substitute = substitute if isinstance(substitute, dict) else {}

    # s1 = utils.unicode_normalize(s1)
    # s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # if n1 == 0 or n2 == 0:
    #     return max(n1, n2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in range(n1 + 1):
        for j in range(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                c = s2[j - 1]
                dp[i][j] = insert[c] if c in insert else insert_default
                dp[i][j] += dp[i][j - 1]
            elif j == 0:  # most left column
                c = s1[i - 1]
                dp[i][j] = delete[c] if c in delete else delete_default
                dp[i][j] += dp[i - 1][j]
            else:
                c1, c2 = s1[i - 1], s2[j - 1]
                insert_cost = insert[c2] if c2 in insert else insert_default
                delete_cost = delete[c1] if c1 in delete else delete_default
                substitute_cost = substitute[c1][c2] \
                    if c1 in substitute and c2 in substitute[c1] else substitute_default

                if c1 == c2:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1] + insert_cost,
                                   dp[i - 1][j] + delete_cost,
                                   dp[i - 1][j - 1] + substitute_cost)

    return dp[n1][n2]
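# Cost-dictionary sketch (illustrative). Per-character costs change which edit path is
# cheapest: substituting 'a' with 'b' is priced at 3 below, so the DP prefers the
# delete-then-insert route, which only costs the two default operations.
def _levenshtein_cost_demo():
    print(levenshtein_distance('a', 'b', substitute={'a': {'b': 3}}))  # 2, via delete 'a' + insert 'b'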
def _nysiis(s):
    """
    New York State Immunization Information System (NYSIIS) Phonetic Code is a phonetic algorithm created by
    `The New York State Department of Health's (NYSDOH) Bureau of Immunization
    <https://www.health.ny.gov/prevention/immunization/information_system/>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.nysiis('ashcraft')
        'ASCRAFT'
        >>> rltk.nysiis('pineapple')
        'PANAPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith('MAC'):
        s = 'MCC' + s[3:]
    elif s.startswith('KN'):
        s = s[1:]
    elif s.startswith('K'):
        s = 'C' + s[1:]
    elif s.startswith(('PH', 'PF')):
        s = 'FF' + s[2:]
    elif s.startswith('SCH'):
        s = 'SSS' + s[3:]

    # step 2 - suffixes
    if s.endswith(('IE', 'EE')):
        s = s[:-2] + 'Y'
    elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V':
            ch = 'AF'
            i += 1
        elif ch in 'AEIOU':
            ch = 'A'
        elif ch == 'Q':
            ch = 'G'
        elif ch == 'Z':
            ch = 'S'
        elif ch == 'M':
            ch = 'N'
        elif ch == 'K':
            if i + 1 < len(s) and s[i + 1] == 'N':
                ch = 'N'
            else:
                ch = 'C'
        elif ch == 'S' and s[i + 1:i + 3] == 'CH':
            ch = 'SS'
            i += 2
        elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H':
            ch = 'F'
            i += 1
        elif ch == 'H' and (s[i - 1] not in 'AEIOU' or (i + 1 < len(s) and s[i + 1] not in 'AEIOU')):
            if s[i - 1] in 'AEIOU':
                ch = 'A'
            else:
                ch = s[i - 1]
        elif ch == 'W' and s[i - 1] in 'AEIOU':
            ch = s[i - 1]

        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = ''.join(key)

    # step 5 - remove trailing S
    if key.endswith('S') and key != 'S':
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith('AY'):
        key = key[:-2] + 'Y'

    # step 7 - remove trailing A
    if key.endswith('A') and key != 'A':
        key = key[:-1]

    # step 8 was already done

    return key
def levenshtein_similarity(s1, s2, insert=None, delete=None, substitute=None,
                           insert_default=1, delete_default=1, substitute_default=1, lower_bound=None):
    """
    Computed as 1 - levenshtein_distance / max-cost(s1, s2).
    """
    insert = insert if isinstance(insert, dict) else {}
    delete = delete if isinstance(delete, dict) else {}
    substitute = substitute if isinstance(substitute, dict) else {}

    def compute_max_cost(s):
        return sum([
            max(insert[c] if c in insert else insert_default,
                delete[c] if c in delete else delete_default,
                substitute[c] if c in substitute else substitute_default)
            for c in s
        ])

    def estimate_min_char_cost(s):
        return min([
            min(insert[c] if c in insert else insert_default,
                delete[c] if c in delete else delete_default,
                substitute[c] if c in substitute else substitute_default)
            for c in s
        ])

    utils.check_for_none(s1, s2)
    utils.check_for_type(str, s1, s2)

    max_cost = max(compute_max_cost(s1), compute_max_cost(s2))

    if lower_bound:
        diff = abs(len(s1) - len(s2))
        if len(s1) == 0 and len(s2) == 0:
            return 1.0
        elif len(s1) == 0:
            min_lev = float(diff * estimate_min_char_cost(s2))
        elif len(s2) == 0:
            min_lev = float(diff * estimate_min_char_cost(s1))
        else:
            min_lev = float(diff * min(estimate_min_char_cost(s1), estimate_min_char_cost(s2)))
        est_sim = 1.0 - min_lev / max_cost
        if est_sim < lower_bound:
            return 0.0

    lev = levenshtein_distance(s1, s2, insert, delete, substitute,
                               insert_default, delete_default, substitute_default)

    if max_cost < lev:
        raise ValueError('Illegal value of operation cost')

    if max_cost == 0:
        return 1.0

    lev_sim = 1.0 - float(lev) / max_cost
    if lower_bound and lev_sim < lower_bound:
        return 0.0
    return lev_sim
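# Worked example (illustrative). With all default costs the maximum cost of a string is its
# length, so for 'ab' vs 'abc': max_cost = max(2, 3) = 3, levenshtein_distance = 1, and the
# similarity is 1 - 1/3, roughly 0.667.
def _levenshtein_similarity_demo():
    print(levenshtein_similarity('ab', 'abc'))  # ~0.667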
def _metaphone(s):
    """
    Metaphone fundamentally improves on the Soundex algorithm by using information about variations and
    inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does
    a better job of matching words and names which sound similar. As with Soundex, similar-sounding words
    should share the same keys. Metaphone is available as a built-in operator in a number of systems.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.metaphone('ashcraft')
        'AXKRFT'
        >>> rltk.metaphone('pineapple')
        'PNPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.lower()
    result = []

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0
    while i < len(s):
        c = s[i]
        next = s[i + 1] if i < len(s) - 1 else '*****'
        nextnext = s[i + 2] if i < len(s) - 2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            if i == 0 or s[i - 1] == ' ':
                result.append(c)
        elif c == 'b':
            if (not (i != 0 and s[i - 1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            if i == 0 or s[i - 1] != 'c':
                result.append('k')
        elif c == 'p':
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            if i == 0 and next == 'h':
                i += 1
                if nextnext in 'aeiou' or nextnext == '*****':
                    result.append('w')
        elif c == 'x':
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()
def ngram_similarity(s0, s1, n=2):
    """
    N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance",
    String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.

    Args:
        s0 (str): Sequence 1.
        s1 (str): Sequence 2.

    Returns:
        float: NGram Similarity.

    Examples:
        >>> rltk.ngram_similarity('ABCD', 'ABTUIO')
        0.4166666666666667
    """
    utils.check_for_none(s0, s1)
    utils.check_for_type(str, s0, s1)

    n1, n2 = len(s0), len(s1)
    special = "\n"

    if n1 == 0 or n2 == 0:
        return 0
    if s0 == s1:
        return 1

    cost = 0
    if n1 < n or n2 < n:
        return 0

    # Adding special chars (n-1) to s0
    sa = special * (n - 1) + s0

    s2_j = [None] * n  # jth n-gram of s2
    d = [0] * (n1 + 1)  # cost array, horizontally
    p = [0] * (n1 + 1)  # 'previous' cost array, horizontally

    for i in range(n1 + 1):
        p[i] = 0

    for j in range(1, n2 + 1):
        # Construct s2_j n-gram
        if j < n:
            for ti in range(n - j):
                s2_j[ti] = special
            for ti in range(n - j, n):
                s2_j[ti] = s1[ti - (n - j)]
        else:
            s2_j = list(s1[j - n:j])

        d[0] = 0
        for i in range(1, n1 + 1):
            cost = 0
            tn = n
            # Compare sa to s2_j
            for ni in range(n):
                if sa[i - 1 + ni] == s2_j[ni] and sa[i - 1 + ni] != "\n":
                    cost += 1
                elif sa[i - 1 + ni] == special:
                    tn -= 1
            ec = float(cost) / tn
            # maximum of the cell to the left, the cell above,
            # and the diagonal cell plus the n-gram overlap score
            d[i] = max(d[i - 1], p[i], p[i - 1] + ec)
        d2 = p
        p = d
        d = d2

    return float(p[n1]) / max(n2, n1)