def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table={}):
    """
    Needleman-Wunsch score: the score of the best global alignment of two
    sequences under the given match / mismatch / gap scheme, computed by
    dynamic programming.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        match (int, optional): Score of two matching characters. Defaults to 2.
        mismatch (int, optional): Score of two mismatched characters. Defaults to -1.
        gap (float, optional): Gap penalty. Defaults to -0.5.
        score_table (dict, optional): Custom character-pair scores consulted by the
            internal `_get_score` helper. Defaults to empty dict.

    Returns:
        float: Needleman-Wunsch score.
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # construct matrix to get max score of all possible alignments
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in xrange(n1 + 1):
        for j in xrange(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                dp[i][j] = gap + dp[i][j - 1]
            elif j == 0:  # most left column
                dp[i][j] = gap + dp[i - 1][j]
            else:
                dp[i][j] = max(dp[i][j - 1] + gap,
                               dp[i - 1][j] + gap,
                               dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1],
                                                             match, mismatch, score_table))

    return dp[n1][n2]
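
# A minimal worked example of the scoring above (hand-traced, not taken from the
# library's tests; it assumes the `_get_score` helper defined elsewhere in this
# module falls back to `match` for equal characters and `mismatch` otherwise when
# `score_table` is empty):
#
#     needleman_wunsch_score('abcd', 'abd')
#     # best global alignment: a-a, b-b, c-gap, d-d  ->  2 + 2 + (-0.5) + 2 = 5.5
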
def damerau_levenshtein_distance(s1, s2):
    """
    Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to
    transform one string into the other, where an operation is defined as an insertion, deletion, or
    substitution of a single character, or a transposition of two adjacent characters.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Damerau-Levenshtein Distance.

    Examples:
        >>> rltk.damerau_levenshtein_distance('abcd', 'acbd')
        1
        >>> rltk.damerau_levenshtein_distance('abbd', 'acad')
        2
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    infinite = n1 + n2

    char_arr = defaultdict(int)
    dp = [[0] * (n2 + 2) for _ in xrange(n1 + 2)]

    dp[0][0] = infinite
    for i in xrange(0, n1 + 1):
        dp[i + 1][0] = infinite
        dp[i + 1][1] = i
    for i in xrange(0, n2 + 1):
        dp[0][i + 1] = infinite
        dp[1][i + 1] = i

    for i in xrange(1, n1 + 1):
        db = 0
        for j in xrange(1, n2 + 1):
            i1 = char_arr[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            dp[i + 1][j + 1] = min(dp[i][j] + cost,
                                   dp[i + 1][j] + 1,
                                   dp[i][j + 1] + 1,
                                   dp[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1))
        char_arr[s1[i - 1]] = i

    return dp[n1 + 1][n2 + 1]
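
# The transposition term `dp[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)` is what
# separates this from plain Levenshtein: a swap of adjacent characters counts as a
# single operation. For example (values follow from the definitions above):
#
#     damerau_levenshtein_distance('abcd', 'acbd')   # 1 (transpose 'b' and 'c')
#     levenshtein_distance('abcd', 'acbd')           # 2 (two substitutions)
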
def soundex(s):
    """
    The standard used for this implementation is provided by
    `U.S. Census Bureau <https://www.archives.gov/research/census/soundex.html>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.soundex('ashcraft')
        'A261'
        >>> rltk.soundex('pineapple')
        'P514'
    """
    utils.check_for_none(s)
    utils.check_for_type(str, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()

    CODES = (
        ('BFPV', '1'),
        ('CGJKQSXZ', '2'),
        ('DT', '3'),
        ('L', '4'),
        ('MN', '5'),
        ('R', '6'),
        ('AEIOUHWY', '.')  # placeholder
    )
    CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)

    sdx = s[0]
    for i in range(1, len(s)):
        if s[i] not in CODE_DICT:
            continue
        code = CODE_DICT[s[i]]
        if code == '.':
            continue
        if s[i] == s[i - 1]:  # ignore same letter
            continue
        if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code:  # 'side-by-side' rule
            continue
        if s[i - 1] in ('H', 'W') and i - 2 > 0 and \
                s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] != '.':  # consonant separators
            continue
        sdx += code

    sdx = sdx[0:4].ljust(4, '0')

    return sdx
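
# Worked trace of the docstring example (hand-traced against the rules above):
# 'ASHCRAFT' -> keep 'A'; S -> '2'; H is a vowel-class placeholder, skipped;
# C -> '2' but is dropped by the consonant-separator branch (previous letter is
# 'H' and the letter before that, 'S', is a coded consonant); R -> '6';
# A skipped; F -> '1'; T -> '3'; 'A2613' is then cut/padded to four chars: 'A261'.
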
def hamming_distance(s1, s2):
    """
    The Hamming distance between two equal-length sequences is the number of
    positions at which the corresponding elements differ.

    Args:
        s1 (str or sequence): Sequence 1.
        s2 (str or sequence): Sequence 2.

    Returns:
        int: Hamming Distance.

    Raises:
        TypeError: If the two sequences are of different types.
        ValueError: If the two sequences have unequal lengths.
    """
    utils.check_for_none(s1, s2)
    # utils.check_for_type(basestring, s1, s2)
    if type(s1) != type(s2):
        raise TypeError('Different type')

    if isinstance(s1, basestring) and isinstance(s2, basestring):
        s1 = utils.unicode_normalize(s1)
        s2 = utils.unicode_normalize(s2)

    if len(s1) != len(s2):
        raise ValueError('Unequal length')

    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
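
# Trivial examples that follow directly from the definition above:
#
#     hamming_distance('toned', 'roses')   # 3 -- positions 0, 2 and 4 differ
#     hamming_distance('abc', 'abcd')      # raises ValueError('Unequal length')
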
def optimal_string_alignment_distance(s1, s2):
    """
    This is a variation of the Damerau-Levenshtein distance that returns the strings' edit distance
    taking into account deletion, insertion, substitution, and transposition, under the condition
    that no substring is edited more than once.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Optimal String Alignment Distance.

    Examples:
        >>> rltk.optimal_string_alignment_distance('abcd', 'acbd')
        1
        >>> rltk.optimal_string_alignment_distance('ca', 'abc')
        3
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)

    dp = [[0] * (n2 + 1) for _ in xrange(n1 + 1)]

    for i in xrange(0, n1 + 1):
        dp[i][0] = i
    for j in xrange(0, n2 + 1):
        dp[0][j] = j

    for i in xrange(1, n1 + 1):
        for j in xrange(1, n2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1

            dp[i][j] = min(dp[i][j - 1] + 1,
                           dp[i - 1][j] + 1,
                           dp[i - 1][j - 1] + cost)
            if (i > 1 and j > 1 and s1[i - 1] == s2[j - 2]
                    and s1[i - 2] == s2[j - 1]):
                dp[i][j] = min(dp[i][j], dp[i - 2][j - 2] + cost)

    return dp[n1][n2]
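
# The "no substring is edited more than once" restriction is what distinguishes
# this from the unrestricted Damerau-Levenshtein distance defined earlier; the
# classic example is the docstring's second case:
#
#     optimal_string_alignment_distance('ca', 'abc')   # 3
#     damerau_levenshtein_distance('ca', 'abc')        # 2 ('ca' -> 'ac' -> 'abc')
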
def _jaro_distance(s1, s2):
    """
    Internal helper computing the Jaro measure of two sequences: the average of
    the fraction of matched characters in each string and the fraction of matches
    that are not transposed. Despite the name, a higher value means more similar;
    identical strings score 1.0 and strings with no characters in common score 0.0.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Jaro measure in the range [0, 1].
    """
    # code from https://github.com/nap/jaro-winkler-distance
    # Copyright Jean-Bernard Ratte
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    shorter, longer = s1.lower(), s2.lower()
    if len(s1) > len(s2):
        longer, shorter = shorter, longer

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    if len(m1) == 0 or len(m2) == 0:
        return 0.0

    return (float(len(m1)) / len(shorter) +
            float(len(m2)) / len(longer) +
            float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0
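
# Two boundary cases that follow from the formula above (assuming the
# `_get_matching_characters` helper defined elsewhere in this module returns every
# character for identical strings and nothing for disjoint ones):
#
#     _jaro_distance('dwayne', 'dwayne')   # 1.0
#     _jaro_distance('abc', 'xyz')         # 0.0 -- short-circuits before dividing
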
def _metaphone(s):
    """
    Metaphone fundamentally improves on the Soundex algorithm by using information about variations and
    inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does
    a better job of matching words and names which sound similar. As with Soundex, similar-sounding words
    should share the same keys. Metaphone is available as a built-in operator in a number of systems.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.metaphone('ashcraft')
        'AXKRFT'
        >>> rltk.metaphone('pineapple')
        'PNPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.
    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.lower()
    result = []

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0
    while i < len(s):
        c = s[i]
        next = s[i + 1] if i < len(s) - 1 else '*****'
        nextnext = s[i + 2] if i < len(s) - 2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            if i == 0 or s[i - 1] == ' ':
                result.append(c)
        elif c == 'b':
            if (not (i != 0 and s[i - 1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            if i == 0 or next in 'aeiou' or s[i - 1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            if i == 0 or s[i - 1] != 'c':
                result.append('k')
        elif c == 'p':
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            if i == 0 and next == 'h':
                i += 1
            if nextnext in 'aeiou' or nextnext == '*****':
                result.append('w')
        elif c == 'x':
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()
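
# Example of the 'ph' -> 'f' rule above (hand-traced against this code, not taken
# from the library's tests; the public rltk.metaphone wrapper is assumed to
# delegate to this helper):
#
#     _metaphone('phone')   # 'FN' -- 'ph' collapses to 'f', vowels after the
#                           # first position are dropped, 'n' is kept
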
def levenshtein_distance(s1, s2, insert={}, delete={}, substitute={},
                         insert_default=1, delete_default=1, substitute_default=1):
    """
    The Levenshtein distance between two words is the minimum number of single-character edits
    (insertions, deletions or substitutions) required to change one word into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        insert (dict(str, int), optional): Insert cost of characters. Defaults to empty dict.
        delete (dict(str, int), optional): Delete cost of characters. Defaults to empty dict.
        substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to empty dict.
        insert_default (int, optional): Default value of insert cost. Defaults to 1.
        delete_default (int, optional): Default value of delete cost. Defaults to 1.
        substitute_default (int, optional): Default value of substitute cost. Defaults to 1.

    Returns:
        int: Levenshtein Distance.

    Examples:
        >>> rltk.levenshtein_distance('ab', 'abc')
        1
        >>> rltk.levenshtein_distance('a', 'abc', insert = {'c':50},
        ... insert_default=100, delete_default=100, substitute_default=100)
        150
    """
    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # if n1 == 0 or n2 == 0:
    #     return max(n1, n2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in xrange(n1 + 1):
        for j in xrange(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # most top row
                c = s2[j - 1]
                dp[i][j] = insert[c] if c in insert else insert_default
                dp[i][j] += dp[i][j - 1]
            elif j == 0:  # most left column
                c = s1[i - 1]
                dp[i][j] = delete[c] if c in delete else delete_default
                dp[i][j] += dp[i - 1][j]
            else:
                c1, c2 = s1[i - 1], s2[j - 1]
                insert_cost = insert[c2] if c2 in insert else insert_default
                delete_cost = delete[c1] if c1 in delete else delete_default
                substitute_cost = substitute[c1][c2] \
                    if c1 in substitute and c2 in substitute[c1] else substitute_default

                if c1 == c2:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1] + insert_cost,
                                   dp[i - 1][j] + delete_cost,
                                   dp[i - 1][j - 1] + substitute_cost)

    return dp[n1][n2]
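
# Breakdown of the docstring's weighted example: turning 'a' into 'abc' needs two
# insertions; 'b' is not in the `insert` table so it costs insert_default (100),
# while 'c' takes its table value (50), giving 100 + 50 = 150.
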
def _nysiis(s):
    """
    New York State Immunization Information System (NYSIIS) Phonetic Code is a phonetic algorithm created by
    `The New York State Department of Health's (NYSDOH) Bureau of Immunization
    <https://www.health.ny.gov/prevention/immunization/information_system/>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.nysiis('ashcraft')
        'ASCRAFT'
        >>> rltk.nysiis('pineapple')
        'PANAPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.
    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith('MAC'):
        s = 'MCC' + s[3:]
    elif s.startswith('KN'):
        s = s[1:]
    elif s.startswith('K'):
        s = 'C' + s[1:]
    elif s.startswith(('PH', 'PF')):
        s = 'FF' + s[2:]
    elif s.startswith('SCH'):
        s = 'SSS' + s[3:]

    # step 2 - suffixes
    if s.endswith(('IE', 'EE')):
        s = s[:-2] + 'Y'
    elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V':
            ch = 'AF'
            i += 1
        elif ch in 'AEIOU':
            ch = 'A'
        elif ch == 'Q':
            ch = 'G'
        elif ch == 'Z':
            ch = 'S'
        elif ch == 'M':
            ch = 'N'
        elif ch == 'K':
            if i + 1 < len(s) and s[i + 1] == 'N':
                ch = 'N'
            else:
                ch = 'C'
        elif ch == 'S' and s[i + 1:i + 3] == 'CH':
            ch = 'SS'
            i += 2
        elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H':
            ch = 'F'
            i += 1
        elif ch == 'H' and (s[i - 1] not in 'AEIOU' or (i + 1 < len(s) and s[i + 1] not in 'AEIOU')):
            if s[i - 1] in 'AEIOU':
                ch = 'A'
            else:
                ch = s[i - 1]
        elif ch == 'W' and s[i - 1] in 'AEIOU':
            ch = s[i - 1]

        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = ''.join(key)

    # step 5 - remove trailing S
    if key.endswith('S') and key != 'S':
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith('AY'):
        key = key[:-2] + 'Y'

    # step 7 - remove trailing A
    if key.endswith('A') and key != 'A':
        key = key[:-1]

    # step 8 was already done

    return key
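
# Worked example of the step 1 prefix rule (hand-traced against the code above,
# not taken from the library's tests): 'KNIGHT' drops the leading 'K' ('KN'
# prefix), the vowel 'I' collapses to 'A', and the 'H' after 'G' is absorbed
# into the preceding consonant:
#
#     _nysiis('knight')   # 'NAGT'
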