def get_sim_score(self, string1, string2):
    """Return the Hamming similarity of two strings, normalized by length.

    The raw score obtained from ``self.get_raw_score`` is divided by the
    length of the first string and subtracted from 1. When that length is
    zero the similarity is defined as 1.0.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Normalized Hamming similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length.

    Examples:
        >>> hd = HammingDistance()
        >>> hd.get_sim_score('', '')
        1.0
        >>> hd.get_sim_score('alex', 'john')
        0.0
        >>> hd.get_sim_score(' ', 'a')
        0.0
        >>> hd.get_sim_score('JOHN', 'john')
        0.0

    """
    # convert input to unicode.
    first = utils.convert_to_unicode(string1)
    second = utils.convert_to_unicode(string2)

    # The raw score is requested before the length is inspected so that any
    # error raised while scoring surfaces ahead of the empty-input shortcut.
    distance = self.get_raw_score(first, second)

    length = len(first)
    if not length:
        return 1.0
    return 1 - (distance / length)
def get_raw_score(self, string1, string2):
    """Return the raw Levenshtein distance between two strings.

    Inputs are validated, converted to unicode and, unless the exact-match
    check succeeds, handed to ``levenshtein``. On an exact match the
    shortcut value ``0.0`` is returned without calling ``levenshtein``.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Levenshtein distance (int); ``0.0`` on the exact-match shortcut.

    Raises:
        TypeError : If the inputs are not strings.

    Examples:
        >>> lev = Levenshtein()
        >>> lev.get_raw_score('a', '')
        1
        >>> lev.get_raw_score('example', 'samples')
        3
        >>> lev.get_raw_score('levenshtein', 'frankenstein')
        6

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    source = utils.convert_to_unicode(string1)
    target = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(source, target)

    identical = utils.sim_check_for_exact_match(source, target)
    return 0.0 if identical else levenshtein(source, target)
def get_sim_score(self, string1, string2):
    """Return the Levenshtein similarity of two strings, normalized by length.

    The raw score from ``self.get_raw_score`` is divided by the length of
    the longer string and subtracted from 1. When both strings are empty
    the similarity is defined as 1.0.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Normalized Levenshtein similarity (float).

    Raises:
        TypeError : If the inputs are not strings.

    Examples:
        >>> lev = Levenshtein()
        >>> lev.get_sim_score('a', '')
        0.0
        >>> lev.get_sim_score('example', 'samples')
        0.5714285714285714
        >>> lev.get_sim_score('levenshtein', 'frankenstein')
        0.5

    """
    # convert input strings to unicode.
    first = utils.convert_to_unicode(string1)
    second = utils.convert_to_unicode(string2)

    # Scoring happens before the lengths are inspected so that any error it
    # raises surfaces ahead of the empty-input shortcut.
    distance = self.get_raw_score(first, second)

    longest = max(len(first), len(second))
    if not longest:
        return 1.0
    return 1 - (distance / longest)
def get_raw_score(self, string1, string2):
    """Computes the raw Needleman-Wunsch score between two strings.

    A dynamic-programming table of size ``(len(string1) + 1) x
    (len(string2) + 1)`` is filled in; each cell holds the best score of
    aligning the corresponding prefixes, where a gap costs
    ``self.gap_cost`` and aligning two characters scores
    ``self.sim_func(char1, char2)``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Needleman-Wunsch similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> nw = NeedlemanWunsch()
        >>> nw.get_raw_score('dva', 'deeva')
        1.0
        >>> nw = NeedlemanWunsch(gap_cost=0.0)
        >>> nw.get_raw_score('dva', 'deeve')
        2.0
        >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        >>> nw.get_raw_score('dva', 'deeve')
        1.0
        >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
        2.5

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    len1, len2 = len(string1), len(string2)
    gap_cost = self.gap_cost
    sim_func = self.sim_func

    # Use the builtin ``float`` as dtype: the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    dist_mat = np.zeros((len1 + 1, len2 + 1), dtype=float)

    # DP initialization: aligning a prefix against the empty string costs
    # one gap per character.
    dist_mat[:, 0] = -(np.arange(len1 + 1) * gap_cost)
    dist_mat[0, :] = -(np.arange(len2 + 1) * gap_cost)

    # Needleman-Wunsch DP calculation
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            match = dist_mat[i - 1, j - 1] + sim_func(string1[i - 1],
                                                      string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            dist_mat[i, j] = max(match, delete, insert)

    # The bottom-right cell holds the score of aligning the full strings.
    return dist_mat[len1, len2]
def get_raw_score(self, string1, string2):
    """Computes the raw Smith-Waterman score between two strings.

    A dynamic-programming table is filled in where every cell is clamped
    at zero, a gap costs ``self.gap_cost`` and aligning two characters
    scores ``self.sim_func(char1, char2)``. The result is the largest
    value that appears anywhere in the table.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Smith-Waterman similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> sw = SmithWaterman()
        >>> sw.get_raw_score('cat', 'hat')
        2.0
        >>> sw = SmithWaterman(gap_cost=2.2)
        >>> sw.get_raw_score('dva', 'deeve')
        1.0
        >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
        >>> sw.get_raw_score('dva', 'deeve')
        2.0
        >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
        6.5

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    len1, len2 = len(string1), len(string2)
    gap_cost = self.gap_cost
    sim_func = self.sim_func

    # Use the builtin ``float`` as dtype: the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    dist_mat = np.zeros((len1 + 1, len2 + 1), dtype=float)

    # Start from a float so the documented float return type also holds
    # when no cell ever exceeds zero (e.g. an empty input string).
    max_value = 0.0

    # Smith Waterman DP calculations
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            match = dist_mat[i - 1, j - 1] + sim_func(string1[i - 1],
                                                      string2[j - 1])
            delete = dist_mat[i - 1, j] - gap_cost
            insert = dist_mat[i, j - 1] - gap_cost
            # A local alignment may restart anywhere, hence the floor at 0.
            cell = max(0, match, delete, insert)
            dist_mat[i, j] = cell
            max_value = max(max_value, cell)

    return max_value
def get_raw_score(self, string1, string2):
    """Return the raw Jaro-Winkler score of two strings.

    The Jaro score of the inputs is boosted according to the length of
    their common prefix (at most 4 characters) and ``self.prefix_weight``.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro-Winkler similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jw = JaroWinkler()
        >>> jw.get_raw_score('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jw.get_raw_score('DWAYNE', 'DUANE')
        0.84
        >>> jw.get_raw_score('DIXON', 'DICKSONX')
        0.8133333333333332

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    first = utils.convert_to_unicode(string1)
    second = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(first, second)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(first, second):
        return 0

    score = Jaro().get_raw_score(first, second)

    # Count the leading characters shared by both strings; only the first
    # four positions are considered.
    prefix_len = 0
    for ch1, ch2 in zip(first[:4], second[:4]):
        if ch1 != ch2:
            break
        prefix_len += 1

    if prefix_len:
        score += prefix_len * self.prefix_weight * (1 - score)

    return score
def get_raw_score(self, string1, string2):
    """Return the raw Hamming distance between two strings.

    The distance is the number of positions at which the two (equally
    long) strings hold different characters.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Hamming distance (int).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.
        ValueError : If the input strings are not of same length.

    Examples:
        >>> hd = HammingDistance()
        >>> hd.get_raw_score('', '')
        0
        >>> hd.get_raw_score('alex', 'john')
        4
        >>> hd.get_raw_score(' ', 'a')
        1
        >>> hd.get_raw_score('JOHN', 'john')
        4

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    first = utils.convert_to_unicode(string1)
    second = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(first, second)

    # for Hamming Distance string length should be same
    utils.sim_check_for_same_len(first, second)

    # count the positions whose characters differ
    return sum(1 for ch1, ch2 in zip(first, second) if ch1 != ch2)
def get_raw_score(self, string1, string2):
    """
    Return the Fuzzy Wuzzy ratio raw score of two strings.

    The score is ``SequenceMatcher.ratio()`` of the two inputs scaled to
    the range [0,100] and rounded to the nearest integer.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = Ratio()
        >>> s.get_raw_score('Robert', 'Rupert')
        67
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        71

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    matcher = SequenceMatcher(None,
                              utils.convert_to_unicode(string1),
                              utils.convert_to_unicode(string2))
    percentage = 100 * matcher.ratio()
    return int(round(percentage))
def get_raw_score(self, string1, string2):
    """Return the affine gap score of two strings.

    After validation and unicode conversion the work is delegated to
    ``affine`` together with this object's ``gap_start``,
    ``gap_continuation`` and ``sim_func``. This score can be outside the
    range [0,1].

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score betwen the two input strings (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(left, right)

    # only score when neither string is empty; otherwise the result is 0
    if not utils.sim_check_for_empty(left, right):
        return affine(left, right, self.gap_start, self.gap_continuation,
                      self.sim_func)
    return 0
def get_raw_score(self, string1, string2):
    """Return the raw Needleman-Wunsch score of two strings.

    After validation and unicode conversion the work is delegated to
    ``needleman_wunsch`` together with this object's ``gap_cost`` and
    ``sim_func``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Needleman-Wunsch similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> nw = NeedlemanWunsch()
        >>> nw.get_raw_score('dva', 'deeva')
        1.0
        >>> nw = NeedlemanWunsch(gap_cost=0.0)
        >>> nw.get_raw_score('dva', 'deeve')
        2.0
        >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
        >>> nw.get_raw_score('dva', 'deeve')
        1.0
        >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
        >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
        2.5

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(left, right)

    # the similarity score is computed by the cython function
    score = needleman_wunsch(left, right, self.gap_cost, self.sim_func)
    return score
def get_raw_score(self, string1, string2):
    """Return the raw Smith-Waterman score of two strings.

    After validation and unicode conversion the work is delegated to
    ``smith_waterman`` together with this object's ``gap_cost`` and
    ``sim_func``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Smith-Waterman similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> sw = SmithWaterman()
        >>> sw.get_raw_score('cat', 'hat')
        2.0
        >>> sw = SmithWaterman(gap_cost=2.2)
        >>> sw.get_raw_score('dva', 'deeve')
        1.0
        >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
        >>> sw.get_raw_score('dva', 'deeve')
        2.0
        >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
        6.5

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(left, right)

    # the similarity score is computed by the cython function
    score = smith_waterman(left, right, self.gap_cost, self.sim_func)
    return score
def get_raw_score(self, string1, string2):
    """Score two strings with the Smith-Waterman measure.

    The inputs are checked for ``None``, converted to unicode and checked
    to be strings; the score itself comes from ``smith_waterman``, which
    receives ``self.gap_cost`` and ``self.sim_func``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Smith-Waterman similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> sw = SmithWaterman()
        >>> sw.get_raw_score('cat', 'hat')
        2.0
        >>> sw = SmithWaterman(gap_cost=2.2)
        >>> sw.get_raw_score('dva', 'deeve')
        1.0
        >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1))
        >>> sw.get_raw_score('dva', 'deeve')
        2.0
        >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
        >>> sw.get_raw_score('GCATAGCU', 'GATTACA')
        6.5

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    unicode1 = utils.convert_to_unicode(string1)
    unicode2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(unicode1, unicode2)

    # Smith-Waterman similarity score from the cython function
    return smith_waterman(unicode1, unicode2, self.gap_cost, self.sim_func)
def get_raw_score(self, string1, string2):
    """Return the raw Jaro score of two strings.

    After validation and unicode conversion the work is delegated to
    ``jaro``; if the empty-input check succeeds, 0 is returned instead.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro = Jaro()
        >>> jaro.get_raw_score('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro.get_raw_score('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro.get_raw_score('DIXON', 'DICKSONX')
        0.7666666666666666

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(left, right)

    # only score when neither string is empty; otherwise the result is 0
    if not utils.sim_check_for_empty(left, right):
        return jaro(left, right)
    return 0
def get_raw_score(self, string1, string2):
    """Return the raw Jaro-Winkler score of two strings.

    After validation and unicode conversion the work is delegated to
    ``jaro_winkler`` together with ``self.prefix_weight``; if the
    empty-input check succeeds, 0 is returned instead.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro-Winkler similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jw = JaroWinkler()
        >>> jw.get_raw_score('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jw.get_raw_score('DWAYNE', 'DUANE')
        0.84
        >>> jw.get_raw_score('DIXON', 'DICKSONX')
        0.8133333333333332

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(left, right)

    # only score when neither string is empty; otherwise the result is 0
    if not utils.sim_check_for_empty(left, right):
        return jaro_winkler(left, right, self.prefix_weight)
    return 0
def get_raw_score(self, string1, string2):
    """Computes the affine gap score between two strings.

    Three dynamic-programming tables are maintained: ``m`` (the last
    characters of both prefixes are aligned to each other), ``x`` (the
    last character of ``string1`` is aligned to a gap) and ``y`` (the last
    character of ``string2`` is aligned to a gap). Opening a gap costs
    ``self.gap_start`` and extending one costs ``self.gap_continuation``.
    This score can be outside the range [0,1].

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score betwen the two input strings (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Penalties are negated once so they can simply be added to a score.
    gap_start = -self.gap_start
    gap_continuation = -self.gap_continuation
    sim_func = self.sim_func

    len1, len2 = len(string1), len(string2)
    neg_inf = -float("inf")

    # Use the builtin ``float`` as dtype: the ``np.float`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    shape = (len1 + 1, len2 + 1)
    m = np.zeros(shape, dtype=float)
    x = np.zeros(shape, dtype=float)
    y = np.zeros(shape, dtype=float)

    # DP initialization (first column): a prefix of string1 against the
    # empty string is a single gap of growing length, tracked in x only.
    m[1:, 0] = neg_inf
    x[1:, 0] = gap_start + np.arange(len1) * gap_continuation
    y[1:, 0] = neg_inf

    # DP initialization (first row): the mirror case, tracked in y only.
    m[0, 1:] = neg_inf
    x[0, 1:] = neg_inf
    y[0, 1:] = gap_start + np.arange(len2) * gap_continuation

    # affine gap calculation using DP
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            # best score between x_1....x_i and y_1....y_j
            # given that x_i is aligned to y_j
            m[i, j] = (sim_func(string1[i - 1], string2[j - 1]) +
                       max(m[i - 1, j - 1],
                           x[i - 1, j - 1],
                           y[i - 1, j - 1]))

            # the best score given that x_i is aligned to a gap
            x[i, j] = max(gap_start + m[i - 1, j],
                          gap_continuation + x[i - 1, j])

            # the best score given that y_j is aligned to a gap
            y[i, j] = max(gap_start + m[i, j - 1],
                          gap_continuation + y[i, j - 1])

    return max(m[len1, len2], x[len1, len2], y[len1, len2])
def get_raw_score(self, string1, string2):
    """
    Return the Fuzzy Wuzzy partial ratio raw score of two strings.

    The shorter string is compared against windows of the longer string
    that have the shorter string's length; the windows are positioned
    using the matching blocks reported by ``SequenceMatcher``. The best
    window ratio is scaled to the range [0,100] and rounded.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Partial Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialRatio()
        >>> s.get_raw_score('Robert Rupert', 'Rupert')
        100
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        86

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    shorter = utils.convert_to_unicode(string1)
    longer = utils.convert_to_unicode(string2)

    # make sure ``shorter`` really is the shorter of the two strings
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    window = len(shorter)
    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    ratios = []
    for block in blocks:
        # Position the window in the longer string so that the block lines
        # up with its counterpart in the shorter string, never starting
        # before index 0.
        start = max(block[1] - block[0], 0)
        candidate = longer[start:start + window]

        ratio = SequenceMatcher(None, shorter, candidate).ratio()
        if ratio > .995:
            # close enough to a perfect match; no need to look further
            return 100
        ratios.append(ratio)

    return int(round(100 * max(ratios)))
def get_raw_score(self, string1, string2):
    """Return the raw Jaro score of two strings.

    Characters of the two strings are paired when they are equal and lie
    within a search range derived from the longer string's length. The
    score combines the share of paired characters in each string with the
    share of paired characters that appear in the same order.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro = Jaro()
        >>> jaro.get_raw_score('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro.get_raw_score('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro.get_raw_score('DIXON', 'DICKSONX')
        0.7666666666666666

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    first = utils.convert_to_unicode(string1)
    second = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(first, second)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(first, second):
        return 0

    n1, n2 = len(first), len(second)

    # how far apart two equal characters may be and still be paired
    window = max((max(n1, n2) // 2) - 1, 0)

    paired1 = [False] * n1
    paired2 = [False] * n2
    pairs = 0

    for i, ch in enumerate(first):
        lo = max(i - window, 0)
        hi = min(i + window, n2 - 1)
        for j in xrange(lo, hi + 1):
            # take the first still-unpaired equal character in the window
            if second[j] == ch and not paired2[j]:
                paired1[i] = paired2[j] = True
                pairs += 1
                break

    if not pairs:
        return 0

    # Compare the paired characters of both strings in order of appearance;
    # every position where they disagree is half a transposition.
    seq1 = [ch for ch, hit in zip(first, paired1) if hit]
    seq2 = [ch for ch, hit in zip(second, paired2) if hit]
    transpositions = sum(1 for a, b in zip(seq1, seq2) if a != b)
    transpositions /= 2

    pairs = float(pairs)
    total = pairs / n1 + pairs / n2 + (pairs - transpositions) / pairs
    return total / 3
def get_raw_score(self, string1, string2):
    """Computes the affine gap score between two strings.

    The score is built with three dynamic-programming tables: ``m`` for
    alignments ending in a character-to-character pairing, ``x`` for
    alignments ending with a character of ``string1`` against a gap, and
    ``y`` for alignments ending with a character of ``string2`` against a
    gap. Opening a gap costs ``self.gap_start``; each further gap position
    costs ``self.gap_continuation``. This score can be outside the range
    [0,1].

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score betwen the two input strings (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4

    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # Negate the penalties once so they can be added directly below.
    gap_start = -self.gap_start
    gap_continuation = -self.gap_continuation

    rows = len(string1) + 1
    cols = len(string2) + 1
    neg_inf = -float("inf")

    # ``dtype=float`` replaces ``np.float``; that alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    m = np.zeros((rows, cols), dtype=float)
    x = np.zeros((rows, cols), dtype=float)
    y = np.zeros((rows, cols), dtype=float)

    # DP initialization: against an empty prefix of string2 only a gap in
    # x is possible, so m and y are ruled out with -inf.
    for i in range(1, rows):
        m[i, 0] = neg_inf
        x[i, 0] = gap_start + (i - 1) * gap_continuation
        y[i, 0] = neg_inf

    # DP initialization: the mirror case for an empty prefix of string1.
    for j in range(1, cols):
        m[0, j] = neg_inf
        x[0, j] = neg_inf
        y[0, j] = gap_start + (j - 1) * gap_continuation

    # affine gap calculation using DP
    for i in range(1, rows):
        for j in range(1, cols):
            # best score between x_1....x_i and y_1....y_j
            # given that x_i is aligned to y_j
            m[i, j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                       max(m[i - 1, j - 1],
                           x[i - 1, j - 1],
                           y[i - 1, j - 1]))

            # the best score given that x_i is aligned to a gap
            x[i, j] = max(gap_start + m[i - 1, j],
                          gap_continuation + x[i - 1, j])

            # the best score given that y_j is aligned to a gap
            y[i, j] = max(gap_start + m[i, j - 1],
                          gap_continuation + y[i, j - 1])

    last_row, last_col = rows - 1, cols - 1
    return max(m[last_row, last_col],
               x[last_row, last_col],
               y[last_row, last_col])
def bert_embed(bert_model, string):
    """Encode ``string`` with ``bert_model`` while gradients are disabled.

    Args:
        bert_model: Object exposing an ``encode`` method
            (presumably a sentence-embedding model; confirm against callers).
        string: Text to encode; it is passed through
            ``utils.convert_to_unicode`` first.

    Returns:
        Whatever ``bert_model.encode`` returns for the converted text.
    """
    # no_grad: the encoding is computed without autograd tracking
    with torch.no_grad():
        text = utils.convert_to_unicode(string)
        embedding = bert_model.encode(text)
    return embedding