def monge_elkan(bag1, bag2, sim_func=jaro_winkler):
    """
    Compute Monge-Elkan similarity measure between two bags (lists).

    The Monge-Elkan similarity measure is a hybrid similarity measure that
    combines the benefits of sequence-based and set-based methods. It is
    useful in domains where more control is needed over the similarity
    measure. Each element of ``bag1`` is scored against every element of
    ``bag2`` with a secondary similarity function, the best score per
    element is kept, and those best scores are averaged over ``bag1``.

    Args:
        bag1,bag2 (list): Input lists
        sim_func (function): Secondary similarity function. This is expected
            to be a sequence-based similarity measure (defaults to
            jaro_winkler)

    Returns:
        Monge-Elkan similarity score (float)

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None

    Examples:
        >>> monge_elkan(['Niall'], ['Neal'])
        0.8049999999999999
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        0.8677218614718616
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
        2.0
        >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
        2.25
        >>> monge_elkan([''], ['a'])
        0.0
        >>> monge_elkan(['Niall'], ['Nigel'])
        0.7866666666666667

    References:
        * Principles of Data Integration book
    """
    # Validate inputs before doing any work.
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # Identical bags score 1.0 without consulting sim_func.
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # An empty bag scores 0.
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # Accumulate, for every element of bag1, its best score against bag2.
    # The -inf seed keeps the comparison order of a running max().
    total = 0
    for token1 in bag1:
        total += max([float('-inf')] +
                     [sim_func(token1, token2) for token2 in bag2])

    return float(total) / float(len(bag1))
def get_raw_score(self, bag1, bag2):
    """Computes the raw Monge-Elkan score between two bags (lists).

    Every element of ``bag1`` is compared with every element of ``bag2``
    using ``self.sim_func``; the best score per ``bag1`` element is summed
    and the sum is divided by the number of elements in ``bag1``.

    Args:
        bag1,bag2 (list): Input lists.

    Returns:
        Monge-Elkan similarity score (float).

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> me = MongeElkan()
        >>> me.get_raw_score(['Niall'], ['Neal'])
        0.8049999999999999
        >>> me.get_raw_score(['Niall'], ['Nigel'])
        0.7866666666666667
        >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        0.8677218614718616
        >>> me.get_raw_score([''], ['a'])
        0.0
        >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
        >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        2.0
        >>> me = MongeElkan(sim_func=Affine().get_raw_score)
        >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        2.25

    References:
        * Principles of Data Integration book
    """
    # Validate inputs.
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # Short-circuit: identical bags.
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # Short-circuit: an empty bag.
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    best_score_total = 0
    for left in bag1:
        best = float('-inf')
        for right in bag2:
            candidate = self.sim_func(left, right)
            best = max(best, candidate)
        best_score_total += best

    # Average of the per-element best scores over bag1.
    return float(best_score_total) / float(len(bag1))
def get_raw_score(self, set1, set2):
    """
    Computes the Tversky index similarity between two sets.

    The Tversky index is an asymmetric similarity measure on sets that
    compares a variant to a prototype. The Tversky index can be seen as a
    generalization of Dice's coefficient and Tanimoto coefficient.

    For sets X and Y the Tversky index is a number between 0 and 1 given by:
    :math:`tversky_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
    where, :math:`\\alpha, \\beta >= 0`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Tversky index similarity (float). When the two inputs share no
        element the result is 0.0, including the case where ``alpha`` and
        ``beta`` are both 0 and the formula's denominator would be zero.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> tvi = TverskyIndex()
        >>> tvi.get_raw_score(['data', 'science'], ['data'])
        0.6666666666666666
        >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
        0.5
        >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.5454545454545454
        >>> tvi = TverskyIndex(0.5, 0.5)
        >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.5454545454545454
        >>> tvi = TverskyIndex(beta=0.5)
        >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
        0.5
    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # if exact match return 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # if one of the inputs is empty return 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    intersection = float(len(set1 & set2))

    # With nothing in common the index is 0.0. Returning here also avoids a
    # ZeroDivisionError when alpha and beta are both 0, which would make the
    # denominator 0 + 0 + 0.
    if not intersection:
        return 0.0

    only_in_set1 = len(set1 - set2)
    only_in_set2 = len(set2 - set1)
    return 1.0 * intersection / (intersection +
                                 (self.alpha * only_in_set1) +
                                 (self.beta * only_in_set2))
def get_word_vector_similarities_simple(self, bag1, bag2):
    """Builds a vector of best soft-match scores over the corpus vocabulary.

    For each distinct term of ``bag1``, the highest ``self.sim_func`` score
    against the distinct terms of ``bag2`` is recorded, provided it is
    strictly greater than ``self.threshold`` (and greater than 0.0). The
    result has one slot per key of the stored document-frequency mapping,
    in that mapping's iteration order; a slot holds the recorded score of
    its term, or 0 when the term has no recorded score.

    Args:
        bag1,bag2 (list): Input lists.

    Returns:
        numpy array of scores, one per document-frequency key. The early
        exits return the scalars 1.0 (exact match) or 0 (empty input)
        instead of an array.
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # identical inputs short-circuit to 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # an empty input short-circuits to 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # distinct terms of each input (Counter keys)
    terms_x = collections.Counter(bag1)
    terms_y = collections.Counter(bag2)

    # corpus statistics stored on the instance; only the document-frequency
    # mapping is used below, the corpus size is read but not needed here
    curr_df = self.__document_frequency
    corpus_size = self.__corpus_size

    # best above-threshold score for every term of bag1
    best_scores = {}
    for term_x in terms_x:
        best_so_far = 0.0
        for term_y in terms_y:
            score = self.sim_func(term_x, term_y)
            if score > self.threshold and score > best_so_far:
                best_scores[term_x] = score
                best_so_far = score

    # one slot per vocabulary term; unmatched terms keep their initial zero
    word_similarities_vector = np.zeros(len(curr_df))
    for idx, element in enumerate(curr_df.keys()):
        if element in best_scores:
            word_similarities_vector[idx] = best_scores[element]

    return word_similarities_vector
def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
    """
    Computes the Fuzzy Wuzzy token sort measure raw score between two
    strings. This score is in the range [0,100].

    Both strings are passed through ``self._process_string_and_sort`` and
    the results are compared with the ``Ratio`` measure.

    Args:
        string1,string2 (str), : Input strings
        force_ascii (boolean) : Flag to remove non-ascii characters or not
        full_process (boolean) : Flag to process the string or not.
            Processing includes removing non alphanumeric characters,
            converting string to lower case and removing leading and
            trailing whitespaces.

    Returns:
        Token Sort measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = TokenSort()
        >>> s.get_raw_score('great is scala', 'java is great')
        81
        >>> s.get_raw_score('Sue', 'sue')
        100
        >>> s.get_raw_score('C++ and Java', 'Java and Python')
        64

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # normalise each input, first string first
    normalised = [
        self._process_string_and_sort(text, force_ascii,
                                      full_process=full_process)
        for text in (string1, string2)
    ]

    return Ratio().get_raw_score(normalised[0], normalised[1])
def get_raw_score(self, string1, string2):
    """Computes the raw Jaro-Winkler score between two strings.

    The Jaro score of the two strings is boosted according to the length
    of their common prefix (at most 4 characters), scaled by
    ``self.prefix_weight``.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro-Winkler similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jw = JaroWinkler()
        >>> jw.get_raw_score('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jw.get_raw_score('DWAYNE', 'DUANE')
        0.84
        >>> jw.get_raw_score('DIXON', 'DICKSONX')
        0.8133333333333332
    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode before the string type check
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    score = Jaro().get_raw_score(string1, string2)

    # the rewarded prefix is capped at 4 characters and at the shorter string
    prefix_cap = min(len(string1), len(string2), 4)

    prefix_len = 0
    for ch1, ch2 in zip(string1[:prefix_cap], string2[:prefix_cap]):
        if ch1 != ch2 or not ch1:
            break
        prefix_len += 1

    if prefix_len:
        score += prefix_len * self.prefix_weight * (1 - score)

    return score
def overlap_coefficient(set1, set2):
    """
    Computes the overlap coefficient between two sets.

    The overlap coefficient is a similarity measure related to the Jaccard
    measure that measures the overlap between two sets, and is defined as
    the size of the intersection divided by the smaller of the size of the
    two sets.

    For two sets X and Y, the overlap coefficient is:
        :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Overlap coefficient (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> overlap_coefficient([], [])
        1.0
        >>> overlap_coefficient([], ['data'])
        0
        >>> overlap_coefficient(['data', 'science'], ['data'])
        1.0

    References:
        * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
        * Simmetrics library
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    set1 = set1 if isinstance(set1, set) else set(set1)
    set2 = set2 if isinstance(set2, set) else set(set2)

    shared_count = len(set1 & set2)
    smaller_size = min(len(set1), len(set2))
    return float(shared_count) / smaller_size
def jaro_winkler(string1, string2, prefix_weight=0.1):
    """
    Computes the Jaro-Winkler measure between two strings.

    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score, but share a prefix and thus are likely to match.

    Args:
        string1,string2 (str): Input strings
        prefix_weight (float): Weight to give the prefix (defaults to 0.1)

    Returns:
        Jaro-Winkler measure (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jaro_winkler('DWAYNE', 'DUANE')
        0.84
        >>> jaro_winkler('DIXON', 'DICKSONX')
        0.8133333333333332
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    base_score = jaro(string1, string2)

    # only the first 4 characters (or fewer, for short strings) can count
    # towards the shared prefix
    shorter_len = min(len(string1), len(string2))
    max_prefix = min(shorter_len, 4)

    shared = 0
    while shared < max_prefix:
        if string1[shared] != string2[shared] or not string1[shared]:
            break
        shared += 1

    if not shared:
        return base_score

    # move the score towards 1 in proportion to the shared prefix length
    return base_score + shared * prefix_weight * (1 - base_score)
def jaccard(set1, set2):
    """
    Computes the Jaccard measure between two sets.

    The Jaccard measure, also known as the Jaccard similarity coefficient,
    is a statistic used for comparing the similarity and diversity of
    sample sets. The Jaccard coefficient measures similarity between finite
    sample sets, and is defined as the size of the intersection divided by
    the size of the union of the sample sets.

    For two sets X and Y, the Jaccard measure is:
        :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Jaccard similarity (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> jaccard(['data', 'science'], ['data'])
        0.5
        >>> jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.375
        >>> jaccard(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    set1 = set1 if isinstance(set1, set) else set(set1)
    set2 = set2 if isinstance(set2, set) else set(set2)

    shared_count = float(len(set1 & set2))
    union_count = float(len(set1 | set2))
    return shared_count / union_count
def cosine(set1, set2):
    """
    Computes the cosine similarity between two sets.

    For two sets X and Y, the cosine similarity is:
        :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Cosine similarity (float). 1.0 when the inputs match exactly and
        0.0 when one of them is empty.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> cosine(['data', 'science'], ['data'])
        0.7071067811865475
        >>> cosine(['data', 'data', 'science'], ['data', 'management'])
        0.4999999999999999
        >>> cosine([], ['data'])
        0.0

    References:
        * String similarity joins: An Experimental Evaluation (VLDB 2014)
        * Project flamingo : Mike carey, Vernica
    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # if exact match return 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # if one of the inputs is empty return 0.0 -- a float, so the return
    # type is consistent and the documented example above holds
    if utils.sim_check_for_empty(set1, set2):
        return 0.0

    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    # the two square roots are taken separately and then multiplied; the
    # documented values depend on this exact floating-point order
    norm_product = math.sqrt(float(len(set1))) * math.sqrt(float(len(set2)))
    return float(len(set1 & set2)) / norm_product
def get_raw_score(self, set1, set2):
    """Computes the raw Dice score between two sets.

    The score is twice the size of the intersection divided by the sum of
    the sizes of the two sets. This score is already in [0,1].

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Dice similarity score (float).

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> dice = Dice()
        >>> dice.get_raw_score(['data', 'science'], ['data'])
        0.6666666666666666
        >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.5454545454545454
        >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
        0.5

    References:
        * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
        * SimMetrics library.
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    set1 = set1 if isinstance(set1, set) else set(set1)
    set2 = set2 if isinstance(set2, set) else set(set2)

    shared_count = float(len(set1 & set2))
    combined_size = float(len(set1) + len(set2))
    return 2.0 * shared_count / combined_size
def get_raw_score(self, set1, set2):
    """Computes the raw overlap coefficient score between two sets.

    The score is the size of the intersection divided by the size of the
    smaller of the two sets.

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Overlap coefficient (float).

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> oc = OverlapCoefficient()
        >>> oc.get_raw_score(['data', 'science'], ['data'])
        1.0
        >>> oc.get_raw_score([], [])
        1.0
        >>> oc.get_raw_score([], ['data'])
        0

    References:
        * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient
        * SimMetrics library
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    common = set1 & set2
    smaller_size = min(len(set1), len(set2))
    return float(len(common)) / smaller_size
def get_raw_score(self, set1, set2):
    """Computes the raw cosine score between two sets.

    The score is the size of the intersection divided by the product of
    the square roots of the two set sizes.

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Cosine similarity (float). 1.0 when the inputs match exactly and
        0.0 when one of them is empty.

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> cos = Cosine()
        >>> cos.get_raw_score(['data', 'science'], ['data'])
        0.7071067811865475
        >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management'])
        0.4999999999999999
        >>> cos.get_raw_score([], ['data'])
        0.0

    References:
        * String similarity joins: An Experimental Evaluation (a paper
          appearing in the VLDB 2014 Conference).
        * Project Flamingo at http://flamingo.ics.uci.edu.
    """
    # input validations
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # if exact match return 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # if one of the inputs is empty return 0.0 -- a float, so the return
    # type is consistent and the documented example above holds
    if utils.sim_check_for_empty(set1, set2):
        return 0.0

    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    # the two square roots are taken separately and then multiplied; the
    # documented values depend on this exact floating-point order
    norm_product = math.sqrt(float(len(set1))) * math.sqrt(float(len(set2)))
    return float(len(set1 & set2)) / norm_product
def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
    """
    Computes the Fuzzy Wuzzy partial token sort measure raw score between
    two strings. This score is in the range [0,100].

    Both strings are passed through ``self._process_string_and_sort`` and
    the results are compared with the ``PartialRatio`` measure.

    Args:
        string1,string2 (str), : Input strings
        force_ascii (boolean) : Flag to remove non-ascii characters or not
        full_process (boolean) : Flag to process the string or not.
            Processing includes removing non alphanumeric characters,
            converting string to lower case and removing leading and
            trailing whitespaces.

    Returns:
        Partial Token Sort measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialTokenSort()
        >>> s.get_raw_score('great is scala', 'java is great')
        81
        >>> s.get_raw_score('Sue', 'sue')
        100
        >>> s.get_raw_score('C++ and Java', 'Java and Python')
        64

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # normalise each input, first string first
    first_sorted = self._process_string_and_sort(
        string1, force_ascii, full_process=full_process)
    second_sorted = self._process_string_and_sort(
        string2, force_ascii, full_process=full_process)

    return PartialRatio().get_raw_score(first_sorted, second_sorted)
def get_raw_score(self, set1, set2):
    """Computes the raw Jaccard score between two sets.

    The score is the size of the intersection divided by the size of the
    union of the two sets.

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Jaccard similarity score (float).

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> jac = Jaccard()
        >>> jac.get_raw_score(['data', 'science'], ['data'])
        0.5
        >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.375
        >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    common = set1 & set2
    combined = set1 | set2
    return float(len(common)) / float(len(combined))
def get_raw_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy ratio measure raw score between two strings.
    This score is in the range [0,100].

    The score is the ``SequenceMatcher`` ratio of the two strings, scaled
    to a percentage and rounded to an integer.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = Ratio()
        >>> s.get_raw_score('Robert', 'Rupert')
        67
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        71

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # compare the unicode forms of both strings
    left = utils.convert_to_unicode(string1)
    right = utils.convert_to_unicode(string2)

    matcher = SequenceMatcher(None, left, right)
    percentage = 100 * matcher.ratio()
    return int(round(percentage))
def get_raw_score(self, string1, string2):
    """Computes the affine gap score between two strings.

    This score can be outside the range [0,1]. After validation the work
    is delegated to ``affine`` with this object's ``gap_start``,
    ``gap_continuation`` and ``sim_func`` settings.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score between the two input strings (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4
    """
    # reject None before attempting any conversion
    utils.sim_check_for_none(string1, string2)

    # convert both inputs to unicode, then verify they are strings
    string1, string2 = (utils.convert_to_unicode(string1),
                        utils.convert_to_unicode(string2))
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    return affine(string1, string2,
                  self.gap_start, self.gap_continuation, self.sim_func)
def get_raw_score(self, string1, string2):
    """Computes the raw Jaro-Winkler score between two strings.

    After validation the work is delegated to ``jaro_winkler`` with this
    object's ``prefix_weight``.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro-Winkler similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jw = JaroWinkler()
        >>> jw.get_raw_score('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> jw.get_raw_score('DWAYNE', 'DUANE')
        0.84
        >>> jw.get_raw_score('DIXON', 'DICKSONX')
        0.8133333333333332
    """
    # reject None before attempting any conversion
    utils.sim_check_for_none(string1, string2)

    # convert both inputs to unicode, then verify they are strings
    string1, string2 = (utils.convert_to_unicode(string1),
                        utils.convert_to_unicode(string2))
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    return jaro_winkler(string1, string2, self.prefix_weight)
def get_raw_score(self, string1, string2):
    """Computes the raw Jaro score between two strings.

    After validation the work is delegated to ``jaro``.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro similarity score (float).

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro = Jaro()
        >>> jaro.get_raw_score('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro.get_raw_score('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro.get_raw_score('DIXON', 'DICKSONX')
        0.7666666666666666
    """
    # reject None before attempting any conversion
    utils.sim_check_for_none(string1, string2)

    # convert both inputs to unicode, then verify they are strings
    string1, string2 = (utils.convert_to_unicode(string1),
                        utils.convert_to_unicode(string2))
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    return jaro(string1, string2)
def get_sim_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy ratio similarity score between two strings.
    This score is in the range [0,1].

    The value is the raw score from ``get_raw_score`` divided by 100.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Ratio measure similarity score (float) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = Ratio()
        >>> s.get_sim_score('Robert', 'Rupert')
        0.67
        >>> s.get_sim_score('Sue', 'sue')
        0.67
        >>> s.get_sim_score('example', 'samples')
        0.71

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # rescale the [0,100] raw score to [0,1]; 1.0 * forces float arithmetic
    return 1.0 * self.get_raw_score(string1, string2) / 100
def dice(set1, set2):
    """
    Computes the Dice similarity coefficient between two sets.

    The similarity is defined as twice the shared information
    (intersection) divided by sum of cardinalities.

    For two sets X and Y, the Dice similarity coefficient is:
        :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`

    Args:
        set1,set2 (set or list): Input sets (or lists). Input lists are
            converted to sets.

    Returns:
        Dice similarity coefficient (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the
            inputs is None.

    Examples:
        >>> dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> dice({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
        0.5454545454545454
        >>> dice(['data', 'management'], ['data', 'data', 'science'])
        0.5

    References:
        * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
        * Simmetrics library
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # lists are de-duplicated into sets; sets are used as given
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    common = set1 & set2
    total_size = len(set1) + len(set2)
    return 2.0 * float(len(common)) / float(total_size)
def get_sim_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy partial ratio similarity score between two
    strings. This score is in the range [0,1].

    The value is the raw score from ``get_raw_score`` divided by 100.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Partial Ratio measure similarity score (float) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialRatio()
        >>> s.get_sim_score('Robert Rupert', 'Rupert')
        1.0
        >>> s.get_sim_score('Sue', 'sue')
        0.67
        >>> s.get_sim_score('example', 'samples')
        0.86

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # rescale the [0,100] raw score to [0,1]; 1.0 * forces float arithmetic
    percent = 1.0 * self.get_raw_score(string1, string2)
    return percent / 100
def get_raw_score(self, string1, string2):
    """Computes the raw Jaro score between two strings.

    Characters of the two strings are paired when they are equal and lie
    within a search window of each other; the score combines the fraction
    of paired characters in each string with the fraction of paired
    characters that are not transposed.

    Args:
        string1,string2 (str): Input strings.

    Returns:
        Jaro similarity score (float). 0 is returned when one input is
        empty or when no characters can be paired.

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro = Jaro()
        >>> jaro.get_raw_score('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro.get_raw_score('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro.get_raw_score('DIXON', 'DICKSONX')
        0.7666666666666666
    """
    # reject None before attempting any conversion
    utils.sim_check_for_none(string1, string2)

    # convert both inputs to unicode, then verify they are strings
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)
    utils.tok_check_for_string_input(string1, string2)

    # an empty string scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    len_s1, len_s2 = len(string1), len(string2)

    # half the longer length minus one, never negative
    search_range = max(0, (max(len_s1, len_s2) // 2) - 1)

    # which positions of each string have been paired with the other string
    paired_s1 = [False] * len_s1
    paired_s2 = [False] * len_s2

    # NOTE(review): xrange is presumably supplied by a compatibility import
    # elsewhere in the file -- confirm before running on Python 3.
    common_chars = 0
    for i, ch_s1 in enumerate(string1):
        low = max(0, i - search_range)
        high = min(i + search_range, len_s2 - 1)
        for j in xrange(low, high + 1):
            if string2[j] == ch_s1 and not paired_s2[j]:
                paired_s1[i] = paired_s2[j] = True
                common_chars += 1
                break

    # nothing could be paired
    if not common_chars:
        return 0

    # walk the paired characters of both strings in order and count the
    # positions where they disagree
    next_start = 0
    trans_count = 0
    for i, is_paired in enumerate(paired_s1):
        if not is_paired:
            continue
        for j in xrange(next_start, len_s2):
            if paired_s2[j]:
                next_start = j + 1
                break
        if string1[i] != string2[j]:
            trans_count += 1
    trans_count = trans_count / 2

    common_chars = float(common_chars)
    share_s1 = common_chars / len_s1
    share_s2 = common_chars / len_s2
    share_in_order = (common_chars - trans_count) / common_chars
    return (share_s1 + share_s2 + share_in_order) / 3
def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident):
    """
    Computes the Affine gap score between two strings.

    The Affine gap measure is an extension of the Needleman-Wunsch measure
    that handles the longer gaps more gracefully. For more information
    refer to string matching chapter in the DI book.

    Args:
        string1,string2 (str) : Input strings
        gap_start (float): Cost for the gap at the start (defaults to 1)
        gap_continuation (float) : Cost for the gap continuation (defaults
            to 0.5)
        sim_score (function) : Function computing similarity score between
            two chars, represented as strings (defaults to identity).

    Returns:
        Affine gap score (float)

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> affine('dva', 'deeva')
        1.5
        >>> affine('dva', 'deeve', gap_start=2, gap_continuation=0.5)
        -0.5
        >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        4.4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # gap costs are applied as penalties, so work with their negatives
    gap_start = -gap_start
    gap_continuation = -gap_continuation

    len1 = len(string1)
    len2 = len(string2)
    shape = (len1 + 1, len2 + 1)

    # m: best score with string1[i-1] aligned to string2[j-1]
    # x: best score with string1[i-1] aligned to a gap
    # y: best score with string2[j-1] aligned to a gap
    # dtype is the builtin float: the np.float alias was removed from NumPy
    # and denoted the same type.
    m = np.zeros(shape, dtype=float)
    x = np.zeros(shape, dtype=float)
    y = np.zeros(shape, dtype=float)

    neg_inf = -float("inf")

    # DP initialization: first column (string2 prefix is empty)
    for i in _range(1, len1 + 1):
        m[i][0] = neg_inf
        x[i][0] = gap_start + (i - 1) * gap_continuation
        y[i][0] = neg_inf

    # DP initialization: first row (string1 prefix is empty)
    for j in _range(1, len2 + 1):
        m[0][j] = neg_inf
        x[0][j] = neg_inf
        y[0][j] = gap_start + (j - 1) * gap_continuation

    # affine gap calculation using DP
    for i in _range(1, len1 + 1):
        for j in _range(1, len2 + 1):
            # best score between x_1....x_i and y_1....y_j given that
            # x_i is aligned to y_j
            m[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(
                m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1])

            # the best score given that x_i is aligned to a gap
            x[i][j] = max(gap_start + m[i - 1][j],
                          gap_continuation + x[i - 1][j])

            # the best score given that y_j is aligned to a gap
            y[i][j] = max(gap_start + m[i][j - 1],
                          gap_continuation + y[i][j - 1])

    return max(m[len1][len2], x[len1][len2], y[len1][len2])
def get_raw_score(self, bag1, bag2):
    """Computes the raw soft TF/IDF score between two lists given the corpus information.

    Each distinct token of ``bag1`` is paired with the token of ``bag2`` that
    scores highest under ``self.sim_func``, provided that score exceeds
    ``self.threshold``. The paired TF/IDF weights, scaled by that similarity,
    are summed and normalised by the product of the two weight-vector norms.
    When ``self.dampen`` is true, ``log`` is applied to both the idf and
    (tf + 1) terms.

    Args:
        bag1,bag2 (list): Input lists

    Returns:
        Soft TF/IDF score between the input lists (float). 0.0 is returned
        when ``bag2`` carries no TF/IDF weight (for example when none of its
        tokens appear in the corpus) instead of dividing by zero.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
        0.17541160386140586
        >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.5547001962252291
        >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.0
        >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
        >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
        0.81649658092772592

    References:
        * the string matching chapter of the "Principles of Data Integration" book.
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # if the strings match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

    # find unique elements in the input lists and their document frequency
    local_df = {}
    for element in tf_x:
        local_df[element] = local_df.get(element, 0) + 1
    for element in tf_y:
        local_df[element] = local_df.get(element, 0) + 1

    # if corpus is not provided treat the two inputs as the corpus
    if self.__corpus_list is None:
        curr_df, corpus_size = local_df, 2
    else:
        curr_df, corpus_size = self.__document_frequency, self.__corpus_size

    def weight(idf, tf):
        # TF/IDF weight of one token, log-dampened when requested
        if self.dampen:
            return log(idf) * log(tf + 1)
        return idf * tf

    # for every token of bag1 keep its best match in bag2 (above threshold)
    similarity_map = {}
    for term_x in tf_x:
        max_score = 0.0
        for term_y in tf_y:
            score = self.sim_func(term_x, term_y)
            if score > self.threshold and score > max_score:
                similarity_map[term_x] = (term_x, term_y, score)
                max_score = score

    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0

    # soft-tfidf calculation
    for element in local_df:
        # tokens unknown to the corpus contribute nothing
        if curr_df.get(element) is None:
            continue

        # numerator
        if element in similarity_map:
            first, second, sim_score = similarity_map[element]
            idf_first = corpus_size / curr_df.get(first, 1)
            idf_second = corpus_size / curr_df.get(second, 1)
            v_x = weight(idf_first, tf_x.get(first, 0))
            v_y = weight(idf_second, tf_y.get(second, 0))
            result += v_x * v_y * sim_score

        # denominator
        idf = corpus_size / curr_df[element]
        v_x = weight(idf, tf_x.get(element, 0))
        v_x_2 += v_x * v_x
        v_y = weight(idf, tf_y.get(element, 0))
        v_y_2 += v_y * v_y

    if v_x_2 == 0:
        return result
    # bag2 has no weight in the corpus: the cosine is undefined, report no
    # similarity rather than raising ZeroDivisionError
    if v_y_2 == 0:
        return 0.0
    return result / (sqrt(v_x_2) * sqrt(v_y_2))
def get_raw_score(self, string1, string2):
    """Computes the affine gap score between two strings.

    This score can be outside the range [0,1]. Opening a gap costs
    ``self.gap_start``, extending it costs ``self.gap_continuation`` per
    position, and aligned characters are scored with ``self.sim_func``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score between the two input strings (float). 0 is returned
        when the empty-input check fires.

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # costs are stored as negative contributions to the score
    gap_start = -self.gap_start
    gap_continuation = -self.gap_continuation

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin yields the same float64 arrays.
    shape = (len(string1) + 1, len(string2) + 1)
    m = np.zeros(shape, dtype=float)
    x = np.zeros(shape, dtype=float)
    y = np.zeros(shape, dtype=float)

    # DP initialization: first column
    for i in xrange(1, len(string1) + 1):
        m[i][0] = -float("inf")
        x[i][0] = gap_start + (i - 1) * gap_continuation
        y[i][0] = -float("inf")

    # DP initialization: first row
    for j in xrange(1, len(string2) + 1):
        m[0][j] = -float("inf")
        x[0][j] = -float("inf")
        y[0][j] = gap_start + (j - 1) * gap_continuation

    # affine gap calculation using DP
    for i in xrange(1, len(string1) + 1):
        for j in xrange(1, len(string2) + 1):
            # best score between x_1....x_i and y_1....y_j
            # given that x_i is aligned to y_j
            m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                       max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]))

            # the best score given that x_i is aligned to a gap
            x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j])

            # the best score given that y_j is aligned to a gap
            y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1])

    return max(m[len(string1)][len(string2)],
               x[len(string1)][len(string2)],
               y[len(string1)][len(string2)])
def jaro(string1, string2):
    """
    Computes the Jaro measure between two strings.

    The Jaro measure is a type of edit distance developed mainly to compare
    short strings, such as first and last names. Characters match when they
    are equal and lie within a window of ``max(len) // 2 - 1`` positions of
    each other; matched characters that appear in a different order count as
    (half) transpositions.

    Args:
        string1,string2 (str): Input strings

    Returns:
        Jaro measure (float). 0 is returned when the empty-input check fires
        or when no characters match.

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> jaro('DWAYNE', 'DUANE')
        0.8222222222222223
        >>> jaro('DIXON', 'DICKSONX')
        0.7666666666666666
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.tok_check_for_string_input(string1, string2)

    # an empty input scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    n1, n2 = len(string1), len(string2)

    # how far apart two equal characters may be and still count as a match
    window = max((max(n1, n2) // 2) - 1, 0)

    matched1 = [False] * n1
    matched2 = [False] * n2
    matches = 0

    # greedily pair each character of string1 with the first free equal
    # character of string2 inside the window
    for pos1, char1 in enumerate(string1):
        start = max(pos1 - window, 0)
        stop = min(pos1 + window, n2 - 1)
        for pos2 in _range(start, stop + 1):
            if string2[pos2] == char1 and not matched2[pos2]:
                matched1[pos1] = matched2[pos2] = True
                matches += 1
                break

    if matches == 0:
        return 0

    # walk the matched characters of both strings in order; every position
    # where they disagree is half a transposition
    ordered1 = [char for char, used in zip(string1, matched1) if used]
    ordered2 = [char for char, used in zip(string2, matched2) if used]
    transpositions = 0
    for left, right in zip(ordered1, ordered2):
        if left != right:
            transpositions += 1
    transpositions /= 2

    matches = float(matches)
    return ((matches / n1 + matches / n2 +
             (matches - transpositions) / matches)) / 3
def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5):
    """
    Compute Soft-tfidf measures between two lists given the corpus information.

    Each token of ``bag1`` is paired with the token of ``bag2`` that scores
    highest under ``sim_func``, provided that score exceeds ``threshold``. The
    paired TF/IDF weights, scaled by that similarity, are summed and normalised
    by the product of the two weight-vector norms.

    Args:
        bag1,bag2 (list): Input lists
        corpus_list (list of lists): Corpus list (default is set to None) of strings.
            If set to None, the input list are considered the only corpus
        sim_func (func): Secondary similarity function. This should return a similarity
            score between two strings (optional), default is jaro similarity measure
        threshold (float): Threshold value for the secondary similarity function
            (defaults to 0.5). If the similarity of a token pair exceeds the threshold,
            then the token pair is considered a match.

    Returns:
        Soft TF-IDF measure between the input lists. 0.0 is returned when
        ``bag2`` carries no TF/IDF weight (for example when none of its tokens
        appear in the corpus) instead of dividing by zero.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro, threshold=0.8)
        0.17541160386140586
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
        0.5547001962252291
        >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6)
        0.81649658092772592

    References:
        * Principles of Data Integration book
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # if the strings match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # if corpus is not provided treat input string as corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list) * 1.0

    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

    # number of documents an element appeared
    element_freq = {}

    # set of unique element
    total_unique_elements = set()
    for document in corpus_list:
        temp_set = set()
        for element in document:
            # adding element only if it is present in one of two input string
            if element in bag1 or element in bag2:
                temp_set.add(element)
                total_unique_elements.add(element)

        # update element document frequency for this document
        for element in temp_set:
            element_freq[element] = element_freq.get(element, 0) + 1

    # for every token of bag1 keep its best match in bag2 (above threshold)
    similarity_map = {}
    for x in bag1:
        if x not in similarity_map:
            max_score = 0.0
            for y in bag2:
                score = sim_func(x, y)
                if score > threshold and score > max_score:
                    similarity_map[x] = utils.Similarity(x, y, score)
                    max_score = score

    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0

    # soft-tfidf calculation
    for element in total_unique_elements:
        # numerator
        if element in similarity_map:
            sim = similarity_map[element]
            if sim.first_string in element_freq:
                idf_first = corpus_size / element_freq[sim.first_string]
            else:
                idf_first = corpus_size
            if sim.second_string in element_freq:
                idf_second = corpus_size / element_freq[sim.second_string]
            else:
                idf_second = corpus_size
            v_x = 0 if sim.first_string not in tf_x else idf_first * tf_x[sim.first_string]
            v_y = 0 if sim.second_string not in tf_y else idf_second * tf_y[sim.second_string]
            result += v_x * v_y * sim.similarity_score

        # denominator
        idf = corpus_size if element not in element_freq else corpus_size / element_freq[element]
        v_x = 0 if element not in tf_x else idf * tf_x[element]
        v_x_2 += v_x * v_x
        v_y = 0 if element not in tf_y else idf * tf_y[element]
        v_y_2 += v_y * v_y

    if v_x_2 == 0:
        return result
    # bag2 has no weight in the corpus: the cosine is undefined, report no
    # similarity rather than raising ZeroDivisionError
    if v_y_2 == 0:
        return 0.0
    return result / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
def tfidf(bag1, bag2, corpus_list=None, dampen=False):
    """
    Compute tfidf measures between two lists given the corpus information.

    This measure employs the notion of TF/IDF score commonly used in information
    retrieval (IR) to find documents that are relevant to keyword queries. The
    intuition underlying the TF/IDF measure is that two strings are similar if
    they share distinguishing terms. The result is the cosine of the two TF/IDF
    weight vectors.

    Args:
        bag1,bag2 (list): Input lists
        corpus_list (list of lists): Corpus list (default is set to None) of strings.
            If set to None, the input list are considered the only corpus.
        dampen (boolean): Flag to indicate whether 'log' should be applied to tf and
            idf measure.

    Returns:
        TF-IDF measure between the input lists (float)

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None

    Examples:
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.17541160386140586
        >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True)
        0.11166746710505392
        >>> tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']])
        0.5547001962252291
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']])
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']], True)
        0.0
        >>> tfidf(['a', 'b', 'a'], ['a'])
        0.7071067811865475
    """
    # validate the inputs
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # without a corpus, the two inputs are the corpus
    if corpus_list is None:
        corpus_list = [bag1, bag2]
    corpus_size = len(corpus_list)

    # term frequencies of the two inputs
    tf_x = collections.Counter(bag1)
    tf_y = collections.Counter(bag2)

    # document frequency of every corpus token that occurs in either input
    doc_freq = {}
    vocabulary = set()
    for document in corpus_list:
        seen_here = set()
        for token in document:
            if token in bag1 or token in bag2:
                seen_here.add(token)
                vocabulary.add(token)
        # each document counts a token at most once
        for token in seen_here:
            doc_freq[token] = doc_freq.get(token, 0) + 1

    dot, norm_x_sq, norm_y_sq = 0.0, 0.0, 0.0

    # accumulate the dot product and the squared norms of both weight vectors
    for token in vocabulary:
        idf = corpus_size * 1.0 / doc_freq[token]

        if token not in tf_x:
            weight_x = 0
        elif dampen:
            weight_x = math.log(idf) * math.log(tf_x[token] + 1)
        else:
            weight_x = idf * tf_x[token]

        if token not in tf_y:
            weight_y = 0
        elif dampen:
            weight_y = math.log(idf) * math.log(tf_y[token] + 1)
        else:
            weight_y = idf * tf_y[token]

        dot += weight_x * weight_y
        norm_x_sq += weight_x * weight_x
        norm_y_sq += weight_y * weight_y

    # a zero dot product also covers the case of a zero norm
    if dot == 0:
        return 0.0
    return dot / (math.sqrt(norm_x_sq) * math.sqrt(norm_y_sq))
def get_raw_score(self, set1, set2):
    """
    Computes the Generalized Jaccard measure between two sets.

    This similarity measure is softened version of the Jaccard measure. The
    Jaccard measure is promising candidate for tokens which exactly match
    across the sets. However, in practice tokens are often misspelled, such as
    energy vs. eneryg. The generalized Jaccard measure will enable matching in
    such cases.

    Args:
        set1,set2 (set or list): Input sets (or lists) of strings. Input lists
            are converted to sets.

    Returns:
        Generalized Jaccard similarity (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
        ValueError : If the similarity measure doesn't return values in the range [0,1]

    Examples:
        >>> gj = GeneralizedJaccard()
        >>> gj.get_raw_score(['data', 'science'], ['data'])
        0.5
        >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
        >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
        0.43333333333333335
        >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
        >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
        0.45810185185185187
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # work on sets from here on
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    # score every cross pair and keep those above the threshold
    candidates = []
    for left in set1:
        for right in set2:
            pair_score = self.sim_func(left, right)
            if pair_score > 1 or pair_score < 0:
                raise ValueError('Similarity measure should'
                                 ' return value in the range [0,1]')
            if pair_score > self.threshold:
                candidates.append((left, right, pair_score))

    # best-scoring pairs first
    candidates.sort(key=lambda pair: pair[2], reverse=True)

    # greedily accept pairs in decreasing order of score, using each
    # element of either set at most once
    used_left = set()
    used_right = set()
    total_score = 0.0
    accepted = 0
    for left, right, pair_score in candidates:
        if left in used_left or right in used_right:
            continue
        used_left.add(left)
        used_right.add(right)
        total_score += pair_score
        accepted += 1

    return float(total_score) / float(len(set1) + len(set2) - accepted)
def generalized_jaccard(set1, set2, sim_func=jaro, threshold=0.5):
    """
    Computes the Generalized Jaccard measure between two sets.

    This similarity measure is softened version of the Jaccard measure. The
    Jaccard measure is promising candidate for tokens which exactly match
    across the sets. However, in practice tokens are often misspelled, such as
    energy vs. eneryg. The generalized Jaccard measure will enable matching in
    such cases.

    Args:
        set1,set2 (set or list): Input sets (or lists) of strings. Input lists
            are converted to sets.
        sim_func (func): similarity function. This should return a similarity score
            between two strings in set (optional), default is jaro similarity measure
        threshold (float): Threshold value (defaults to 0.5). If the similarity of a
            token pair exceeds the threshold, then the token pair is considered a match.

    Returns:
        Generalized Jaccard similarity (float)

    Raises:
        TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
        ValueError : If the similarity measure doesn't return values in the range [0,1]

    Examples:
        >>> generalized_jaccard(['data', 'science'], ['data'])
        0.5
        >>> generalized_jaccard(['data', 'management'], ['data', 'data', 'science'])
        0.3333333333333333
        >>> generalized_jaccard(['Niall'], ['Neal', 'Njall'])
        0.43333333333333335
        >>> generalized_jaccard(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=jaro_winkler, threshold=0.8)
        0.45810185185185187
    """
    # validate the inputs
    utils.sim_check_for_none(set1, set2)
    utils.sim_check_for_list_or_set_inputs(set1, set2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(set1, set2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(set1, set2):
        return 0

    # work on sets from here on
    if not isinstance(set1, set):
        set1 = set(set1)
    if not isinstance(set2, set):
        set2 = set(set2)

    # score every cross pair and keep those above the threshold
    candidates = []
    for left in set1:
        for right in set2:
            pair_score = sim_func(left, right)
            if pair_score > 1 or pair_score < 0:
                raise ValueError('Similarity measure should return value in the range [0,1]')
            if pair_score > threshold:
                candidates.append(utils.Similarity(left, right, pair_score))

    # best-scoring pairs first
    candidates.sort(key=lambda pair: pair.similarity_score, reverse=True)

    # greedily accept pairs in decreasing order of score, using each
    # element of either set at most once
    used_left = set()
    used_right = set()
    total_score = 0.0
    accepted = 0
    for pair in candidates:
        if pair.first_string in used_left or pair.second_string in used_right:
            continue
        used_left.add(pair.first_string)
        used_right.add(pair.second_string)
        total_score += pair.similarity_score
        accepted += 1

    return float(total_score) / float(len(set1) + len(set2) - accepted)
def get_raw_score(self, string1, string2):
    """Computes the affine gap score between two strings.

    This score can be outside the range [0,1]. Opening a gap costs
    ``self.gap_start``, extending it costs ``self.gap_continuation`` per
    position, and aligned characters are scored with ``self.sim_func``.

    Args:
        string1,string2 (str) : Input strings.

    Returns:
        Affine gap score between the two input strings (float). 0 is returned
        when the empty-input check fires.

    Raises:
        TypeError : If the inputs are not strings or if one of the inputs is None.

    Examples:
        >>> aff = Affine()
        >>> aff.get_raw_score('dva', 'deeva')
        1.5
        >>> aff = Affine(gap_start=2, gap_continuation=0.5)
        >>> aff.get_raw_score('dva', 'deeve')
        -0.5
        >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
        >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
        4.4
    """
    # input validations
    utils.sim_check_for_none(string1, string2)

    # convert input to unicode.
    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    utils.tok_check_for_string_input(string1, string2)

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    # costs are stored as negative contributions to the score
    gap_start = -self.gap_start
    gap_continuation = -self.gap_continuation

    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin yields the same float64 arrays.
    shape = (len(string1) + 1, len(string2) + 1)
    m = np.zeros(shape, dtype=float)
    x = np.zeros(shape, dtype=float)
    y = np.zeros(shape, dtype=float)

    # DP initialization: first column
    for i in xrange(1, len(string1) + 1):
        m[i][0] = -float("inf")
        x[i][0] = gap_start + (i - 1) * gap_continuation
        y[i][0] = -float("inf")

    # DP initialization: first row
    for j in xrange(1, len(string2) + 1):
        m[0][j] = -float("inf")
        x[0][j] = -float("inf")
        y[0][j] = gap_start + (j - 1) * gap_continuation

    # affine gap calculation using DP
    for i in xrange(1, len(string1) + 1):
        for j in xrange(1, len(string2) + 1):
            # best score between x_1....x_i and y_1....y_j
            # given that x_i is aligned to y_j
            m[i][j] = (self.sim_func(string1[i - 1], string2[j - 1]) +
                       max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]))

            # the best score given that x_i is aligned to a gap
            x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j])

            # the best score given that y_j is aligned to a gap
            y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1])

    return max(m[len(string1)][len(string2)],
               x[len(string1)][len(string2)],
               y[len(string1)][len(string2)])
def get_raw_score(self, bag1, bag2):
    """Computes the raw TF/IDF score between two lists.

    The result is the cosine of the two TF/IDF weight vectors. Tokens that do
    not occur in the corpus are skipped. When ``self.dampen`` is true, ``log``
    is applied to both the idf and (tf + 1) terms.

    Args:
        bag1,bag2 (list): Input lists.

    Returns:
        TF/IDF score between the input lists (float).

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> # here the corpus is a list of three strings that
        >>> # have been tokenized into three lists of tokens
        >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']])
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c'])
        0.7071067811865475
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.0
        >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']])
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.0
        >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False)
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
        0.25298221281347033
        >>> tfidf = TfIdf(dampen=False)
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.7071067811865475
        >>> tfidf = TfIdf()
        >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.0
    """
    # validate the inputs
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # identical inputs score 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # an empty input scores 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # term frequencies of the two inputs
    tf_x = collections.Counter(bag1)
    tf_y = collections.Counter(bag2)

    # in how many of the two inputs (1 or 2) each distinct token occurs
    local_df = {}
    for term_counts in (tf_x, tf_y):
        for token in term_counts:
            local_df[token] = local_df.get(token, 0) + 1

    # without a corpus, the two inputs are the corpus
    if self.__corpus_list is None:
        curr_df, corpus_size = local_df, 2
    else:
        curr_df, corpus_size = self.__document_frequency, self.__corpus_size

    dot, norm_x_sq, norm_y_sq = 0.0, 0.0, 0.0

    # accumulate the dot product and the squared norms of both weight vectors
    for token in local_df:
        token_df = curr_df.get(token)
        # tokens unknown to the corpus contribute nothing
        if token_df is None:
            continue
        idf = corpus_size * 1.0 / token_df

        if token not in tf_x:
            weight_x = 0
        elif self.dampen:
            weight_x = log(idf) * log(tf_x[token] + 1)
        else:
            weight_x = idf * tf_x[token]

        if token not in tf_y:
            weight_y = 0
        elif self.dampen:
            weight_y = log(idf) * log(tf_y[token] + 1)
        else:
            weight_y = idf * tf_y[token]

        dot += weight_x * weight_y
        norm_x_sq += weight_x * weight_x
        norm_y_sq += weight_y * weight_y

    # a zero dot product also covers the case of a zero norm
    if dot == 0:
        return 0.0
    return dot / (sqrt(norm_x_sq) * sqrt(norm_y_sq))
def get_raw_score(self, bag1, bag2):
    """Computes the raw soft TF/IDF score between two lists given the corpus information.

    Each distinct token of ``bag1`` is paired with the token of ``bag2`` that
    scores highest under ``self.sim_func``, provided that score exceeds
    ``self.threshold``. The paired TF/IDF weights, scaled by that similarity,
    are summed and normalised by the product of the two weight-vector norms.

    Args:
        bag1,bag2 (list): Input lists

    Returns:
        Soft TF/IDF score between the input lists (float). 0.0 is returned
        when ``bag2`` carries no TF/IDF weight (for example when none of its
        tokens appear in the corpus) instead of dividing by zero.

    Raises:
        TypeError : If the inputs are not lists or if one of the inputs is None.

    Examples:
        >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8)
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c'])
        0.17541160386140586
        >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9)
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.5547001962252291
        >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']])
        >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a'])
        0.0
        >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6)
        >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba'])
        0.81649658092772592

    References:
        * the string matching chapter of the "Principles of Data Integration" book.
    """
    # input validations
    utils.sim_check_for_none(bag1, bag2)
    utils.sim_check_for_list_or_set_inputs(bag1, bag2)

    # if the strings match exactly return 1.0
    if utils.sim_check_for_exact_match(bag1, bag2):
        return 1.0

    # if one of the strings is empty return 0
    if utils.sim_check_for_empty(bag1, bag2):
        return 0

    # term frequency for input strings
    tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)

    # find unique elements in the input lists and their document frequency
    local_df = {}
    for element in tf_x:
        local_df[element] = local_df.get(element, 0) + 1
    for element in tf_y:
        local_df[element] = local_df.get(element, 0) + 1

    # if corpus is not provided treat the two inputs as the corpus
    if self.__corpus_list is None:
        curr_df, corpus_size = local_df, 2
    else:
        curr_df, corpus_size = self.__document_frequency, self.__corpus_size

    # for every token of bag1 keep its best match in bag2 (above threshold)
    similarity_map = {}
    for term_x in tf_x:
        max_score = 0.0
        for term_y in tf_y:
            score = self.sim_func(term_x, term_y)
            if score > self.threshold and score > max_score:
                similarity_map[term_x] = (term_x, term_y, score)
                max_score = score

    result, v_x_2, v_y_2 = 0.0, 0.0, 0.0

    # soft-tfidf calculation
    for element in local_df:
        # tokens unknown to the corpus contribute nothing
        if curr_df.get(element) is None:
            continue

        # numerator
        if element in similarity_map:
            first, second, sim_score = similarity_map[element]
            idf_first = corpus_size / curr_df.get(first, 1)
            idf_second = corpus_size / curr_df.get(second, 1)
            v_x = idf_first * tf_x.get(first, 0)
            v_y = idf_second * tf_y.get(second, 0)
            result += v_x * v_y * sim_score

        # denominator
        idf = corpus_size / curr_df[element]
        v_x = idf * tf_x.get(element, 0)
        v_x_2 += v_x * v_x
        v_y = idf * tf_y.get(element, 0)
        v_y_2 += v_y * v_y

    if v_x_2 == 0:
        return result
    # bag2 has no weight in the corpus: the cosine is undefined, report no
    # similarity rather than raising ZeroDivisionError
    if v_y_2 == 0:
        return 0.0
    return result / (sqrt(v_x_2) * sqrt(v_y_2))
def get_raw_score(self, string1, string2):
    """
    Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings.

    The shorter string is compared against windows of the longer string, each
    window as long as the shorter string; the best ``SequenceMatcher`` ratio
    found is scaled to [0,100].

    Args:
        string1,string2 (str): Input strings

    Returns:
        Partial Ratio measure raw score (int) is returned

    Raises:
        TypeError: If the inputs are not strings

    Examples:
        >>> s = PartialRatio()
        >>> s.get_raw_score('Robert Rupert', 'Rupert')
        100
        >>> s.get_raw_score('Sue', 'sue')
        67
        >>> s.get_raw_score('example', 'samples')
        86

    References:
        * https://pypi.python.org/pypi/fuzzywuzzy
    """
    # validate the inputs
    utils.sim_check_for_none(string1, string2)
    utils.sim_check_for_string_inputs(string1, string2)

    # an empty input scores 0
    if utils.sim_check_for_empty(string1, string2):
        return 0

    string1 = utils.convert_to_unicode(string1)
    string2 = utils.convert_to_unicode(string2)

    # make string1 the shorter of the two
    if len(string1) > len(string2):
        string1, string2 = string2, string1

    window_size = len(string1)
    blocks = SequenceMatcher(None, string1, string2).get_matching_blocks()

    # get_matching_blocks() always ends with a zero-length sentinel block,
    # so at least one ratio is collected below
    ratios = []
    for block in blocks:
        # align the window of string2 with the start of string1 implied by
        # this block, clamped to the beginning of string2
        offset = block[1] - block[0]
        window_start = offset if offset > 0 else 0
        window = string2[window_start:window_start + window_size]

        ratio = SequenceMatcher(None, string1, window).ratio()
        # treat a near-perfect window as an exact hit
        if ratio > .995:
            return 100
        ratios.append(ratio)

    return int(round(100 * max(ratios)))