def jaro(s1, s2):
    """Compute the Jaro measure between the two input strings.

    Args:
        s1, s2 (string): The input strings for which the similarity
            measure should be computed.

    Returns:
        float: The Jaro measure if both strings are not missing
        (i.e. NaN), else NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro('MARTHA', 'MARHTA')
        0.9444444444444445
        >>> em.jaro(None, 'MARTHA')
        nan
    """
    # Missing values propagate as NaN.  float('nan') replaces the old
    # pd.np.NaN: the pandas.np alias was deprecated in pandas 1.0 and
    # removed in 2.0, so pd.np.NaN raises AttributeError there.
    if s1 is None or s2 is None:
        return float('nan')
    # pd.isnull also catches float('nan') / pd.NA inputs.
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Jaro()

    # Normalize both inputs to unicode strings before scoring.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure.
    return measure.get_raw_score(s1, s2)
def jaro(s1, s2):
    """Compute the Jaro measure between the two input strings.

    Args:
        s1, s2 (string): The input strings for which the similarity
            measure should be computed.

    Returns:
        float: The Jaro measure if both strings are not missing
        (i.e. NaN), else NaN.
    """
    # Missing values propagate as NaN.  float('nan') replaces the old
    # pd.np.NaN: the pandas.np alias was deprecated in pandas 1.0 and
    # removed in 2.0, so pd.np.NaN raises AttributeError there.
    if s1 is None or s2 is None:
        return float('nan')
    # pd.isnull also catches float('nan') / pd.NA inputs.
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Jaro()

    # py_stringmatching expects string (or bytes) input; coerce any
    # other scalar (int, float, ...) to str before scoring.
    if not isinstance(s1, six.string_types) and not isinstance(s1, bytes):
        s1 = str(s1)
    if not isinstance(s2, six.string_types) and not isinstance(s2, bytes):
        s2 = str(s2)

    # Call the function to compute the similarity measure.
    return measure.get_raw_score(s1, s2)
def textdistance_jaro_winkler_distance(candidates, inp, min_score, winkler):
    """Score each candidate string against ``inp`` and keep good matches.

    Args:
        candidates: Iterable of candidate strings to score.
        inp: The input string every candidate is compared with.
        min_score: Minimum raw score a candidate must reach to be kept.
        winkler: When truthy use Jaro-Winkler, otherwise plain Jaro.

    Returns:
        list of (candidate, score) tuples, in candidate order, for all
        candidates whose score is >= min_score.
    """
    # Pick the scoring function once, outside the loop.
    if winkler:
        scorer = py_stringmatching.JaroWinkler().get_raw_score
    else:
        scorer = py_stringmatching.Jaro().get_raw_score

    # Score lazily, then keep only the pairs above the threshold.
    scored = ((candidate, scorer(candidate, inp)) for candidate in candidates)
    return [(candidate, score) for candidate, score in scored
            if score >= min_score]
def jaro(s1, s2):
    """Compute the Jaro measure between the two input strings.

    Args:
        s1, s2 (string): The input strings for which the similarity
            measure should be computed.

    Returns:
        float: The Jaro measure if both strings are not missing
        (i.e. NaN), else NaN.
    """
    # Missing values propagate as NaN.  float('nan') replaces the old
    # pd.np.NaN: the pandas.np alias was deprecated in pandas 1.0 and
    # removed in 2.0, so pd.np.NaN raises AttributeError there.
    if s1 is None or s2 is None:
        return float('nan')
    # pd.isnull also catches float('nan') / pd.NA inputs.
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Normalize both inputs to unicode strings before scoring.
    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.Jaro()
    return measure.get_raw_score(s1, s2)
def extract_jaro_distance(queried_name, predicted_name):
    """Compute element-wise Jaro scores for two parallel name sequences.

    Args:
        queried_name: Indexable sequence of query strings.
        predicted_name: Indexable sequence of predicted strings,
            aligned with ``queried_name`` (assumed the same length —
            only ``len(queried_name)`` entries are scored).

    Returns:
        numpy.ndarray of float: score for each aligned pair; NaN for
        any pair whose scoring raised an exception.
    """
    jw = sm.Jaro()
    # np.full (not np.empty) so rows whose scoring fails hold NaN
    # instead of uninitialized garbage values.
    res = np.full(len(queried_name), np.nan, dtype=float)
    for i in tqdm(range(len(queried_name))):
        try:
            res[i] = jw.get_raw_score(queried_name[i], predicted_name[i])
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit still propagate; keep the original best-effort
            # behavior of reporting the failing index and moving on.
            print(i)
    return res
def __init__(self):
    """Prepare the string-similarity measures and the tokenizer.

    Builds one instance of every supported py_stringmatching measure
    (order preserved) and an alphanumeric tokenizer.
    """
    # One scorer instance per supported similarity measure.
    measure_classes = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    self.similarity_function = [measure() for measure in measure_classes]
    # return_set=True: the tokenizer returns a set of tokens
    # (duplicates removed).
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def jaro(s1, s2):
    """Compute the Jaro measure between the two input strings.

    Args:
        s1, s2 (string): The input strings for which the similarity
            measure should be computed.

    Returns:
        float: The Jaro measure if both strings are not missing
        (i.e. NaN), else NaN.
    """
    # Missing values propagate as NaN.  float('nan') replaces the old
    # pd.np.NaN: the pandas.np alias was deprecated in pandas 1.0 and
    # removed in 2.0, so pd.np.NaN raises AttributeError there.
    if s1 is None or s2 is None:
        return float('nan')
    # pd.isnull also catches float('nan') / pd.NA inputs.
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object.
    measure = sm.Jaro()

    # py_stringmatching expects string (or bytes) input; coerce any
    # other scalar (int, float, ...) to str before scoring.
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)
    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure.
    return measure.get_raw_score(s1, s2)
# Notebook cell: install and import py_stringmatching.
get_ipython().system('pip install py_stringmatching')
import py_stringmatching as sm

# # Token Based Similarities

# In[27]:

# Jaccard over the pre-tokenized columns; get_sim_score returns a
# normalized similarity in [0, 1].
# NOTE(review): assumes df has 'aTokens' / 'bTokens' columns holding
# token collections — confirm against the cell that builds df.
jac = sm.Jaccard()
df['Jaccard'] = df.apply(
    lambda x: jac.get_sim_score(x['aTokens'], x['bTokens']), axis=1)
df.head()

# In[28]:

jaro = sm.Jaro()

# !pip install pyjarowinkler
# from pyjarowinkler import distance
# def jaro_similarity(word1, word2):
#     return distance.get_jaro_distance(word1, word2, winkler=False, scaling=0.1)

def jaccard_similarity_general(tokens1, tokens2):
    # "Soft" Jaccard intersection: a token pair counts as matching when
    # its Jaro similarity exceeds 0.7, not only on exact equality.
    intersection = []
    for token1 in list(set(tokens1)):
        for token2 in list(set(tokens2)):
            if jaro.get_sim_score(token1, token2) > 0.7:
                if token1 not in intersection:
                    intersection.append(token1)
                if token2 not in intersection:
# NOTE(review): SOURCE is truncated here, mid-statement; the remainder
# of this function is not visible in this chunk.
# iteration #2: trim whitespaces from artist and track labels row[3] = row[3].strip() row[4] = row[4].strip() row[7] = row[7].strip() row[8] = row[8].strip() sampledList.append(row) f.close() # Converting every row in to a feature vector featList = [] label = [] ws = ps.WhitespaceTokenizer() for item in sampledList: fi = [] jaro1 = ps.Jaro() # iteration #3: # pull the feature value to zero if none of the token pairs from either artist strings have a high # enough similarity score f1 = 0 for t1 in ws.tokenize(item[3]): if max([jaro1.get_raw_score(t1, t2) for t2 in ws.tokenize(item[7])]) > .75: f1 = jaro1.get_raw_score(item[3], item[7]) break # iteration #3: # if the artist doesn't match scale down the track similarity by a factor of 3 # and if the track score isn't high enough pull it down to 0 jaro2 = ps.Jaro() f2 = jaro1.get_raw_score(item[4], item[8])