def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Smith-Waterman measure if both the strings are not missing
        (i.e NaN), else returns NaN.
    """
    # Missing inputs have no score. ``float('nan')`` is used instead of
    # ``pd.np.NaN`` because the ``pandas.np`` alias no longer exists in
    # pandas 2.0+, where the old spelling raises AttributeError.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    # Anything that is neither text nor bytes is stringified before scoring.
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)
    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Smith-Waterman measure if both the strings are not missing
        (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """
    # Missing inputs have no score. ``float('nan')`` is used instead of
    # ``pd.np.NaN`` because the ``pandas.np`` alias no longer exists in
    # pandas 2.0+, where the old spelling raises AttributeError.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    # NOTE(review): convert_to_str_unicode is defined elsewhere; presumably it
    # coerces each input to a text string -- confirm against the helper.
    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
def smith_waterman(s1, s2):
    """
    Compute the Smith-Waterman raw score between two input values.

    Args:
        s1,s2: The values to compare. Each is passed through
            ``helper.convert_to_str_unicode`` before scoring.

    Returns:
        The value of ``SmithWaterman().get_raw_score(s1, s2)`` when neither
        input is missing; NaN when either input is None or null according to
        ``pd.isnull``.
    """
    # Missing inputs have no score. ``float('nan')`` is used instead of
    # ``pd.np.NaN`` because the ``pandas.np`` alias no longer exists in
    # pandas 2.0+, where the old spelling raises AttributeError.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # NOTE(review): convert_to_str_unicode is defined elsewhere; presumably it
    # coerces each input to a text string -- confirm against the helper.
    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.SmithWaterman()
    return measure.get_raw_score(s1, s2)
def smith_waterman(s1, s2):
    """
    Compute a length-normalised Smith-Waterman score for two input values.

    The raw Smith-Waterman score is divided by the length of the shorter
    input.

    Args:
        s1,s2: The values to compare. Values that are neither text nor bytes
            are converted with ``str()`` before scoring.

    Returns:
        The raw score divided by ``min(len(s1), len(s2))``. NaN when either
        input is None or null according to ``pd.isnull``. 0.0 when either
        input is empty, since the ratio is undefined for a zero-length
        string.
    """
    # Missing inputs have no score. ``float('nan')`` is used instead of
    # ``pd.np.NaN`` because the ``pandas.np`` alias no longer exists in
    # pandas 2.0+, where the old spelling raises AttributeError.
    if s1 is None or s2 is None:
        return float('nan')
    if pd.isnull(s1) or pd.isnull(s2):
        return float('nan')

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    # Anything that is neither text nor bytes is stringified before scoring.
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)
    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Guard the normalisation: an empty input would otherwise divide by zero.
    shorter = min(len(s1), len(s2))
    if shorter == 0:
        return 0.0

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2) / shorter
# Add one column per string-similarity measure, each computed row-wise from
# the 'Sequence1' and 'Sequence2' columns of df.
lev = sm.Levenshtein()
df['Levenshtein'] = df.apply(
    lambda x: lev.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# In[43]:

nw = sm.NeedlemanWunsch()
df['NeedlemanWunsch'] = df.apply(
    lambda x: nw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# In[44]:

sw = sm.SmithWaterman()
df['SmithWaterman'] = df.apply(
    lambda x: sw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()


# ### The following data is used to quickly establish the performance of models (NOT THE FINAL CODE)

# In[45]:

# Pass the path instead of an open() handle: the handle was never closed, so
# buffered rows were not guaranteed to be on disk when the file is read back
# below. With a path, pandas opens, writes and closes the file itself.
df.to_csv("src/data/TokenizedNewData.csv", sep=',')


# In[46]:

dt = pd.read_csv("src/data/TokenizedNewData.csv")


# In[47]:
def compute_simScore(str1, str2):
    """
    Compute a length-normalised Smith-Waterman score between two strings.

    Args:
        str1, str2: The strings to compare; both must support ``len()``.

    Returns:
        The Smith-Waterman raw score divided by the length of the shorter
        string, or 0.0 when either string is empty, since the ratio is
        undefined for a zero-length string.
    """
    # Guard the normalisation: an empty input would otherwise divide by zero.
    shorter = min(len(str1), len(str2))
    if shorter == 0:
        return 0.0

    # compute similarity score between str1 and str2 using Smith Waterman measure
    sw = sm.SmithWaterman()
    return sw.get_raw_score(str1, str2) / shorter
TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True) jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True)) jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True)) dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True)) diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True)) cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True)) cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True)) LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein()) sw = FVC.stringMatchTitles('SW', sm.SmithWaterman()) nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch()) jw = FVC.stringMatchTitles('JW', sm.JaroWinkler()) def writeToCSV(fileName, header, tableList): wr = csv.writer(open(fileName, 'wb'), quoting=csv.QUOTE_ALL) wr.writerow(header) for row in tableList: wr.writerow(row) # Given a set of feature vector components, records precision and recall over several # classifiers. Records output to a table and vertical bar plot.