def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The Smith-Waterman measure if neither of the input strings is missing
        (i.e., None or NaN); otherwise, returns NaN.
    """

    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # if isinstance(s1, six.string_types):
    #     s1 = gh.remove_non_ascii(s1)
    # if isinstance(s2, six.string_types):
    #     s2 = gh.remove_non_ascii(s2)
    # Create the similarity measure object
    measure = sm.SmithWaterman()
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)

    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
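
A minimal usage sketch for the excerpt above (not part of the original source). The snippet assumes module-level imports roughly like the ones below, and the expected outputs assume py_stringmatching's default Smith-Waterman scoring.

import pandas as pd
import numpy as np
import six
import py_stringmatching as sm

print(smith_waterman('data', 'dave'))   # 2.0: the local alignment 'da'/'da'
print(smith_waterman('data', None))     # nan: one input is missing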
Example #2
def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed.

    Returns:
        The Smith-Waterman measure if neither of the input strings is missing
        (i.e., None or NaN); otherwise, returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """

    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
Example #3
def smith_waterman(s1, s2):
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.SmithWaterman()
    return measure.get_raw_score(s1, s2)
Example #4
def smith_waterman(s1, s2):
    if s1 is None or s2 is None:
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    # if isinstance(s1, six.string_types):
    #     s1 = gh.remove_non_ascii(s1)
    # if isinstance(s2, six.string_types):
    #     s2 = gh.remove_non_ascii(s2)
    # Create the similarity measure object
    measure = sm.SmithWaterman()
    if not (isinstance(s1, six.string_types) or isinstance(s1, bytes)):
        s1 = str(s1)

    if not (isinstance(s2, six.string_types) or isinstance(s2, bytes)):
        s2 = str(s2)

    # Compute the similarity measure and normalize it by the length of the
    # shorter string
    return measure.get_raw_score(s1, s2) / min(len(s1), len(s2))
Example #5
lev = sm.Levenshtein()
df['Levenshtein'] = df.apply(
    lambda x: lev.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[43]:

nw = sm.NeedlemanWunsch()
df['NeedlemanWunsch'] = df.apply(
    lambda x: nw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[44]:

sw = sm.SmithWaterman()
df['SmithWaterman'] = df.apply(
    lambda x: sw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()
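
A self-contained sketch of the pattern used in the cells above (not from the original notebook; the toy DataFrame and its values are made up, standing in for the notebook's df with its Sequence1 and Sequence2 columns).

import pandas as pd
import py_stringmatching as sm

toy = pd.DataFrame({'Sequence1': ['gattaca', 'cat'],
                    'Sequence2': ['gatacca', 'hat']})

sw = sm.SmithWaterman()
# get_raw_score gives the unnormalized local-alignment score for each row
toy['SmithWaterman'] = toy.apply(
    lambda x: sw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
print(toy)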

# ### The following data is used to quickly establish the performance of models (NOT THE FINAL CODE)

# In[45]:

df.to_csv("src/data/TokenizedNewData.csv", sep=',')

# In[46]:

dt = pd.read_csv("src/data/TokenizedNewData.csv")
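
Note (not in the original notebook): the frame above was written with its index, so reading it back as shown adds the old index as an unnamed first column; passing index_col=0 restores the original shape.

# Alternative read that drops the stray index column (suggested, not in the original)
dt = pd.read_csv("src/data/TokenizedNewData.csv", index_col=0)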

Example #6
def compute_simScore(str1, str2):
    # Compute the normalized Smith-Waterman score between str1 and str2
    # (raw local-alignment score divided by the length of the shorter string)
    sw = sm.SmithWaterman()
    return sw.get_raw_score(str1, str2) / min(len(str1), len(str2))
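
A usage sketch for compute_simScore (not from the original source). With py_stringmatching's default scoring the raw Smith-Waterman score is at most the length of the shorter string, so the normalized value falls between 0 and 1; the module is assumed to import py_stringmatching as sm.

import py_stringmatching as sm

print(compute_simScore('data matching', 'data'))  # 1.0: 'data' aligns exactly
print(compute_simScore('cat', 'dog'))             # 0.0: no characters align
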
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(return_set=True))
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    # Write the header row followed by every table row, quoting all fields
    with open(fileName, 'w', newline='') as f:
        wr = csv.writer(f, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        for row in tableList:
            wr.writerow(row)
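
A hypothetical call to writeToCSV, for illustration only (the file name, header, and rows below are made up; the module is assumed to import csv).

writeToCSV('classifier_scores.csv',
           ['classifier', 'precision', 'recall'],
           [['DecisionTree', 0.91, 0.87],
            ['RandomForest', 0.93, 0.90]])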


# Given a set of feature vector components, record precision and recall over several
# classifiers, writing the output to a table and a vertical bar plot.