def needleman_wunsch(s1, s2):
    """
    Compute the Needleman-Wunsch measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. A value that is neither text nor bytes is
            converted with ``str()`` before scoring.

    Returns:
        The value of ``sm.NeedlemanWunsch().get_raw_score(s1, s2)`` if
        neither input is missing (i.e. None or NaN), else NaN.

    """
    # The explicit None checks run first so pd.isnull() is only reached for
    # values that are not None.
    if s1 is None or s2 is None or pd.isnull(s1) or pd.isnull(s2):
        # The builtin float NaN replaces `pd.np.NaN`: the `pd.np` alias was
        # removed in pandas 2.0 and `np.NaN` was removed in NumPy 2.0.
        return float('nan')

    # Text and bytes are passed through unchanged; anything else is
    # stringified.
    passthrough_types = six.string_types + (bytes,)
    if not isinstance(s1, passthrough_types):
        s1 = str(s1)
    if not isinstance(s2, passthrough_types):
        s2 = str(s2)

    # Create the similarity measure object and compute the raw score.
    measure = sm.NeedlemanWunsch()
    return measure.get_raw_score(s1, s2)
Exemplo n.º 2
0
def needleman_wunsch(s1, s2):
    """
    Compute the Needleman-Wunsch measure between the two input strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure
            should be computed. Both are passed through
            ``helper.convert_to_str_unicode`` before scoring.

    Returns:
        The value of ``sm.NeedlemanWunsch().get_raw_score(s1, s2)`` if
        neither input is missing (i.e. None or NaN), else NaN.

    """
    # The explicit None checks run first so pd.isnull() is only reached for
    # values that are not None.
    if s1 is None or s2 is None or pd.isnull(s1) or pd.isnull(s2):
        # The builtin float NaN replaces `pd.np.NaN`: the `pd.np` alias was
        # removed in pandas 2.0 and `np.NaN` was removed in NumPy 2.0.
        return float('nan')

    s1 = helper.convert_to_str_unicode(s1)
    s2 = helper.convert_to_str_unicode(s2)

    measure = sm.NeedlemanWunsch()
    return measure.get_raw_score(s1, s2)
Exemplo n.º 3
0
def needleman_wunsch(s1, s2):
    """
    Compute a Needleman-Wunsch based measure between the two input strings.

    Unlike a plain raw-score wrapper, this variant delegates scoring to
    ``nw_norm`` (defined elsewhere; presumably a normalized Needleman-Wunsch
    score -- confirm against its definition).

    Args:
        s1,s2 (string): The input strings for which the measure should be
            computed. A value that is neither text nor bytes is converted
            with ``str()`` before scoring.

    Returns:
        The value of ``nw_norm(s1, s2)`` if neither input is missing (i.e.
        None or NaN), else NaN.

    """
    # The explicit None checks run first so pd.isnull() is only reached for
    # values that are not None.
    if s1 is None or s2 is None or pd.isnull(s1) or pd.isnull(s2):
        # The builtin float NaN replaces `pd.np.NaN`: the `pd.np` alias was
        # removed in pandas 2.0 and `np.NaN` was removed in NumPy 2.0.
        return float('nan')

    # Text and bytes are passed through unchanged; anything else is
    # stringified.
    passthrough_types = six.string_types + (bytes,)
    if not isinstance(s1, passthrough_types):
        s1 = str(s1)
    if not isinstance(s2, passthrough_types):
        s2 = str(s2)

    # The sm.NeedlemanWunsch() instance previously built here was never
    # used, so it is no longer created.
    return nw_norm(s1, s2)
Exemplo n.º 4
0
def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1,s2 (string): The input strings for which the similarity measure should
            be computed. Both are passed through
            ``gh.convert_to_str_unicode`` before scoring.

    Returns:
        The Needleman-Wunsch measure (the value of
        ``sm.NeedlemanWunsch().get_raw_score``) if neither string is missing
        (i.e. None or NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan


    """
    # The explicit None checks run first so pd.isnull() is only reached for
    # values that are not None.
    if s1 is None or s2 is None or pd.isnull(s1) or pd.isnull(s2):
        # The builtin float NaN replaces `pd.np.NaN`: the `pd.np` alias was
        # removed in pandas 2.0 and `np.NaN` was removed in NumPy 2.0.
        return float('nan')

    # Create the similarity measure object
    measure = sm.NeedlemanWunsch()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
Exemplo n.º 5
0
# Notebook cells: each cell adds one column to `df` by applying a string
# measure row-wise to the 'Sequence1' and 'Sequence2' columns, then shows the
# first rows with df.head(). `sm` and `df` are defined outside this excerpt
# (`sm` is presumably py_stringmatching -- confirm against the imports).

# 'Jaro' column: per-row result of Jaro.get_sim_score.
jaro = sm.Jaro()
df['Jaro'] = df.apply(
    lambda x: jaro.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[42]:

# 'Levenshtein' column: per-row result of Levenshtein.get_sim_score.
lev = sm.Levenshtein()
df['Levenshtein'] = df.apply(
    lambda x: lev.get_sim_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[43]:

# 'NeedlemanWunsch' column: note this uses get_raw_score, not get_sim_score,
# so it is not on the same scale as the two columns above.
nw = sm.NeedlemanWunsch()
df['NeedlemanWunsch'] = df.apply(
    lambda x: nw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# In[44]:

# 'SmithWaterman' column: also a raw score (get_raw_score).
sw = sm.SmithWaterman()
df['SmithWaterman'] = df.apply(
    lambda x: sw.get_raw_score(x['Sequence1'], x['Sequence2']), axis=1)
df.head()

# ### The following data is used to quickly establish the performance of models (NOT THE FINAL CODE)

# In[45]:
                        True)
# Feature-vector components built from a label, a similarity measure and
# (for the excerpt matchers) a tokenizer. The "q3" variants use 3-gram
# tokenization, the others whitespace tokenization.
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(),
                              sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(),
                                sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                               sm.WhitespaceTokenizer(return_set=True))
# NOTE(review): this label duplicates the one used for `dice` above; the
# sibling pairs use 'Jacc'/'FuzzJacc' and 'CosMeasure'/'FuzzCosMeasure', so
# 'FuzzDice' was probably intended. Left unchanged because the label's
# downstream use is not visible here -- confirm and rename.
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
# qval=3 is now passed explicitly, matching jacq3/diceq3 and the "q3" in the
# variable name; previously the tokenizer fell back to the library default q.
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(),
                                 sm.QgramTokenizer(qval=3, return_set=True))
# Title matchers take only a label and a measure (no tokenizer).
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    """
    Write a header row followed by data rows to a CSV file, quoting every
    field.

    Args:
        fileName (string): Path of the file to create or overwrite.
        header (iterable): Field values for the first row.
        tableList (iterable): Rows to write after the header; each row is an
            iterable of field values.

    """
    # Text mode with newline='' is what the csv module expects on Python 3;
    # the previous binary mode ('wb') made writerow() raise TypeError there.
    # The with-block also closes the file, which was previously left open.
    with open(fileName, 'w', newline='') as csvFile:
        wr = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
        wr.writerow(header)
        wr.writerows(tableList)


# Given a set of feature vector components, records precision and recall over several
# classifiers.  Records output to a table and vertical bar plot.


def modelExperiment(insampleData,