Example #1
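The snippets in this section are lifted from py_entitymatching's source and rely on module-level imports that are not shown. A minimal header for running them standalone; pd, np, and sm are the usual aliases, while gh is assumed (from the convert_to_str_unicode calls below) to be py_entitymatching's generic_helper module:

import numpy as np
import pandas as pd
import py_stringmatching as sm

# Assumption: gh is the helper module providing convert_to_str_unicode.
import py_entitymatching.utils.generic_helper as gh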
def smith_waterman(s1, s2):
    """
    This function computes the Smith-Waterman measure between the two input
    strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Smith-Waterman measure if both the strings are not missing (i.e.,
        NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.smith_waterman('cat', 'hat')
        2.0
        >>> em.smith_waterman('cat', None)
        nan
    """

    # pd.isnull covers both None and NaN inputs.
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.SmithWaterman()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
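As a sanity check on the docstring value: py_stringmatching's SmithWaterman defaults to a gap cost of 1.0 and an identity character similarity (1.0 for a match, 0.0 otherwise), so the best local alignment of 'cat' and 'hat' is the shared substring 'at', scoring 2.0. A quick sketch against the library directly:

import py_stringmatching as sm

sw = sm.SmithWaterman()  # defaults: gap_cost=1.0, identity match scoring
# Local alignment keeps only the best-scoring region, here 'at' (2 matches).
print(sw.get_raw_score('cat', 'hat'))  # 2.0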
Example #2
def jaro_winkler(s1, s2):
    """
    This function computes the Jaro-Winkler measure between the two input
    strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Jaro-Winkler measure if both the strings are not missing (i.e.,
        NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.jaro_winkler('MARTHA', 'MARHTA')
        0.9611111111111111
        >>> em.jaro_winkler('MARTHA', None)
        nan

    """

    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.JaroWinkler()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
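The docstring value can be reproduced by hand. The Jaro similarity of 'MARTHA' and 'MARHTA' is (6/6 + 6/6 + 5/6) / 3 ≈ 0.9444 (6 matching characters, 1 transposition), and Winkler's adjustment adds l * p * (1 - jaro) for a shared prefix of length l (here 'MAR', l = 3) with the default prefix weight p = 0.1:

import py_stringmatching as sm

jaro = sm.Jaro().get_raw_score('MARTHA', 'MARHTA')         # ~0.944444
print(jaro + 3 * 0.1 * (1 - jaro))                         # 0.961111...
print(sm.JaroWinkler().get_raw_score('MARTHA', 'MARHTA'))  # 0.9611111111111111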
Example #3
def lev_sim(s1, s2):
    """
    This function computes the Levenshtein similarity between the two input
    strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Levenshtein similarity if both the strings are not missing (i.e.,
        NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.lev_sim('alex', 'alxe')
        0.5
        >>> em.lev_sim(None, 'alex')
        nan

    """

    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Levenshtein()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_sim_score(s1, s2)
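Note that this function calls get_sim_score rather than get_raw_score: py_stringmatching normalizes the raw edit distance into a similarity via 1 - dist / max(len(s1), len(s2)). For 'alex' vs 'alxe' the distance is 2 (one substitution per swapped character), so the similarity is 1 - 2/4 = 0.5:

import py_stringmatching as sm

lev = sm.Levenshtein()
print(lev.get_raw_score('alex', 'alxe'))  # 2   (edit distance)
print(lev.get_sim_score('alex', 'alxe'))  # 0.5 (1 - 2 / max(4, 4))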
def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan

    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
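For intuition, the tokenizer behaves like a regular-expression scan for maximal alphanumeric runs. A stand-in using only the standard library (an equivalent sketch, not the library's actual implementation):

import re

def alnum_tokens(s):
    # Maximal runs of consecutive alphanumeric characters.
    return re.findall(r'[a-zA-Z0-9]+', s)

print(alnum_tokens('data9,(science), data9#.(integration).88'))
# ['data9', 'science', 'data9', 'integration', '88']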
def tok_wspace(input_string):
    """
    This function splits the input string into a list of tokens
    (based on whitespace).

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_wspace('data science')
        ['data', 'science']
        >>> em.tok_wspace('data         science')
        ['data', 'science']
        >>> em.tok_wspace(None)
        nan

    """
    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.WhitespaceTokenizer()
    return measure.tokenize(input_string)
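The whitespace tokenizer collapses runs of whitespace, which is why 'data         science' yields two tokens rather than a string of empties. Python's no-argument str.split behaves the same way on these inputs:

s = 'data         science'
print(s.split())     # ['data', 'science'] -- whitespace runs collapsed
print(s.split(' '))  # ['data', '', '', ..., 'science'] -- empties kept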
def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan

    """

    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.DelimiterTokenizer(delim_set=[d])

    return measure.tokenize(input_string)
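For a single delimiter, plain str.split produces the same result on these examples; DelimiterTokenizer generalizes this to a set of delimiter strings via its delim_set argument:

print('data$#$science'.split('$#$'))  # ['data', 'science']
print('data science'.split(' '))      # ['data', 'science']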
def tok_qgram(input_string, q):
    """
    This function splits the input string into a list of q-grams. Note that,
    by default, the input strings are padded before being tokenized.

    Args:
        input_string (string): Input string that should be tokenized.
        q (int): q-val that should be used to tokenize the input string.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_qgram('database', q=2)
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> em.tok_qgram('database', q=3)
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> em.tok_qgram(None, q=2)
        nan
    """

    if pd.isnull(input_string):
        return np.nan

    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.QgramTokenizer(qval=q)

    return measure.tokenize(input_string)
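The padding explains the '#' and '$' characters in the docstring output: for q-grams of size q, the string is padded with q-1 '#' characters on the left and q-1 '$' characters on the right before a window of width q slides across it. A minimal sketch of that scheme (the pad characters match py_stringmatching's defaults):

def padded_qgrams(s, q, prefix_pad='#', suffix_pad='$'):
    # Pad so that every character of s appears in exactly q q-grams.
    padded = prefix_pad * (q - 1) + s + suffix_pad * (q - 1)
    return [padded[i:i + q] for i in range(len(padded) - q + 1)]

print(padded_qgrams('database', 2))
# ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']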
Example #9
def affine(s1, s2):
    """
    This function computes the affine measure between the two input strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The affine measure if both the strings are not missing (i.e., NaN or
        None), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.affine('dva', 'deeva')
        1.5
        >>> em.affine(None, 'deeva')
        nan
    """
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.Affine()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity
    return measure.get_raw_score(s1, s2)
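The docstring value follows from py_stringmatching's defaults (gap opening cost 1.0, gap continuation cost 0.5, identity match scoring): aligning 'dva' to 'deeva' matches d, v, and a (3.0) and opens one gap of length 2 opposite 'ee', costing 1.0 + 0.5, for a total of 3.0 - 1.5 = 1.5:

import py_stringmatching as sm

af = sm.Affine()  # defaults: gap_start=1.0, gap_continuation=0.5
# d-va aligns to deeva with a single length-2 gap opposite 'ee':
# matches d, v, a = 3.0; gap penalty = 1.0 + 0.5 = 1.5
print(af.get_raw_score('dva', 'deeva'))  # 1.5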
Example #11
def hamming_dist(s1, s2):
    """
    This function computes the Hamming distance between the two input
    strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Hamming distance if both the strings are not missing (i.e., NaN),
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.hamming_dist('alex', 'john')
        4
        >>> em.hamming_dist(None, 'john')
        nan

    """

    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.HammingDistance()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the distance
    return measure.get_raw_score(s1, s2)
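Hamming distance simply counts the positions at which two equal-length strings differ; 'alex' and 'john' differ at all four, hence 4. (py_stringmatching raises a ValueError when the lengths differ.) An equivalent sketch for intuition:

def hamming(s1, s2):
    if len(s1) != len(s2):
        raise ValueError('undefined for strings of unequal length')
    # Count the positions where the characters disagree.
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

print(hamming('alex', 'john'))  # 4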
Example #12
def needleman_wunsch(s1, s2):
    """
    This function computes the Needleman-Wunsch measure between the two input
    strings.

    Args:
        s1, s2 (string): The input strings for which the similarity measure
            should be computed.

    Returns:
        The Needleman-Wunsch measure if both the strings are not missing
        (i.e., NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.needleman_wunsch('dva', 'deeva')
        1.0
        >>> em.needleman_wunsch('dva', None)
        nan

    """

    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan

    # Create the similarity measure object
    measure = sm.NeedlemanWunsch()

    s1 = gh.convert_to_str_unicode(s1)
    s2 = gh.convert_to_str_unicode(s2)

    # Call the function to compute the similarity measure
    return measure.get_raw_score(s1, s2)
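Unlike Smith-Waterman, Needleman-Wunsch aligns the strings end to end, so every gap is paid for. With the defaults (gap cost 1.0, identity match scoring), 'dva' vs 'deeva' scores 3 matches minus 2 gap insertions opposite 'ee': 3.0 - 2.0 = 1.0:

import py_stringmatching as sm

nw = sm.NeedlemanWunsch()  # defaults: gap_cost=1.0, identity match scoring
# Global alignment d--va / deeva: matches d, v, a; two gaps opposite 'ee'.
print(nw.get_raw_score('dva', 'deeva'))  # 1.0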