Example #1
File: soundex.py  Project: Chinmay26/rltk
def _soundex(s):
    """
    The standard used for this implementation is provided by `U.S. Census Bureau <https://www.archives.gov/research/census/soundex.html>`_.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.soundex('ashcraft')
        'A261'
        >>> rltk.soundex('pineapple')
        'P514'
    """

    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()

    CODES = (
        ('BFPV', '1'),
        ('CGJKQSXZ', '2'),
        ('DT', '3'),
        ('L', '4'),
        ('MN', '5'),
        ('R', '6'),
        ('AEIOUHWY', '.')  # placeholder
    )
    CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)

    sdx = s[0]
    for i in xrange(1, len(s)):
        if s[i] not in CODE_DICT:
            continue

        code = CODE_DICT[s[i]]
        if code == '.':
            continue
        if s[i] == s[i - 1]:  # ignore same letter
            continue
        if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code:  # 'side-by-side' rule
            continue
        if s[i - 1] in ('H', 'W') and i - 2 > 0 and \
                s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] != '.':  # consonant separators
            continue

        sdx += code

    sdx = sdx[0:4].ljust(4, '0')

    return sdx
Example #2
def needleman_wunsch_score(s1,
                           s2,
                           match=2,
                           mismatch=-1,
                           gap=-0.5,
                           score_table={}):

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # construct matrix to get max score of all possible alignments
    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in xrange(n1 + 1):
        for j in xrange(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # top row
                dp[i][j] = gap + dp[i][j - 1]
            elif j == 0:  # leftmost column
                dp[i][j] = gap + dp[i - 1][j]
            else:
                dp[i][j] = max(
                    dp[i][j - 1] + gap,
                    dp[i - 1][j] + gap,
                    dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1],
                                                  match, mismatch, score_table))

    return dp[n1][n2]
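The function above is undocumented in this excerpt: it fills a dynamic-programming matrix to compute the classic Needleman-Wunsch global alignment score, charging `gap` for every insertion or deletion and scoring each character pair through the helper `_get_score` (not shown here; presumably it returns `match` for equal characters and `mismatch` otherwise, unless `score_table` overrides that pair). A quick sanity check under that assumption, run in the same module context as the listing:

    print(needleman_wunsch_score('abcd', 'abcd'))  # 8:   four matches at +2 each
    print(needleman_wunsch_score('abcd', 'abd'))   # 5.5: three matches plus one gap at -0.5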
Example #3
def hybrid_jaccard_similarity(set1,
                              set2,
                              threshold=0.5,
                              function=jaro_winkler_similarity,
                              parameters={}):

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    matching_score = []
    for s1 in set1:
        inner = []
        for s2 in set2:
            score = function(s1, s2, **parameters)
            if score < threshold:
                score = 0.0
            inner.append(1.0 - score)  # munkres minimizes total cost, so store 1 - similarity
        matching_score.append(inner)

    indexes = munkres.Munkres().compute(matching_score)

    score_sum, matching_count = 0.0, 0
    for r, c in indexes:
        matching_count += 1
        score_sum += 1.0 - matching_score[r][c]  # go back to similarity

    if len(set1) + len(set2) - matching_count == 0:
        return 1.0
    return float(score_sum) / float(len(set1) + len(set2) - matching_count)
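Also undocumented here: this is the hybrid (soft) Jaccard measure. Every element of set1 is scored against every element of set2 with the inner similarity function (Jaro-Winkler by default), scores below `threshold` are clamped to 0, and munkres (the Hungarian algorithm) picks the one-to-one matching with the highest total similarity; that sum is divided by |set1| + |set2| - (number of matched pairs). A trivially checkable call, assuming the same module context:

    print(hybrid_jaccard_similarity({'apple'}, {'apple'}))  # 1.0: the only pair is an exact match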
Example #4
def _jaccard_index(set1, set2):
    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return float(len(set1 & set2)) / float(len(set1 | set2))
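The Jaccard index above is simply intersection over union of the two input sets, for example (run in the same module context):

    print(_jaccard_index({'a', 'b', 'c'}, {'b', 'c', 'd'}))  # 0.5: |{b, c}| / |{a, b, c, d}| = 2/4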
Example #5
File: dice.py  Project: Chinmay26/rltk
def dice_similarity(set1, set2):

    utils.check_for_none(set1, set2)
    utils.check_for_type(set, set1, set2)

    if len(set1) == 0 or len(set2) == 0:
        return 0

    return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
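Dice similarity counts the shared elements twice against the combined set sizes, so on the same inputs it is never smaller than the Jaccard index above:

    print(dice_similarity({'a', 'b', 'c'}, {'b', 'c', 'd'}))  # 0.6666...: 2 * 2 / (3 + 3)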
Example #6
def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):
    """
    Computes TF/IDF measure. This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms.

    Args:
        bag1 (list): Bag 1.
        bag2 (list): Bag 2.
        df_corpus (dict): Pre-calculated document frequencies of the corpus.
        doc_size (int): Total number of documents in the corpus.
        math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. Defaults to False.

    Returns:
        float: TF/IDF cosine similarity.

    Examples:
        >>> rltk.tfidf(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':1, 'c':1}, 3)
        0.17541160386140586
        >>> rltk.tfidf(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':2, 'c':1}, 4, True)
        0.12977804138
        >>> rltk.tfidf(['a', 'b', 'a'], ['a'], {'a':3, 'b':1, 'c':1}, 3)
        0.5547001962252291
    """
    # http://www.tfidf.com/

    utils.check_for_none(bag1, bag2, df_corpus)
    utils.check_for_type(list, bag1, bag2)

    # term frequency for input strings
    t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
    tf_x = {k: float(v) / len(bag1) for k, v in t_x.iteritems()}
    tf_y = {k: float(v) / len(bag2) for k, v in t_y.iteritems()}

    # unique element
    total_unique_elements = set()
    total_unique_elements.update(bag1)
    total_unique_elements.update(bag2)

    idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

    # tfidf calculation
    for element in total_unique_elements:
        if element not in df_corpus:
            continue
        idf_element = doc_size * 1.0 / df_corpus[element]

        v_x = 0 if element not in tf_x else (
            math.log(idf_element) * tf_x[element] if math_log else idf_element * tf_x[element])
        v_y = 0 if element not in tf_y else (
            math.log(idf_element) * tf_y[element] if math_log else idf_element * tf_y[element])
        v_x_y += v_x * v_y
        v_x_2 += v_x * v_x
        v_y_2 += v_y * v_y

    # cosine similarity
    return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
Example #7
def damerau_levenshtein_distance(s1, s2):
    """
    Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a transposition of two adjacent characters.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.

    Returns:
        float: Damerau Levenshtein Distance.

    Examples:
        >>> rltk.damerau_levenshtein_distance('abcd', 'acbd')
        1
        >>> rltk.damerau_levenshtein_distance('abbd', 'acad')
        2
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    infinite = n1 + n2

    char_arr = defaultdict(int)
    dp = [[0] * (n2 + 2) for _ in xrange(n1 + 2)]

    dp[0][0] = infinite
    for i in xrange(0, n1 + 1):
        dp[i + 1][0] = infinite
        dp[i + 1][1] = i
    for i in xrange(0, n2 + 1):
        dp[0][i + 1] = infinite
        dp[1][i + 1] = i

    for i in xrange(1, n1 + 1):
        db = 0
        for j in xrange(1, n2 + 1):
            i1 = char_arr[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            dp[i + 1][j + 1] = min(
                dp[i][j] + cost, dp[i + 1][j] + 1, dp[i][j + 1] + 1,
                dp[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1))
        char_arr[s1[i - 1]] = i

    return dp[n1 + 1][n2 + 1]
Example #8
File: hamming.py  Project: Chinmay26/rltk
def hamming_distance(s1, s2):

    utils.check_for_none(s1, s2)
    # utils.check_for_type(basestring, s1, s2)

    if type(s1) != type(s2):
        raise TypeError('Different type')

    if isinstance(s1, basestring) and isinstance(s2, basestring):
        s1 = utils.unicode_normalize(s1)
        s2 = utils.unicode_normalize(s2)

    if len(s1) != len(s2):
        raise ValueError('Unequal length')

    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
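Hamming distance counts the positions at which two equal-length sequences differ. Because the type check is relaxed here (only None values and mismatched types are rejected), the function accepts any pair of equal-length sequences of the same type, not just strings:

    print(hamming_distance('karolin', 'kathrin'))  # 3: differs at indices 2, 3 and 4
    print(hamming_distance([1, 2, 3], [1, 2, 9]))  # 1: equal-length lists work as well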
Example #9
def monge_elkan_similarity(bag1,
                           bag2,
                           function=jaro_winkler_similarity,
                           parameters={}):

    utils.check_for_none(bag1, bag2)
    utils.check_for_type(list, bag1, bag2)

    if len(bag1) == 0:
        return 0.0

    score_sum = 0
    for ele1 in bag1:
        max_score = MIN_FLOAT
        for ele2 in bag2:
            max_score = max(max_score, function(ele1, ele2, **parameters))
        score_sum += max_score

    return float(score_sum) / float(len(bag1))
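Monge-Elkan takes, for each element of bag1, its best inner-similarity score against any element of bag2 (Jaro-Winkler by default) and averages those maxima, which makes the measure asymmetric in its arguments; MIN_FLOAT is a module-level constant not shown in this excerpt. A minimal check, assuming the same module context:

    print(monge_elkan_similarity(['a', 'b'], ['a', 'b']))  # 1.0: every element finds an exact match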
Example #10
File: jaro.py  Project: Chinmay26/rltk
def _jaro_distance(s1, s2):
    # code from https://github.com/nap/jaro-winkler-distance
    # Copyright Jean-Bernard Ratte

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    shorter, longer = s1.lower(), s2.lower()

    if len(s1) > len(s2):
        longer, shorter = shorter, longer

    m1 = _get_matching_characters(shorter, longer)
    m2 = _get_matching_characters(longer, shorter)

    if len(m1) == 0 or len(m2) == 0:
        return 0.0

    return (float(len(m1)) / len(shorter) + float(len(m2)) / len(longer) +
            float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0
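Despite the `_distance` suffix, this returns the Jaro similarity: 1.0 for identical strings and 0.0 when the strings share no matching characters. The helpers `_get_matching_characters` and `_transpositions` are not shown, so the values below assume the standard Jaro definitions:

    print(_jaro_distance('abc', 'abc'))  # 1.0: all characters match, no transpositions
    print(_jaro_distance('abc', 'xyz'))  # 0.0: no matching characters at all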
Example #11
def _nysiis(s):
    """
    The New York State Identification and Intelligence System (NYSIIS) Phonetic Code is a phonetic algorithm devised in 1970 by the New York State Identification and Intelligence System.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.nysiis('ashcraft')
        'ASCRAFT'
        >>> rltk.nysiis('pineapple')
        'PANAPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith('MAC'):
        s = 'MCC' + s[3:]
    elif s.startswith('KN'):
        s = s[1:]
    elif s.startswith('K'):
        s = 'C' + s[1:]
    elif s.startswith(('PH', 'PF')):
        s = 'FF' + s[2:]
    elif s.startswith('SCH'):
        s = 'SSS' + s[3:]

    # step 2 - suffixes
    if s.endswith(('IE', 'EE')):
        s = s[:-2] + 'Y'
    elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
        s = s[:-2] + 'D'

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V':
            ch = 'AF'
            i += 1
        elif ch in 'AEIOU':
            ch = 'A'
        elif ch == 'Q':
            ch = 'G'
        elif ch == 'Z':
            ch = 'S'
        elif ch == 'M':
            ch = 'N'
        elif ch == 'K':
            if i + 1 < len(s) and s[i + 1] == 'N':
                ch = 'N'
            else:
                ch = 'C'
        elif ch == 'S' and s[i + 1:i + 3] == 'CH':
            ch = 'SS'
            i += 2
        elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H':
            ch = 'F'
            i += 1
        elif ch == 'H' and (s[i - 1] not in 'AEIOU' or
                            (i + 1 < len(s) and s[i + 1] not in 'AEIOU')):
            if s[i - 1] in 'AEIOU':
                ch = 'A'
            else:
                ch = s[i - 1]
        elif ch == 'W' and s[i - 1] in 'AEIOU':
            ch = s[i - 1]

        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = ''.join(key)

    # step 5 - remove trailing S
    if key.endswith('S') and key != 'S':
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith('AY'):
        key = key[:-2] + 'Y'

    # step 7 - remove trailing A
    if key.endswith('A') and key != 'A':
        key = key[:-1]

    # step 8 was already done

    return key
Example #12
def _metaphone(s):
    """
    Metaphone fundamentally improves on the Soundex algorithm by using information about variations and inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does a better job of matching words and names which sound similar. As with Soundex, similar-sounding words should share the same keys. Metaphone is available as a built-in operator in a number of systems.

    Args:
        s (str): Sequence.

    Returns:
        str: Coded sequence.

    Examples:
        >>> rltk.metaphone('ashcraft')
        'AXKRFT'
        >>> rltk.metaphone('pineapple')
        'PNPL'
    """
    # code from https://github.com/jamesturk/jellyfish
    # Copyright (c) 2015, James Turk
    # Copyright (c) 2015, Sunlight Foundation
    # All rights reserved.

    utils.check_for_none(s)
    utils.check_for_type(basestring, s)

    s = utils.unicode_normalize(s)

    if len(s) == 0:
        raise ValueError('Empty string')

    s = s.lower()
    result = []

    # skip first character if s starts with these
    if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        next = s[i+1] if i < len(s)-1 else '*****'
        nextnext = s[i+2] if i < len(s)-2 else '*****'

        # skip doubles except for cc
        if c == next and c != 'c':
            i += 1
            continue

        if c in 'aeiou':
            if i == 0 or s[i-1] == ' ':
                result.append(c)
        elif c == 'b':
            if (not (i != 0 and s[i-1] == 'm')) or next:
                result.append('b')
        elif c == 'c':
            if next == 'i' and nextnext == 'a' or next == 'h':
                result.append('x')
                i += 1
            elif next in 'iey':
                result.append('s')
                i += 1
            else:
                result.append('k')
        elif c == 'd':
            if next == 'g' and nextnext in 'iey':
                result.append('j')
                i += 2
            else:
                result.append('t')
        elif c in 'fjlmnr':
            result.append(c)
        elif c == 'g':
            if next in 'iey':
                result.append('j')
            elif next not in 'hn':
                result.append('k')
            elif next == 'h' and nextnext and nextnext not in 'aeiou':
                i += 1
        elif c == 'h':
            if i == 0 or next in 'aeiou' or s[i-1] not in 'aeiou':
                result.append('h')
        elif c == 'k':
            if i == 0 or s[i-1] != 'c':
                result.append('k')
        elif c == 'p':
            if next == 'h':
                result.append('f')
                i += 1
            else:
                result.append('p')
        elif c == 'q':
            result.append('k')
        elif c == 's':
            if next == 'h':
                result.append('x')
                i += 1
            elif next == 'i' and nextnext in 'oa':
                result.append('x')
                i += 2
            else:
                result.append('s')
        elif c == 't':
            if next == 'i' and nextnext in 'oa':
                result.append('x')
            elif next == 'h':
                result.append('0')
                i += 1
            elif next != 'c' or nextnext != 'h':
                result.append('t')
        elif c == 'v':
            result.append('f')
        elif c == 'w':
            if i == 0 and next == 'h':
                i += 1
            if nextnext in 'aeiou' or nextnext == '*****':
                result.append('w')
        elif c == 'x':
            if i == 0:
                if next == 'h' or (next == 'i' and nextnext in 'oa'):
                    result.append('x')
                else:
                    result.append('s')
            else:
                result.append('k')
                result.append('s')
        elif c == 'y':
            if next in 'aeiou':
                result.append('y')
        elif c == 'z':
            result.append('s')
        elif c == ' ':
            if len(result) > 0 and result[-1] != ' ':
                result.append(' ')

        i += 1

    return ''.join(result).upper()
Example #13
def levenshtein_distance(s1,
                         s2,
                         insert={},
                         delete={},
                         substitute={},
                         insert_default=1,
                         delete_default=1,
                         substitute_default=1):
    """
    The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.

    Args:
        s1 (str): Sequence 1.
        s2 (str): Sequence 2.
        insert (dict(str, int), optional): Insert cost of characters. Defaults to empty dict.
        delete (dict(str, int), optional): Delete cost of characters. Defaults to empty dict.
        substitute (dict(str, dict(str, int)), optional): Substitute cost of characters. Defaults to empty dict.
        insert_default (int, optional): Default value of insert cost. Defaults to 1.
        delete_default (int, optional): Default value of delete cost. Defaults to 1.
        substitute_default (int, optional): Default value of substitute cost. Defaults to 1.

    Returns:
        int: Levenshtein Distance.

    Examples:
        >>> rltk.levenshtein_distance('ab', 'abc')
        1
        >>> rltk.levenshtein_distance('a', 'abc', insert = {'c':50},
        ... insert_default=100, delete_default=100, substitute_default=100)
        150
    """

    utils.check_for_none(s1, s2)
    utils.check_for_type(basestring, s1, s2)

    s1 = utils.unicode_normalize(s1)
    s2 = utils.unicode_normalize(s2)

    n1, n2 = len(s1), len(s2)
    if n1 == 0 and n2 == 0:
        return 0

    # if n1 == 0 or n2 == 0:
    #     return max(n1, n2)

    dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
    for i in xrange(n1 + 1):
        for j in xrange(n2 + 1):
            if i == 0 and j == 0:  # [0,0]
                continue
            elif i == 0:  # top row
                c = s2[j - 1]
                dp[i][j] = insert[c] if c in insert else insert_default
                dp[i][j] += dp[i][j - 1]
            elif j == 0:  # leftmost column
                c = s1[i - 1]
                dp[i][j] = delete[c] if c in delete else delete_default
                dp[i][j] += dp[i - 1][j]
            else:
                c1, c2 = s1[i - 1], s2[j - 1]
                insert_cost = insert[c2] if c2 in insert else insert_default
                delete_cost = delete[c1] if c1 in delete else delete_default
                substitute_cost = substitute[c1][c2] \
                    if c1 in substitute and c2 in substitute[c1] else substitute_default

                if c1 == c2:
                    dp[i][j] = dp[i - 1][j - 1]
                else:
                    dp[i][j] = min(dp[i][j - 1] + insert_cost,
                                   dp[i - 1][j] + delete_cost,
                                   dp[i - 1][j - 1] + substitute_cost)
    return dp[n1][n2]