Exemplo n.º 1
0
def token_similarity(a, b):
    """Score how well token ``a`` lines up with token ``b``.

    Both arguments must expose a ``word`` string attribute.

    Returns:
        1.0 when the words are equal ignoring case and surrounding
        whitespace, -1.0 when whitespace or punctuation is paired with a
        different kind of token, 0.9 for a punctuation pair or a phonetic
        match, and otherwise the ``jf.jaro_winkler`` score of the two words.
    """
    left, right = a.word, b.word

    # Case-insensitive match; any whitespace matches any whitespace because
    # both sides strip down to the empty string.
    if left.lower().strip() == right.lower().strip():
        return 1.

    # Exactly one side is whitespace: forbid the pairing.
    if bool(isspace(left)) != bool(isspace(right)):
        return -1.

    # Punctuation may pair with punctuation, never with a word.
    left_is_punc = ispunc(left)
    right_is_punc = ispunc(right)
    if left_is_punc and right_is_punc:
        return 0.9
    if left_is_punc or right_is_punc:
        return -1.

    # Strings sound alike (approximate phonetic match).
    if left.isalpha() and right.isalpha():
        for encoder_name in ("metaphone", "soundex", "nysiis",
                             "match_rating_codex"):
            encode = getattr(jf, encoder_name)
            if encode(left) == encode(right):
                return 0.9

    # Fall back to the Jaro-Winkler score.
    return jf.jaro_winkler(left, right)
Exemplo n.º 2
0
def augment_data(df: pd.DataFrame) -> pd.DataFrame:
    """Augment dataframe of FEBRL person data with blocking keys and cleanup for
    comparison step.

    The dataframe is modified in place (new columns are added and
    ``surname``, ``first_name`` and ``soc_sec_id`` are overwritten) and the
    same object is returned.

    Args:
        df: pandas dataframe containing FEBRL-generated person data. The
            columns ``surname``, ``first_name``, ``soc_sec_id`` and
            ``date_of_birth`` are read.

    Returns:
        Augmented dataframe.
    """

    # Missing names become empty strings so the phonetic encoders always
    # receive a str.
    df["surname"] = df["surname"].fillna("")
    df["first_name"] = df["first_name"].fillna("")

    # Soundex phonetic encodings.
    df["soundex_surname"] = df["surname"].apply(jellyfish.soundex)
    df["soundex_firstname"] = df["first_name"].apply(jellyfish.soundex)

    # NYSIIS phonetic encodings.
    df["nysiis_surname"] = df["surname"].apply(jellyfish.nysiis)
    df["nysiis_firstname"] = df["first_name"].apply(jellyfish.nysiis)

    # Last 3 of SSID, left-padded with zeros; falsy ids map to None.
    df["ssid_last3"] = df["soc_sec_id"].apply(
        lambda x: str(x)[-3:].zfill(3) if x else None)
    df["soc_sec_id"] = df["soc_sec_id"].astype(str)

    # DOB to date object.
    df["dob"] = df["date_of_birth"].apply(dob_to_date)

    # The original fell off the end and returned None despite the documented
    # return value.
    return df
Exemplo n.º 3
0
def concatWords(orSentWords, comSentWords):
    """Check every word of a sentence for merged runs of consecutive words.

    Walks ``orSentWords`` and, wherever the word at the same position in
    ``comSentWords`` (or the one after it) differs, hands the pair of
    neighbouring ``comSentWords`` entries to ``isSumma2WordsTheBest``.

    Returns:
        The corrected list of sentence words (``comSentWords``, possibly
        replaced by what ``isSumma2WordsTheBest`` returned).
    """
    initialLen = len(comSentWords)
    lastOrIndex = len(orSentWords) - 1
    handled = 1
    for idx, word in enumerate(orSentWords):
        # Need a word at idx and idx + 1 in the compared sentence.
        if idx >= initialLen - handled:
            continue
        # Nothing to do when both neighbouring words already equal this one.
        if word == comSentWords[idx] and word == comSentWords[idx + 1]:
            continue

        baseCode = nysiis(unicode(orSentWords[idx])).replace("'", '')
        firstCode = nysiis(unicode(comSentWords[idx])).replace("'", '')
        secondCode = nysiis(unicode(comSentWords[idx + 1])).replace("'", '')
        # Skip when the reference code is shorter than both candidate codes.
        if len(baseCode) < len(firstCode) and len(baseCode) < len(secondCode):
            continue

        # For every word but the last, isTheSameWords can veto the merge.
        if idx < lastOrIndex and isTheSameWords(idx, orSentWords,
                                                comSentWords):
            continue

        comSentWords = isSumma2WordsTheBest(word, comSentWords[idx],
                                            comSentWords[idx + 1], idx,
                                            comSentWords)
        handled += 1

    return comSentWords
Exemplo n.º 4
0
def CleanVillageNames():
    """Give villages without an MCTS id the id of their closest phonetic match.

    For every ``SubCenter``, the addresses reachable through its
    beneficiaries are split into villages that have a ``village_mcts_id``
    and villages that do not. Each village without an id is compared, by the
    Jaro measure of the NYSIIS codes of the village names, with every
    village that has one. Unless the best match has nothing in common, the
    id-less village gets that match's ``village_mcts_id``, has ``'_m'``
    appended to its ``value`` and is saved.
    """
    import jellyfish

    for subc in SubCenter.objects.all():
        villages = Address.objects.filter(beneficiaries__subcenter=subc).distinct()
        nl_vills = villages.filter(village_mcts_id=None)
        # Materialise the queryset once: the index found below must refer to
        # the same row the phonetic code was computed from, and re-slicing an
        # unordered queryset gives no such guarantee.
        l_vills = list(villages.exclude(village_mcts_id=None))
        if not l_vills:
            # Nothing to match against for this subcenter.
            continue
        phonetic_codes = [jellyfish.nysiis(l_vill.village) for l_vill in l_vills]

        # Match the non-legitimate ones.
        for nl_vill in nl_vills:
            pc = jellyfish.nysiis(nl_vill.village)
            min_dist = 100
            min_ind = 0
            for ind, spc in enumerate(phonetic_codes):
                # jaro_distance returns a similarity (1.0 means identical), so
                # convert it to a distance before minimising. Minimising the
                # raw value selected the least similar village.
                dist = 1.0 - jellyfish.jaro_distance(spc, pc)
                if dist <= min_dist:
                    min_ind = ind
                    min_dist = dist
            # A distance of 1.0 means the codes have nothing in common.
            if min_dist < 1.0:
                match_vill = l_vills[min_ind]
                nl_vill.village_mcts_id = match_vill.village_mcts_id
                nl_vill.value = nl_vill.value + '_m'
                nl_vill.save()
def extract_features(word1, word2, lang1, lang2):
    """Build the similarity feature dict for a pair of words.

    ``lang1`` and ``lang2`` are only used to transliterate ``word1`` and
    ``word2`` through ``get_translit`` for the ``'epitran'`` feature.

    Returns:
        dict with the keys ``lcsr``, ``PREFIX``, ``dice_coefficient``,
        ``soundex``, ``nysiis`` and ``epitran``.
    """
    features = {}
    features['lcsr'] = lcsr(word1, word2)
    features['PREFIX'] = prefix([word1, word2])
    features['dice_coefficient'] = dice_coefficient(word1, word2)
    features['soundex'] = soundex.Soundex().compare(word1, word2)

    # The remaining features compare encoded forms of the words with lcsr.
    nysiis_1 = jellyfish.nysiis(word1)
    nysiis_2 = jellyfish.nysiis(word2)
    features['nysiis'] = lcsr(nysiis_1, nysiis_2)

    translit_1 = get_translit(lang1, word1)
    translit_2 = get_translit(lang2, word2)
    features['epitran'] = lcsr(translit_1, translit_2)
    return features
def extract_features(lang1, word1, lang2, word2):
    """Build the similarity feature dict for a pair of words.

    ``lang1`` and ``lang2`` are only used to transliterate ``word1`` and
    ``word2`` through ``get_translit`` for the ``'epitran'`` feature.

    Returns:
        dict with the keys ``lcsr``, ``PREFIX``, ``dice_coefficient``,
        ``soundex``, ``nysiis`` and ``epitran``.
    """
    # Features computed directly on the raw words.
    raw_lcsr = lcsr(word1, word2)
    shared_prefix = PREFIX(word1, word2)
    dice = dice_coefficient(word1, word2)
    soundex_match = soundex.compare(word1, word2)

    # Features computed on encoded forms of the words.
    nysiis_lcsr = lcsr(nysiis(word1), nysiis(word2))
    translit_lcsr = lcsr(get_translit(lang1, word1),
                         get_translit(lang2, word2))

    return {
        'lcsr': raw_lcsr,
        'PREFIX': shared_prefix,
        'dice_coefficient': dice,
        'soundex': soundex_match,
        'nysiis': nysiis_lcsr,
        'epitran': translit_lcsr,
    }
def test_jellyfish():
    """Print jellyfish scores for two sample listings, then exit.

    Prints the ``jaro_distance`` and ``jaro_winkler`` results for the raw
    texts and for their ``normalize``d, space-joined forms, followed by the
    NYSIIS code of each raw text. Ends by calling ``exit()``.
    """
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    text_norm1 = ' '.join(normalize(text1))
    text_norm2 = ' '.join(normalize(text2))

    # Each metric is reported for the raw pair first, then the normalised pair.
    for metric_name in ('jaro_distance', 'jaro_winkler'):
        metric = getattr(jellyfish, metric_name)
        print(metric(text1, text2))
        print(metric(text_norm1, text_norm2))

    for raw_text in (text1, text2):
        print(jellyfish.nysiis(raw_text))
    exit()
def test_jellyfish():
    """Print jellyfish scores for two sample listings, then exit.

    Prints the ``jaro_distance`` and ``jaro_winkler`` results for the raw
    texts and for their ``normalize``d, space-joined forms, followed by the
    NYSIIS code of each raw text. Ends by calling ``exit()``.
    """
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    text_norm1 = ' '.join(normalize(text1))
    text_norm2 = ' '.join(normalize(text2))

    # Each metric is reported for the raw pair first, then the normalised pair.
    for metric_name in ('jaro_distance', 'jaro_winkler'):
        metric = getattr(jellyfish, metric_name)
        print(metric(text1, text2))
        print(metric(text_norm1, text_norm2))

    for raw_text in (text1, text2):
        print(jellyfish.nysiis(raw_text))
    exit()
Exemplo n.º 9
0
def phonetic(s, method):
    """
    Phonetically encode the values in the Series.

    :param s: The values to encode. They are passed through ``clean`` (which
        is asked to drop everything except letters, digits, ``-`` and ``_``)
        and upper-cased before encoding.
    :type s: pandas.Series
    :param method: The algorithm that is used to phonetically encode the values. The possible options are 'soundex' (`wikipedia <https://en.wikipedia.org/wiki/Soundex>`_) and 'nysiis' (`wikipedia <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_).
    :type method: str

    :return: A Series with phonetic encoded values. Null values stay NaN.
    :rtype: pandas.Series

    :raises ValueError: If ``method`` is neither 'soundex' nor 'nysiis'.
    :raises ImportError: If the package 'jellyfish' is not installed.

    .. note::

        The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'.
        It can be installed with pip (``pip install jellyfish``).

    """

    # Reject unknown methods up front, before any work is done.
    if method not in ('soundex', 'nysiis'):
        raise ValueError("Phonetic encoding method not found")

    # Fail loudly when the dependency is missing. Printing and carrying on
    # only led to a confusing NameError further down.
    try:
        import jellyfish
    except ImportError:
        raise ImportError("Install jellyfish to use string encoding.")

    # Raw string: '\-' and '\_' are not valid escapes in a normal literal.
    s = clean(s, replace_by_none=r'[^\-\_A-Za-z0-9]+')

    encode = jellyfish.soundex if method == 'soundex' else jellyfish.nysiis
    return s.str.upper().apply(
        lambda x: encode(x) if pandas.notnull(x) else np.nan)
Exemplo n.º 10
0
def fuzzy(string):
    """Return ``jsonify`` of the four jellyfish phonetic codes of ``string``.

    The payload maps each of ``metaphone``, ``soundex``, ``nysiis`` and
    ``match_rating_codex`` to the result of the jellyfish function of the
    same name.
    """
    encodings = {}
    for algorithm in ("metaphone", "soundex", "nysiis", "match_rating_codex"):
        encode = getattr(jellyfish, algorithm)
        encodings[algorithm] = encode(string)
    return jsonify(encodings)
Exemplo n.º 11
0
def simple_example():
    """Print every jellyfish comparison and phonetic encoding for sample words.

    The comparison functions are applied to 'jellyfish' / 'smellyfish' and
    the phonetic encoders to 'Jellyfish'. One line is printed per function.
    """
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    comparison_report = (
        ("jellyfish.levenshtein_distance({}, {}) = {}.",
         "levenshtein_distance"),
        ("jellyfish.damerau_levenshtein_distance({}, {}) = {}.",
         "damerau_levenshtein_distance"),
        ("jellyfish.hamming_distance({}, {}) = {}.", "hamming_distance"),
        ("jellyfish.jaro_distance({}, {}) = {}.", "jaro_distance"),
        ("jellyfish.jaro_similarity({}, {}) = {}.", "jaro_similarity"),
        ("jellyfish.jaro_winkler({}, {}) = {}.", "jaro_winkler"),
        ("jellyfish.jaro_winkler_similarity({}, {}) = {}.",
         "jaro_winkler_similarity"),
        ("jellyfish.match_rating_comparison({}, {}) = {}.",
         "match_rating_comparison"),
    )
    for template, function_name in comparison_report:
        compare = getattr(jellyfish, function_name)
        print(template.format(str1, str2, compare(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    encoding_report = (
        ("jellyfish.metaphone({}) = {}.", "metaphone"),
        ("jellyfish.soundex({}) = {}.", "soundex"),
        ("jellyfish.nysiis({}) = {}.", "nysiis"),
        ("jellyfish.match_rating_codex({}) = {}.", "match_rating_codex"),
    )
    for template, function_name in encoding_report:
        encode = getattr(jellyfish, function_name)
        print(template.format(ss, encode(ss)))
Exemplo n.º 12
0
def _word_similarity_score(a, b):
    if a == b:
        return 1.

    # Case and whitespace insenstive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95

    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
        (not _isspace(a) and _isspace(b))):
        return 0

    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Penalize punctuation matching to non-punctuation.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if ((_ispunc(a) and not _ispunc(b)) or
        (not _ispunc(a) and _ispunc(b))):
        return 0

    # Problems with phonetic match functions segfaulting on
    # empty strings. Also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([ c for c in a if c.isalpha() ])
    b_alpha = u''.join([ c for c in b if c.isalpha() ])
    if a_alpha == '' and b_alpha == '':
        return 0.85

    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9

    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
Exemplo n.º 13
0
    def test_nysiis(self):
        """Check ``jellyfish.nysiis`` against known (input, expected) pairs."""
        expectations = (
            ("Worthy", "WARTY"),
            ("Ogata", "OGAT"),
            ("montgomery", "MANTGANARY"),
            ("Costales", "CASTAL"),
            ("Tu", "T"),
        )

        for word, expected_code in expectations:
            actual_code = jellyfish.nysiis(word)
            self.assertEqual(actual_code, expected_code)
Exemplo n.º 14
0
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with the measure selected by ``method``.

    Method code to method name:
        1. Jaro-Winkler (``jellyfish.jaro_winkler``)
        2. Damerau-Levenshtein distance, rescaled to
           ``1 - distance / max(len(s1), len(s2))``
        3. Metaphone
        4. NYSIIS
        5. Match rating codex

    Methods 1 and 2 compare the strings themselves; methods 3, 4 and 5
    compare phonetic encodings and can only answer 1 (match) or
    0 (no match).

    :param s1: first string
    :param s2: second string
    :param method: integer method code (1-5)
    :return: the score described above; 0 when either string is empty,
        when ``method`` is not a known code, or when method 2 fails.
    '''
    result = 0

    # An empty string never matches anything.
    if s1 == '' or s2 == '':
        return result

    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # float() keeps this a true division on Python 2 as well.
            result = 1 - (float(diff) / max(len(s1), len(s2)))
        except Exception:
            # Best effort: treat a failed comparison as "no similarity",
            # but let KeyboardInterrupt/SystemExit propagate.
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0

    return result
Exemplo n.º 15
0
def wordsRightOrder(maxSentWds, sentWds):
    '''
    Correct the word order of a sentence.

    The sentence is compared against the longest sentence, and an empty
    string '' is inserted where a word appears to be missing. ``sentWds``
    is modified in place.

    :param maxSentWds: words of the longest sentence
    :param sentWds: list of words of the sentence being corrected
    :return: sentWds: list
    '''
    longest = len(maxSentWds)

    # Only a sentence shorter than the longest one can have missing words.
    if longest <= len(sentWds):
        return sentWds

    # The last word of the longest sentence has no successor to compare with.
    for pos in range(longest - 1):
        if pos >= len(sentWds):
            break
        current = sentWds[pos]
        if current == maxSentWds[pos]:
            continue
        currentCode = nysiis(unicode(current))
        distHere = levenshtein(currentCode, nysiis(unicode(maxSentWds[pos])))
        distNext = levenshtein(currentCode,
                               nysiis(unicode(maxSentWds[pos + 1])))
        # The word sounds closer to the next reference word, so a word is
        # assumed to be missing at this position.
        if distHere > distNext:
            sentWds.insert(pos, '')

    return sentWds
Exemplo n.º 16
0
def wordsRightOrder(maxSentWds, sentWds):
    '''
    Correct the word order of a sentence.

    The sentence is compared against the longest sentence, and an empty
    string '' is inserted where a word appears to be missing. ``sentWds``
    is modified in place.

    :param maxSentWds: words of the longest sentence
    :param sentWds: list of words of the sentence being corrected
    :return: sentWds: list
    '''
    longest = len(maxSentWds)

    # Only a sentence shorter than the longest one can have missing words.
    if longest <= len(sentWds):
        return sentWds

    # The last word of the longest sentence has no successor to compare with.
    for pos in range(longest - 1):
        if pos >= len(sentWds):
            break
        current = sentWds[pos]
        if current == maxSentWds[pos]:
            continue
        currentCode = nysiis(unicode(current))
        distHere = levenshtein(currentCode, nysiis(unicode(maxSentWds[pos])))
        distNext = levenshtein(currentCode,
                               nysiis(unicode(maxSentWds[pos + 1])))
        # The word sounds closer to the next reference word, so a word is
        # assumed to be missing at this position.
        if distHere > distNext:
            sentWds.insert(pos, '')

    return sentWds
Exemplo n.º 17
0
def featurize(df):
    """Add string-similarity feature columns for the first two columns of ``df``.

    The first two columns are renamed to ``a`` and ``b`` (a third column, if
    it is the only other one, becomes ``target``). ``TM_A`` / ``TM_B`` hold
    the unidecoded, lower-cased values with every non-letter removed, and
    every feature is computed row by row from that pair.

    Returns:
        The dataframe with the feature columns added.
    """
    column_count = len(df.columns)
    if column_count == 3:
        df.columns = ['a', 'b', 'target']
    elif column_count == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    def letters_only(text):
        # ASCII-fold, lower-case, then keep letters only.
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(text).lower())

    def per_row(func):
        # Apply func(TM_A, TM_B) to every row.
        return df.apply(lambda row: func(row.TM_A, row.TM_B), axis=1)

    def same_code(encode):
        # 1 when both cleaned strings get the same code, else 0.
        return df.apply(
            lambda row: 1 if encode(row.TM_A) == encode(row.TM_B) else 0,
            axis=1)

    df['TM_A'] = df.apply(lambda row: letters_only(row['a']), axis=1)
    df['TM_B'] = df.apply(lambda row: letters_only(row['b']), axis=1)

    df['partial'] = per_row(fuzz.partial_ratio)
    df['tkn_sort'] = per_row(fuzz.token_sort_ratio)
    df['tkn_set'] = per_row(fuzz.token_set_ratio)

    df['sum_ipa'] = per_row(sum_ipa)

    # Jellyfish levenshtein, rescaled with MinMaxScaler.
    df['levenshtein'] = per_row(jellyfish.levenshtein_distance)
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(
        df['levenshtein'].values.reshape(-1, 1))

    # Jellyfish phoneme
    df['metaphone'] = same_code(jellyfish.metaphone)
    df['nysiis'] = same_code(jellyfish.nysiis)
    df['mtch_rtng_cdx'] = same_code(jellyfish.match_rating_codex)

    df['pshp_soundex_first'] = same_code(pshp_soundex_first.encode)

    # One column per extra algorithm, named by the parallel algo_names list.
    for position, algo in enumerate(algos):
        df[algo_names[position]] = per_row(algo.sim)

    return df
Exemplo n.º 18
0
def get_hash(word, hash_type):
    """Return the phonetic code of ``word`` for the requested algorithm.

    ``hash_type`` is one of "SOUNDEX", "NYSIIS", "MRA" (match rating codex)
    or "METAPHONE".

    Raises:
        NotImplementedError: for any other ``hash_type``.
    """
    if hash_type == "SOUNDEX":
        return jellyfish.soundex(word)
    if hash_type == "NYSIIS":
        return jellyfish.nysiis(word)
    if hash_type == "MRA":
        return jellyfish.match_rating_codex(word)
    if hash_type == "METAPHONE":
        return jellyfish.metaphone(word)
    raise NotImplementedError(
        "approach '{}' not implemented".format(hash_type))
Exemplo n.º 19
0
    def correct(self, wrongWord):
        """Return scored correction candidates for ``wrongWord``, sorted.

        Candidates are the words listed in ``SpellChecker.invertTriMap`` under
        any gram of ``wrongWord``. Candidates whose length differs by more
        than two characters, and ``wrongWord`` itself, are skipped. The score
        multiplies the gram-overlap value from ``getJackSim``, the candidate's
        relative count (``dictCountMap[candidate] / totalCount``) and two
        closeness factors (edit distance and Jaro). 0.1 is then added for each
        phonetic encoding (metaphone, soundex, NYSIIS, match rating codex) on
        which the two words agree.

        Returns:
            list of ``ScoreRcd(candidate, edit_distance, score)`` sorted with
            ``list.sort()``, i.e. by ``ScoreRcd``'s own ordering.
        """
        # Gather the distinct candidates. A set avoids re-copying the list
        # for every gram.
        candidates = set()
        for trigram in self.getGrams(wrongWord, SpellChecker.invertMapGram):
            if trigram in SpellChecker.invertTriMap:
                candidates.update(SpellChecker.invertTriMap[trigram])

        # The phonetic codes of the misspelled word do not depend on the
        # candidate, so compute them once.
        encoders = (jellyfish.metaphone, jellyfish.soundex, jellyfish.nysiis,
                    jellyfish.match_rating_codex)
        wrongCodes = [encode(wrongWord) for encode in encoders]

        candidateDistList = []
        for candidate in candidates:
            if abs(len(candidate) - len(wrongWord)) > 2:
                continue
            if wrongWord == candidate:
                continue
            ed = self.compED(candidate, wrongWord)
            # jaro_distance returns a similarity (1.0 means identical), so
            # turn it into a distance before using it as a penalty. The
            # original 1 / (jd + 1) lowered the score of closer matches.
            jd = 1.0 - jellyfish.jaro_distance(wrongWord, candidate)
            gd = self.getJackSim(
                self.getGrams(candidate, SpellChecker.jackardGram),
                self.getGrams(wrongWord, SpellChecker.jackardGram))
            score = (gd * SpellChecker.dictCountMap[candidate]
                     / SpellChecker.totalCount
                     * (1.0 / (ed + 1)) * (1.0 / (jd + 1)))
            # Bonus for every phonetic encoding that agrees.
            for encode, wrongCode in zip(encoders, wrongCodes):
                if wrongCode == encode(candidate):
                    score = score + 0.1
            candidateDistList.append(ScoreRcd(candidate, ed, score))
        candidateDistList.sort()
        return candidateDistList
Exemplo n.º 20
0
def concatWords(orSentWords, comSentWords):
    """Check every word of a sentence for merged runs of consecutive words.

    Walks ``orSentWords`` and, wherever the word at the same position in
    ``comSentWords`` (or the one after it) differs, hands the pair of
    neighbouring ``comSentWords`` entries to ``isSumma2WordsTheBest``.

    Returns:
        The corrected list of sentence words (``comSentWords``, possibly
        replaced by what ``isSumma2WordsTheBest`` returned).
    """
    initialLen = len(comSentWords)
    lastOrIndex = len(orSentWords) - 1
    handled = 1
    for idx, word in enumerate(orSentWords):
        # Need a word at idx and idx + 1 in the compared sentence.
        if idx >= initialLen - handled:
            continue
        # Nothing to do when both neighbouring words already equal this one.
        if word == comSentWords[idx] and word == comSentWords[idx + 1]:
            continue

        baseCode = nysiis(unicode(orSentWords[idx])).replace("'", '')
        firstCode = nysiis(unicode(comSentWords[idx])).replace("'", '')
        secondCode = nysiis(unicode(comSentWords[idx + 1])).replace("'", '')
        # Skip when the reference code is shorter than both candidate codes.
        if len(baseCode) < len(firstCode) and len(baseCode) < len(secondCode):
            continue

        # For every word but the last, isTheSameWords can veto the merge.
        if idx < lastOrIndex and isTheSameWords(idx, orSentWords,
                                                comSentWords):
            continue

        comSentWords = isSumma2WordsTheBest(word, comSentWords[idx],
                                            comSentWords[idx + 1], idx,
                                            comSentWords)
        handled += 1

    return comSentWords
Exemplo n.º 21
0
def phonetic_similarity(word1, word2):
    """Return the weighted sum of edit distances between phonetic codes.

    For each of metaphone, NYSIIS, soundex and match rating codex, both
    words are encoded, the Levenshtein distance between the two codes is
    multiplied by ``weightage[<algorithm name>]``, and the products are
    summed.
    """
    total = 0
    for algorithm in ('metaphone', 'nysiis', 'soundex', 'match_rating_codex'):
        encode = getattr(jellyfish, algorithm)
        code_distance = jellyfish.levenshtein_distance(encode(word1),
                                                       encode(word2))
        total += code_distance * weightage[algorithm]
    return total
Exemplo n.º 22
0
def compare(word1, dictionary):
    """Return the ``(score, word)`` pair with the highest weighted score.

    Every word in ``dictionary`` is scored against ``word1`` by applying
    ``levenshtein`` to their soundex, metaphone, NYSIIS and match rating
    codes and weighting the results 0.2 / 0.3 / 0.3 / 0.2. Only a score
    strictly above the best so far replaces it; ``(0, None)`` is returned
    when no word scores above 0.
    """
    weighted_encoders = (
        (jellyfish.soundex, 0.2),
        (jellyfish.metaphone, 0.3),
        (jellyfish.nysiis, 0.3),
        (jellyfish.match_rating_codex, 0.2),
    )
    # Codes of the query word are the same for every dictionary entry.
    query_codes = [encode(word1) for encode, _ in weighted_encoders]

    best_score, best_word = 0, None
    for word2 in dictionary:
        sim = sum(
            levenshtein(query_code, encode(word2)) * weight
            for query_code, (encode, weight)
            in zip(query_codes, weighted_encoders))
        if sim > best_score:
            best_score, best_word = sim, word2

    return (best_score, best_word)
Exemplo n.º 23
0
def nysiis():
    """Print a fixed list of sample tokens and, below it, their NYSIIS codes."""
    tokens = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp', 'pmp'
    ]

    def emit_row(label, cells):
        # Label first, then every cell followed by the ' | ' separator,
        # all on one line.
        print(label, end='')
        for cell in cells:
            print(cell, ' | ', end='')

    print('Running NYSIIS...')

    # print tokens
    emit_row('Tokens: ', tokens)

    # print codes
    print('\n', end="")
    emit_row('Codes: ', (jellyfish.nysiis(token) for token in tokens))
Exemplo n.º 24
0
# pos = {city:(long, lat) for (city, (lat,long)) in nx.get_node_attributes(G, 'pos').items()}
# nx.draw(G, pos, with_labels=True, node_size=0)

# ---------------------------------------------> jellyfish <-------------------------------------------- #

# String comparison
# Sample words for the jellyfish calls below.
grape_1, grape_2 = 'Ma', 'Mariette'

# String comparison (the results are computed and discarded).
jf.levenshtein_distance(grape_1, grape_2)
jf.jaro_distance(grape_1, grape_2)
jf.damerau_levenshtein_distance(grape_1, grape_2)

# Phonetic encoding (the results are computed and discarded).
jf.metaphone(grape_1)
jf.soundex(grape_1)
jf.nysiis(grape_1)
jf.match_rating_codex(grape_1)
jf.match_rating_codex(grape_2)

# ---------------------------------------------> Udacity <-------------------------------------------- #

# A single score vector.
scores = [3.0, 1.0, 0.2]

# Three rows of four scores each.
scores2 = np.array([
    [1, 2, 3, 6],
    [2, 4, 5, 6],
    [3, 8, 7, 6],
])


def softmax(x):
    """Compute softmax values for each sets of scores in x.

    The normalisation runs along axis 0, so for a 2-D input every column is
    an independent set of scores.

    Args:
        x: array-like of scores.

    Returns:
        numpy array (or scalar) of the same shape as ``x`` whose entries sum
        to 1 along axis 0.
    """
    x = np.asarray(x)
    # Subtracting the per-set maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing to inf (and the
    # quotient from becoming NaN) for large scores.
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)

Exemplo n.º 25
0
		# Fragment: the enclosing loop over candidates and the function header are not visible here.
		# Edit distance between the candidate and the misspelled word (compED is defined elsewhere).
		ed = compED(candidate,wrongWord)
		# Skip candidates whose length differs from the misspelled word by more than two characters.
		if abs(len(candidate)- len(wrongWord)) > 2:
			continue
		#if ed ==0:
		#	ed =1
		# NOTE(review): jellyfish documents jaro_distance as a similarity (1.0 means identical),
		# so the 1/(jd+1) factor below lowers the score of closer matches -- confirm this is intended.
		jd=jellyfish.jaro_distance(wrongWord,candidate)
		#if jd==0:
		#	jd =1
		# Gram-overlap value of the two words; presumably a Jaccard similarity, judging by the helper's name -- verify.
		gd = getJackSim(getGrams(candidate,jackardGram),getGrams(wrongWord,jackardGram))
		# Combine the gram overlap, the candidate's relative count (dictCountMap / totalCount) and the ed / jd factors.
		score = gd * dictCountMap[candidate]/totalCount * (1/(ed+1)) * (1/(jd+1))
		#New Code
		# Add 0.1 for every phonetic encoding on which the two words agree.
		if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(candidate):
			score = score+0.1
		if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
			score = score+0.1
		if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
			score = score+0.1
		if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate):
			score = score+0.1
		
		# Record the candidate with its edit distance and final score.
		tmpCandidate = ScoreRcd(candidate,ed, score) ;
		candidateDistList.append(tmpCandidate)
	# Order the candidates using ScoreRcd's own comparison.
	candidateDistList.sort()

	# Report at most the first ten candidates.
	maxIter = 10
	if len(candidateDistList) < maxIter:
		maxIter = len(candidateDistList)

	# Append each candidate's getScore() text to `out`, space separated, then print the line.
	for i in range(0,maxIter):
		out =  out + candidateDistList[i].getScore() + ' '
	print (out)
0
def getInputAndSuggestPerWord(uname, pokemons_names_list, pokemon_url_mapping, print_output = True):
  """Suggest pokemon-flavoured nicknames for a user name.

  For every pokemon name at least two characters longer than ``uname``,
  substrings of the pokemon name are compared with substrings of ``uname``,
  both as text and as NYSIIS codes, using ``getDiff``. Pairs whose combined
  score exceeds 1.35 are kept, and the matching part of the pokemon name is
  replaced by the user-name substring.

  Args:
    uname: the user name to build nicknames from.
    pokemons_names_list: pokemon names to search. When this or
      ``pokemon_url_mapping`` is empty or None, both are loaded with
      ``get_pokemons_names()``.
    pokemon_url_mapping: mapping from pokemon name to its url.
    print_output: when true, the suggestions are also printed.

  Returns:
    Up to six dicts with the keys ``updated_name``, ``url``,
    ``pokemon_name`` and ``similarity``, sorted by ``similarity``
    in descending order.
  """
  # Use the data the caller passed in; the original overwrote both
  # arguments unconditionally. Fall back to loading when nothing usable
  # was supplied.
  if not pokemons_names_list or not pokemon_url_mapping:
    pokemons_names_list, pokemon_url_mapping = get_pokemons_names()

  # Substrings of the user name do not depend on the pokemon, so they are
  # computed once, the first time they are needed.
  usubs = None

  best_rep = {}
  for pokemon_name in pokemons_names_list:
    if len(pokemon_name) < len(uname) + 2:
      continue

    if usubs is None:
      usubs = list(get_all_substrings(uname, int(len(uname) * 0.75), len(uname), True))

    # Finding the pokemon names matching the user name

    # getting substrings of pokemon name
    if len(uname) > 5:
      psubs = get_all_substrings(pokemon_name, 3 + int(len(uname)/8), len(uname) + 2)
    else:
      psubs = get_all_substrings(pokemon_name, 2, len(uname) + 2)

    # Combined score -> details of the pair that produced it; a later pair
    # with the same score replaces an earlier one.
    best_sub_rep = {}
    for psub in psubs:
      psub_phone = jellyfish.nysiis(psub)
      for usub in usubs:
        name_diff = getDiff(psub, usub) # getting string diff
        uphonic = jellyfish.nysiis(usub)
        phone_diff = getDiff(psub_phone, uphonic) # getting phonic diff
        best_sub_rep[name_diff + phone_diff] = [psub, pokemon_name, name_diff, phone_diff, usub]

    # Keep the five best scores of this pokemon that pass the threshold.
    for key in sorted(best_sub_rep, reverse=True)[:5]:
      if key > 1.35: # Threshold match of phonic and text diff
        best_rep.setdefault(key, []).append(best_sub_rep[key])

  # Getting final best pokemon names matching, and using user's name
  output_res = {}
  for list_key in sorted(best_rep, reverse=True):
    for rep in best_rep[list_key]:
      pokemonified_name = rep[1].replace(rep[0], rep[4])
      # this is done to avoid results matching the pokemon name exactly,
      #   to take the longest string, and only one username for each pokemon
      if pokemonified_name != rep[1]:
        if (rep[1] not in output_res) or (len(pokemonified_name) > len(output_res[rep[1]]['updated_name'])):
          output_res[rep[1]] = {'updated_name': pokemonified_name.capitalize(),
            'url': pokemon_url_mapping[rep[1]],
            'pokemon_name': rep[1].capitalize(), 'similarity': rep[3]}
    if len(output_res) > 5: # break if we have more than 5 results
      break

  # Now, no need of pokemon name hashing.
  output_res = list(output_res.values())

  # based on similarity, sorting the values to get most relevant result on top
  output_res = sorted(output_res, key=itemgetter('similarity'), reverse = True)[:6]
  if print_output:
    print("Our best suggestions results ::: ")
    for res in output_res:
      print("Nikname : ", res['updated_name'], " Based on pokemon : ",
        res['pokemon_name'], "  rep : ", res)
    print()
  return output_res

# while(1):
# uname = input("Enter username : ")
# getInputAndSuggest(uname)
Exemplo n.º 27
0
import sys
import jellyfish

if __name__ == "__main__":
    # Exactly two words are expected after the script name.
    if len(sys.argv) != 3:
        print("Please provide two words as arguments.")
        exit()

    word1, word2 = sys.argv[1], sys.argv[2]

    print(word1, word2)
    edit_distance = jellyfish.levenshtein_distance(word1, word2)
    print("Edit distance: {0}".format(edit_distance))

    # NYSIIS code of each word, in argument order.
    print("Phonetic Encodings")
    for word in (word1, word2):
        print("{0}: {1}".format(word, jellyfish.nysiis(word)))
Exemplo n.º 28
0
def main():
    """Print jellyfish phonetic codes, stems and comparisons for sample words.

    Runs every phonetic encoder on 'Jellyfish' / 'Smellyfish', the Porter
    stemmer on 'Jellyfished' / 'Smellyfishing', and every string comparison
    on 'Jellyfish' / 'Smellyfish', printing one line per function.
    """
    # declare test strings
    # rem: u prefix is required jellyfish convention
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'

    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')

    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)

    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)

    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)

    # Match Rating Codex
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)

    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    # Stem the inflected samples; the original stemmed str1/str2 and left
    # pStr1/pStr2 unused.
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)

    # test String Comparison
    print('\nString Comparisons ---------------------------')

    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)

    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)

    # Hamming Distance
    # The next three results used to be stored in `result` while the stale
    # `r` (the Damerau-Levenshtein value) was printed.
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)

    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)

    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)

    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)

    # end program
    print('Done.')
import jellyfish
# Demo of the jellyfish string comparison and phonetic encoding functions.
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare print statement is a syntax error on Python 3.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))
#2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))
#0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))
#1

print(jellyfish.metaphone('Jellyfish'))
#'JLFX'
print(jellyfish.soundex('Jellyfish'))
#'J412'
print(jellyfish.nysiis('Jellyfish'))
#'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))
#'JLLFSH'
import jellyfish
# Demo of the jellyfish string comparison and phonetic encoding functions.
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare print statement is a syntax error on Python 3.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))
#2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))
#0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))
#1

print(jellyfish.metaphone('Jellyfish'))
#'JLFX'
print(jellyfish.soundex('Jellyfish'))
#'J412'
print(jellyfish.nysiis('Jellyfish'))
#'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))
Exemplo n.º 30
0
def nysiis(s):
    """Return the NYSIIS code of ``s`` via ``J.nysiis``, or None when ``s`` is None."""
    # Identity test: `== None` dispatches to the operand's __eq__, which can
    # be overridden (or element-wise), whereas `is None` cannot be fooled.
    if s is None:
        return None
    return J.nysiis(s)
Exemplo n.º 31
0
 def apply(self, s):
     """Return ``s`` with every non-null value replaced by its ``nysiis`` code.

     Null values (as judged by ``pd.notnull``) become ``np.nan``.
     """
     def encode(value):
         if pd.notnull(value):
             return nysiis(value)
         return np.nan

     return s.apply(encode)
Exemplo n.º 32
0
 def transform(self, data):
     """Return the ``nysiis`` code of ``data`` when it is a string.

     Any value that is not a ``basestring`` yields None.
     """
     if not isinstance(data, basestring):
         return None
     return nysiis(unicode(data))
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
# Each call below is followed by the value it is expected to print.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance that also counts transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance; str1 and str2 must have the same length; it counts the positions at which two equal-length strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit distance (also called Levenshtein distance); the minimum number of operations needed to turn one string into the other, where the operations are insertion, deletion and substitution
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio; r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a modified edit distance
# Note: this modified edit distance is not the edit distance used by distance() above. There each of the three operations costs 1; here deletion and insertion still cost 1 but substitution costs 2.
# Rationale: for ratio('a', 'c'), sum=2; with the plain edit distance the result would be (2-1)/2 = 0.5 although 'a' and 'c' share nothing, which is clearly unreasonable; charging 2 for a substitution solves this.
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro distance; used in health surveys
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler distance
Exemplo n.º 34
0
def nysiis(s):
    """Return the NYSIIS phonetic code of ``s`` as computed by jellyfish."""
    code = jellyfish.nysiis(s)
    return code