def token_similarity(a, b):
    """Score the similarity of two tokens' ``.word`` strings.

    Returns 1.0 for a case/whitespace-insensitive exact match, -1.0 for
    forbidden pairings (whitespace vs. non-whitespace, punctuation vs.
    non-punctuation), 0.9 for punctuation pairs or phonetic matches, and
    otherwise the Jaro-Winkler similarity of the raw strings.
    """
    left = a.word
    right = b.word

    # Case-insensitive, whitespace-trimmed exact match.
    if left.lower().strip() == right.lower().strip():
        return 1.

    # Whitespace may never align with non-whitespace.
    if bool(isspace(left)) != bool(isspace(right)):
        return -1.

    # Punctuation pairs are near-matches; punctuation vs. word is forbidden.
    if ispunc(left) and ispunc(right):
        return 0.9
    if bool(ispunc(left)) != bool(ispunc(right)):
        return -1.

    # Approximate phonetic equivalence, only defined for alphabetic tokens.
    if left.isalpha() and right.isalpha():
        encoders = (jf.metaphone, jf.soundex, jf.nysiis, jf.match_rating_codex)
        for encode in encoders:
            if encode(left) == encode(right):
                return 0.9

    # Fall back to scaled Jaro-Winkler similarity.
    return jf.jaro_winkler(left, right)
def augment_data(df: pd.DataFrame) -> pd.DataFrame:
    """Augment dataframe of FEBRL person data with blocking keys and cleanup
    for comparison step.

    Args:
        df: pandas dataframe containing FEBRL-generated person data

    Returns:
        Augmented dataframe (mutated in place and returned implicitly by
        column assignment; the original object is modified).
    """
    # Normalize missing names to empty strings before encoding.
    for name_col in ("surname", "first_name"):
        df[name_col] = df[name_col].fillna("")

    # Soundex phonetic encodings.
    df["soundex_surname"] = df["surname"].apply(jellyfish.soundex)
    df["soundex_firstname"] = df["first_name"].apply(jellyfish.soundex)

    # NYSIIS phonetic encodings.
    df["nysiis_surname"] = df["surname"].apply(jellyfish.nysiis)
    df["nysiis_firstname"] = df["first_name"].apply(jellyfish.nysiis)

    # Last 3 digits of SSID, zero-padded; missing/falsy IDs become None.
    df["ssid_last3"] = df["soc_sec_id"].apply(
        lambda x: str(x)[-3:].zfill(3) if x else None)
    df["soc_sec_id"] = df["soc_sec_id"].astype(str)

    # DOB string to date object.
    df["dob"] = df["date_of_birth"].apply(dob_to_date)
def concatWords(orSentWords, comSentWords):
    # Checks all words of the sentence for merges of consecutive words.
    # Returns the corrected list of the sentence's words.
    # NOTE(review): Python 2 code (`unicode`); `nysiis`, `isTheSameWords`
    # and `isSumma2WordsTheBest` are defined elsewhere in the project.
    sentLen = len(comSentWords)
    orSentLen = len(orSentWords)
    # `count` shrinks the scan window as merges are applied.
    count = 1
    for i, val in enumerate(orSentWords):
        # Only consider positions where the compared sentence disagrees
        # with the original word at i or i+1.
        if i < sentLen - count and (val != comSentWords[i] or val != comSentWords[i + 1]):
            # NYSIIS phonetic codes, apostrophes stripped.
            ny_baseWord = nysiis(unicode(orSentWords[i])).replace("'", '')
            ny_word1 = nysiis(unicode(comSentWords[i])).replace("'", '')
            ny_word2 = nysiis(unicode(comSentWords[i + 1])).replace("'", '')
            # If the base word's code is shorter than both candidate codes,
            # the two words cannot be fragments of it -- skip.
            if len(ny_baseWord) < len(ny_word1) and len(ny_baseWord) < len(
                    ny_word2):
                continue
            if i < orSentLen - 1:
                if not isTheSameWords(i, orSentWords, comSentWords):
                    # Try replacing the two candidate words with their merge.
                    comSentWords = isSumma2WordsTheBest(
                        val, comSentWords[i], comSentWords[i + 1], i,
                        comSentWords)
                    count += 1
            else:
                # Last original word: always attempt the merge.
                comSentWords = isSumma2WordsTheBest(val, comSentWords[i],
                                                    comSentWords[i + 1], i,
                                                    comSentWords)
                count += 1
    return comSentWords
def CleanVillageNames():
    """Match villages lacking an MCTS id to phonetically similar villages
    that have one, per subcenter, and copy the id over (Django ORM)."""
    import jellyfish
    subcenters = SubCenter.objects.all()
    for subc in subcenters:
        villages = Address.objects.filter(beneficiaries__subcenter=subc).distinct()
        # Split into villages without (nl_) and with (l_) a legitimate id.
        nl_vills = villages.filter(village_mcts_id = None)
        l_vills = villages.exclude(village_mcts_id = None)
        # Precompute NYSIIS codes for the legitimate villages, index-aligned
        # with the l_vills queryset.
        phonetic_codes = []
        for l_vill in l_vills:
            phonetic_codes.append(jellyfish.nysiis(l_vill.village))
        #match the non-legitimate ones
        for nl_vill in nl_vills:
            pc = jellyfish.nysiis(nl_vill.village)
            min_dist = 100
            min_ind = 0
            ind = 0
            for spc in phonetic_codes:
                # NOTE(review): jaro_distance is a *similarity* (1.0 = equal),
                # yet this keeps the minimum (`dist <= min_dist`), i.e. the
                # LEAST similar code wins -- confirm whether max was intended.
                dist = jellyfish.jaro_distance(spc ,pc)
                if dist <= min_dist:
                    min_ind = ind
                    min_dist = dist
                ind +=1
            # Skip only perfect scores; everything below 1.0 gets relinked.
            if min_dist < 1.0:
                match_vill = l_vills[min_ind]
                nl_vill.village_mcts_id = match_vill.village_mcts_id
                # Suffix marks the value as machine-matched.
                nl_vill.value = nl_vill.value+'_m'
                nl_vill.save()
def extract_features(word1, word2, lang1, lang2):
    """Build the similarity-feature dict for a candidate word pair.

    Combines orthographic (LCSR, prefix, Dice), phonetic (Soundex, NYSIIS)
    and transliteration-based (epitran) similarity scores.
    """
    nysiis_1 = jellyfish.nysiis(word1)
    nysiis_2 = jellyfish.nysiis(word2)
    translit_1 = get_translit(lang1, word1)
    translit_2 = get_translit(lang2, word2)
    return {
        'lcsr': lcsr(word1, word2),
        'PREFIX': prefix([word1, word2]),
        'dice_coefficient': dice_coefficient(word1, word2),
        'soundex': soundex.Soundex().compare(word1, word2),
        'nysiis': lcsr(nysiis_1, nysiis_2),
        'epitran': lcsr(translit_1, translit_2),
    }
def extract_features(lang1, word1, lang2, word2):
    """Return the feature dict (orthographic, phonetic, transliteration
    similarities) for one candidate word pair."""
    phonetic_sim = lcsr(nysiis(word1), nysiis(word2))
    translit_sim = lcsr(get_translit(lang1, word1), get_translit(lang2, word2))
    return {
        'lcsr': lcsr(word1, word2),
        'PREFIX': PREFIX(word1, word2),
        'dice_coefficient': dice_coefficient(word1, word2),
        'soundex': soundex.compare(word1, word2),
        'nysiis': phonetic_sim,
        'epitran': translit_sim,
    }
def test_jellyfish():
    """Ad-hoc demo: print jellyfish similarity scores for raw vs. normalized
    Russian ad texts, then the NYSIIS codes, and terminate the process."""
    text1 = 'Телефон в хорошем состоянии, трещин и сколов нет, за все время менялся только аккумулятор(поэтому заряд держит хорошо), остальное все родное, в целом работает отлично! В комплекте кабель. Обмен не интересен.'
    text2 = 'Продам телефон в хорошем состоянии Полностью рабочий есть WiFi'
    text_norm1 = ' '.join(normalize(text1))
    text_norm2 = ' '.join(normalize(text2))
    # Each metric on raw texts, then on the normalized variants.
    for metric in (jellyfish.jaro_distance, jellyfish.jaro_winkler):
        print(metric(text1, text2))
        print(metric(text_norm1, text_norm2))
    print(jellyfish.nysiis(text1))
    print(jellyfish.nysiis(text2))
    exit()
def phonetic(s, method):
    """
    Phonetically encode the values in the Series.

    :param method: The algorithm that is used to phonetically
            encode the values. The possible options are 'soundex'
            (`wikipedia <https://en.wikipedia.org/wiki/Soundex>`_) and
            'nysiis' (`wikipedia <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_).
    :type method: str

    :return: A Series with phonetic encoded values.
    :rtype: pandas.Series

    .. note::

        The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'.
        It can be installed with pip (``pip install jellyfish``).
    """
    try:
        import jellyfish
    except ImportError:
        # Fail fast with an actionable message. (Previously this only
        # printed a hint and fell through to a NameError below.)
        raise ImportError(
            "Install jellyfish to use string encoding "
            "(pip install jellyfish).")

    # Keep only alphanumerics, hyphen and underscore (raw string avoids
    # invalid-escape warnings).
    s = clean(s, replace_by_none=r'[^\-\_A-Za-z0-9]+')

    if method == 'soundex':
        return s.str.upper().apply(
            lambda x: jellyfish.soundex(x) if pandas.notnull(x) else np.nan)
    elif method == 'nysiis':
        return s.str.upper().apply(
            lambda x: jellyfish.nysiis(x) if pandas.notnull(x) else np.nan)
    else:
        # ValueError is still caught by callers handling Exception.
        raise ValueError("Phonetic encoding method not found")
def fuzzy(string):
    """Return a JSON response mapping each phonetic algorithm name to its
    encoding of `string`."""
    encoders = {
        "metaphone": jellyfish.metaphone,
        "soundex": jellyfish.soundex,
        "nysiis": jellyfish.nysiis,
        "match_rating_codex": jellyfish.match_rating_codex,
    }
    return jsonify({name: encode(string) for name, encode in encoders.items()})
def simple_example():
    """Print one line per jellyfish comparison function and per phonetic
    encoder, showing the result for a fixed pair of example strings."""
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparison_fns = (
        'levenshtein_distance',
        'damerau_levenshtein_distance',
        'hamming_distance',
        'jaro_distance',
        'jaro_similarity',
        'jaro_winkler',
        'jaro_winkler_similarity',
        'match_rating_comparison',
    )
    for fn_name in comparison_fns:
        fn = getattr(jellyfish, fn_name)
        print("jellyfish.{}({}, {}) = {}.".format(
            fn_name, str1, str2, fn(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    for fn_name in ('metaphone', 'soundex', 'nysiis', 'match_rating_codex'):
        fn = getattr(jellyfish, fn_name)
        print("jellyfish.{}({}) = {}.".format(fn_name, ss, fn(ss)))
def _word_similarity_score(a, b):
    """Similarity of two token strings in [0, 1].

    1.0 exact; 0.95 case/whitespace-insensitive or punctuation pair;
    0.9 phonetic match; 0.85 ampersand exception or both non-alphabetic;
    0 for forbidden pairings; otherwise Jaro-Winkler on the raw strings.
    """
    if a == b:
        return 1.
    # Case and whitespace insensitive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95
    # Whitespace may only match whitespace.
    if bool(_isspace(a)) != bool(_isspace(b)):
        return 0
    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Punctuation pairs are near-matches; punctuation vs. word is forbidden.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if bool(_ispunc(a)) != bool(_ispunc(b)):
        return 0
    # The phonetic functions segfault on empty strings, so compare only the
    # alphabetic characters; strings with no alpha characters at all (e.g.,
    # line numbers) are allowed to match each other.
    a_alpha = u''.join(c for c in a if c.isalpha())
    b_alpha = u''.join(c for c in b if c.isalpha())
    if not a_alpha and not b_alpha:
        return 0.85
    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    for encode in (jf.metaphone, jf.soundex, jf.nysiis):
        if encode(a_alpha) == encode(b_alpha):
            return 0.9
    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
def test_nysiis(self):
    """NYSIIS encodings agree with known reference values."""
    expected = {
        "Worthy": "WARTY",
        "Ogata": "OGAT",
        "montgomery": "MANTGANARY",
        "Costales": "CASTAL",
        "Tu": "T",
    }
    for name, code in expected.items():
        self.assertEqual(jellyfish.nysiis(name), code)
def measure_string_distance(s1, s2, method):
    '''Compare two strings using one of five methods.

    Method code to method name:
        1. Jaro-Winkler distance            -> value in [0, 1]
        2. Damerau-Levenshtein (normalized) -> value in [0, 1]
        3. Metaphone                        -> 1 (match) or 0
        4. NYSIIS                           -> 1 (match) or 0
        5. match_rating_codex               -> 1 (match) or 0

    An empty input string, an unknown method code, or a comparison failure
    in method 2 all yield 0. (The docstring previously said "1 to 4" and
    "methods 4,5 and 6"; the code has always implemented 1-5 as above.)
    '''
    if s1 == '' or s2 == '':
        return 0
    if method == 1:
        return jellyfish.jaro_winkler(s1, s2)
    if method == 2:
        # Normalize the edit distance by the longer string's length.
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            return 1 - (diff / max(len(s1), len(s2)))
        except Exception:
            # jellyfish may reject some inputs; treat as "no match"
            # (narrowed from a bare `except:` which also ate KeyboardInterrupt).
            return 0
    if method == 3:
        return 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    if method == 4:
        return 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    if method == 5:
        return 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0
    # Unknown method codes fall through to 0, matching historical behavior.
    return 0
def wordsRightOrder(maxSentWds, sentWds):
    '''
    Correct the word order in a sentence: compare the sentence against the
    longest one and insert a single '' placeholder for each missing word.
    (Python 2 code: `unicode`; `levenshtein` and `nysiis` come from the
    surrounding module.)
    :param maxSent: string
    :param sent: string
    :return: sentWds: list
    '''
    msl = len(maxSentWds)
    sl = len(sentWds)
    # Only act when the reference sentence is strictly longer.
    if msl - sl > 0:
        for i in range(msl):
            # Re-check len(sentWds) each pass: insertions grow the list.
            if i + 1 < msl and i < len(sentWds):
                if sentWds[i] == maxSentWds[i]:
                    continue
                # If the current word is phonetically closer to the NEXT
                # reference word than to the current one, a word was skipped:
                # insert a placeholder to realign.
                elif levenshtein(nysiis(unicode(sentWds[i])), nysiis(unicode(maxSentWds[i]))) >\
                        levenshtein(nysiis(unicode(sentWds[i])), nysiis(unicode(maxSentWds[i+1]))):
                    sentWds.insert(i,'')
    return sentWds
def wordsRightOrder(maxSentWds, sentWds):
    '''
    Correct the word order in a sentence: compare against the longest
    sentence and insert one '' placeholder per missing word.
    NOTE(review): byte-for-byte duplicate of the other wordsRightOrder in
    this file except for spacing in `insert(i, '')` -- consider deduplicating.
    :param maxSent: string
    :param sent: string
    :return: sentWds: list
    '''
    msl = len(maxSentWds)
    sl = len(sentWds)
    # Only act when the reference sentence is strictly longer.
    if msl - sl > 0:
        for i in range(msl):
            # len(sentWds) is re-evaluated because insertions grow the list.
            if i + 1 < msl and i < len(sentWds):
                if sentWds[i] == maxSentWds[i]:
                    continue
                # Word is phonetically closer to the next reference word ->
                # a word was dropped here; insert a placeholder.
                elif levenshtein(nysiis(unicode(sentWds[i])), nysiis(unicode(maxSentWds[i]))) >\
                        levenshtein(nysiis(unicode(sentWds[i])), nysiis(unicode(maxSentWds[i+1]))):
                    sentWds.insert(i, '')
    return sentWds
def featurize(df):
    """Add string-similarity feature columns to a dataframe of name pairs.

    Expects the first two columns to be the strings to compare (an optional
    third is treated as the target label). Relies on module-level `re`,
    `unidecode`, `fuzz`, `jellyfish`, `MinMaxScaler`, `pshp_soundex_first`,
    `sum_ipa`, `algos` and `algo_names`.
    """
    # Normalize column names to a/b(/target) regardless of input width.
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })
    # ASCII-fold, lowercase, and strip non-letters into TM_A / TM_B.
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)
    # Fuzzy-matching ratios (0-100) from fuzzywuzzy/rapidfuzz-style `fuzz`.
    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A,row.TM_B), axis=1)
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A,row.TM_B), axis=1)
    # Jellyfish levenshtein
    df['levenshtein']= df.apply(lambda row: jellyfish.levenshtein_distance(row.TM_A,row.TM_B), axis=1)
    # Scale Levenshtein column to [0, 1].
    # NOTE(review): scaler is fit per-call, so values are only comparable
    # within one dataframe -- confirm that is intended.
    scaler = MinMaxScaler()
    df['levenshtein'] = scaler.fit_transform(df['levenshtein'].values.reshape(-1,1))
    # Jellyfish phoneme: binary "encodings agree" indicators.
    df['metaphone'] = df.apply(
        lambda row: 1 if jellyfish.metaphone(row.TM_A)==jellyfish.metaphone(row.TM_B) else 0, axis=1)
    df['nysiis'] = df.apply(
        lambda row: 1 if jellyfish.nysiis(row.TM_A)==jellyfish.nysiis(row.TM_B) else 0, axis=1)
    df['mtch_rtng_cdx'] = df.apply(
        lambda row: 1 if jellyfish.match_rating_codex(row.TM_A)==jellyfish.match_rating_codex(row.TM_B) else 0, axis=1)
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A)==pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    # One similarity column per configured abydos-style algorithm.
    for i, algo in enumerate(algos):
        df[algo_names[i]] = df.apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)
    return df
def get_hash(word, hash_type):
    """Return the phonetic encoding of `word` for the named algorithm.

    Args:
        word: string to encode.
        hash_type: one of "SOUNDEX", "NYSIIS", "MRA", "METAPHONE".

    Returns:
        The phonetic code as a string.

    Raises:
        NotImplementedError: if `hash_type` names no known algorithm.
    """
    # Dispatch table replaces the if/elif chain and drops the local that
    # shadowed the builtin `hash`.
    encoders = {
        "SOUNDEX": jellyfish.soundex,
        "NYSIIS": jellyfish.nysiis,
        "MRA": jellyfish.match_rating_codex,
        "METAPHONE": jellyfish.metaphone,
    }
    encoder = encoders.get(hash_type)
    if encoder is None:
        raise NotImplementedError(
            "approach '{}' not implemented".format(hash_type))
    return encoder(word)
def correct(self, wrongWord):
    """Rank spelling-correction candidates for `wrongWord`.

    Candidates are gathered from the class-level trigram inverted index,
    scored by a product of Jaccard n-gram similarity, corpus frequency,
    and inverse edit/Jaro distances, with +0.1 bonuses per agreeing
    phonetic encoding. Returns the sorted list of ScoreRcd objects.
    """
    candidates = []
    candidateDistList = []
    # Collect every word sharing at least one trigram with wrongWord.
    wWTGrams = self.getGrams(wrongWord, SpellChecker.invertMapGram)
    for trigram in wWTGrams:
        if trigram in SpellChecker.invertTriMap:
            candidates = candidates + SpellChecker.invertTriMap[trigram]
    candidates = list(set(candidates))
    #print (len(candidates))
    for candidate in candidates:
        # Prune candidates whose length differs by more than 2.
        if abs(len(candidate) - len(wrongWord)) > 2:
            continue
        if wrongWord == candidate:
            continue
        ed = self.compED(candidate, wrongWord)
        # NOTE(review): jaro_distance is a similarity in [0, 1], yet it is
        # used as 1/(jd+1), which *penalizes* similar words -- confirm.
        jd = jellyfish.jaro_distance(wrongWord, candidate)
        gd = self.getJackSim(
            self.getGrams(candidate, SpellChecker.jackardGram),
            self.getGrams(wrongWord, SpellChecker.jackardGram))
        # Frequency-weighted base score.
        score = gd * SpellChecker.dictCountMap[
            candidate] / SpellChecker.totalCount * (1 / (ed + 1)) * (1 /
                                                                     (jd + 1))
        # Flat bonus for each phonetic encoding that agrees.
        if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(
                candidate):
            score = score + 0.1
        if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
            score = score + 0.1
        if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
            score = score + 0.1
        if jellyfish.match_rating_codex(
                wrongWord) == jellyfish.match_rating_codex(candidate):
            score = score + 0.1
        tmpCandidate = ScoreRcd(candidate, ed, score)
        candidateDistList.append(tmpCandidate)
    # ScoreRcd defines its own ordering.
    candidateDistList.sort()
    return candidateDistList
def concatWords(orSentWords, comSentWords):
    # Checks all words of the sentence for merges of consecutive words;
    # returns the corrected list of the sentence's words.
    # NOTE(review): near-duplicate of the other concatWords in this file
    # (differs only in formatting) -- consider deduplicating.
    sentLen = len(comSentWords)
    orSentLen = len(orSentWords)
    # `count` shrinks the scan window as merges are applied.
    count = 1
    for i, val in enumerate(orSentWords):
        # Only positions where the compared sentence disagrees with the
        # original word at i or i+1 are candidates for merging.
        if i < sentLen - count and (val != comSentWords[i] or val != comSentWords[i+1]):
            # NYSIIS phonetic codes with apostrophes stripped (Python 2).
            ny_baseWord = nysiis(unicode(orSentWords[i])).replace("'",'')
            ny_word1 = nysiis(unicode(comSentWords[i])).replace("'",'')
            ny_word2 = nysiis(unicode(comSentWords[i+1])).replace("'",'')
            # Base code shorter than both candidates: the two words cannot
            # be fragments of it, skip.
            if len(ny_baseWord) < len(ny_word1) and len(ny_baseWord) < len(ny_word2):
                continue
            if i < orSentLen - 1:
                if not isTheSameWords(i, orSentWords, comSentWords):
                    # Attempt to replace the pair with their merge.
                    comSentWords = isSumma2WordsTheBest(val, comSentWords[i], comSentWords[i+1], i, comSentWords)
                    count += 1
            else:
                # Last original word: always attempt the merge.
                comSentWords = isSumma2WordsTheBest(val, comSentWords[i], comSentWords[i+1], i, comSentWords)
                count += 1
    return comSentWords
def phonetic_similarity(word1, word2):
    """Weighted sum of Levenshtein distances between the phonetic encodings
    of two words, using the module-level `weightage` per algorithm."""
    encoders = {
        'metaphone': jellyfish.metaphone,
        'nysiis': jellyfish.nysiis,
        'soundex': jellyfish.soundex,
        'match_rating_codex': jellyfish.match_rating_codex,
    }
    cumulative_score = 0
    for algorithm, encode in encoders.items():
        distance = jellyfish.levenshtein_distance(
            encode(word1), encode(word2))
        cumulative_score += distance * weightage[algorithm]
    return cumulative_score
def compare(word1, dictionary):
    """Scan `dictionary` for the word with the highest weighted phonetic
    score against `word1`; returns (score, word) or (0, None)."""
    encoders = (jellyfish.soundex, jellyfish.metaphone,
                jellyfish.nysiis, jellyfish.match_rating_codex)
    weights = (0.2, 0.3, 0.3, 0.2)
    # Encode word1 once, outside the loop.
    word1_codes = [encode(word1) for encode in encoders]
    result = (0, None)
    for word2 in dictionary:
        sim = sum(
            levenshtein(code1, encode(word2)) * weight
            for code1, encode, weight in zip(word1_codes, encoders, weights))
        if sim > result[0]:
            result = (sim, word2)
    return result
def nysiis():
    """Demo: print a fixed token list and each token's NYSIIS code."""
    tokens = [
        'Ball Bearing', 'bll brng', 'Centrifugal', 'centrifigal', 'PUmp',
        'pmp'
    ]
    print('Running NYSIIS...')
    # print tokens
    print('Tokens: ', end='')
    for token in tokens:
        print(token, ' | ', end='')
    # printcodes
    print('\n', end="")
    print('Codes: ', end='')
    for token in tokens:
        print(jellyfish.nysiis(token), ' | ', end='')
# pos = {city:(long, lat) for (city, (lat,long)) in nx.get_node_attributes(G, 'pos').items()} # nx.draw(G, pos, with_labels=True, node_size=0) # ---------------------------------------------> jellyfish <-------------------------------------------- # # String comparison grape_1 = 'Ma' grape_2 = 'Mariette' jf.levenshtein_distance(grape_1, grape_2) jf.jaro_distance(grape_1, grape_2) jf.damerau_levenshtein_distance(grape_1, grape_2) # Phonetic encoding jf.metaphone(grape_1) jf.soundex(grape_1) jf.nysiis(grape_1) jf.match_rating_codex(grape_1) jf.match_rating_codex(grape_2) # ---------------------------------------------> Udacity <-------------------------------------------- # scores = [3.0, 1.0, 0.2] scores2 = np.array([[1, 2, 3, 6], [2, 4, 5, 6], [3, 8, 7, 6]]) def softmax(x): """Compute softmax values for each sets of scores in x.""" return np.exp(x) / np.sum(np.exp(x), axis=0)
# NOTE(review): orphan fragment -- these statements belong to a candidate
# loop of an enclosing spell-correction function that is not visible here.
# Names such as candidate, wrongWord, compED, getJackSim, dictCountMap,
# totalCount, ScoreRcd, candidateDistList and `out` are defined in that
# missing scope; indentation below is a best-effort reconstruction.
ed = compED(candidate,wrongWord)
# Prune candidates whose length differs by more than 2.
if abs(len(candidate)- len(wrongWord)) > 2:
    continue
#if ed ==0:
#    ed =1
jd=jellyfish.jaro_distance(wrongWord,candidate)
#if jd==0:
#    jd =1
gd = getJackSim(getGrams(candidate,jackardGram),getGrams(wrongWord,jackardGram))
# Frequency-weighted base score, discounted by edit and Jaro distances.
score = gd * dictCountMap[candidate]/totalCount * (1/(ed+1)) * (1/(jd+1))
#New Code
# Flat +0.1 bonus per phonetic encoding that agrees.
if jellyfish.metaphone(wrongWord) == jellyfish.metaphone(candidate):
    score = score+0.1
if jellyfish.soundex(wrongWord) == jellyfish.soundex(candidate):
    score = score+0.1
if jellyfish.nysiis(wrongWord) == jellyfish.nysiis(candidate):
    score = score+0.1
if jellyfish.match_rating_codex(wrongWord) == jellyfish.match_rating_codex(candidate):
    score = score+0.1
tmpCandidate = ScoreRcd(candidate,ed, score) ;
candidateDistList.append(tmpCandidate)
candidateDistList.sort()
# Emit at most the top 10 candidate scores.
maxIter = 10
if len(candidateDistList) < maxIter:
    maxIter = len(candidateDistList)
for i in range(0,maxIter):
    out = out + candidateDistList[i].getScore() + ' '
print (out)
def getInputAndSuggestPerWord(uname, pokemons_names_list, pokemon_url_mapping, print_output = True):
    """Suggest Pokemon-themed nicknames by splicing `uname` into a
    phonetically similar substring of a pokemon name.

    NOTE(review): the pokemons_names_list / pokemon_url_mapping parameters
    are immediately overwritten by get_pokemons_names() -- confirm whether
    the parameters are dead.
    """
    pokemons_names_list, pokemon_url_mapping = get_pokemons_names()
    # best_rep maps a combined (name + phonetic) similarity score to the
    # list of [pokemon substring, pokemon name, name_diff, phone_diff,
    # user substring] entries that achieved it.
    best_rep = {}
    for pokemon_name in pokemons_names_list:
        # if not pokemon_name.startswith("cascoon"):
        if len(pokemon_name) < len(uname) + 2:
            continue
        # Finding the pokemon names matching the user name
        # getting substrings of pokemon name
        if len(uname) > 5:
            psubs = get_all_substrings(pokemon_name, 3 + int(len(uname)/8), len(uname) + 2)
        else:
            psubs = get_all_substrings(pokemon_name, 2, len(uname) + 2)
        similar_subs = []
        # Scores for this pokemon only; keyed by score, so ties overwrite.
        best_sub_rep = {}
        for psub in psubs:
            psub_phone = jellyfish.nysiis(psub)
            # getting substing of user name to compare with substrings of pokemon names
            usubs = get_all_substrings(uname, int(len(uname) * 0.75), len(uname), True)
            for usub in usubs:
                name_diff = getDiff(psub, usub)  # getting string diff
                uphonic = jellyfish.nysiis(usub)
                phone_diff = getDiff(psub_phone, uphonic)  # getting phonic diff
                best_sub_rep[name_diff + phone_diff] = [psub, pokemon_name, name_diff, phone_diff, usub]
                # print("psub : ", psub, " psub phone : ", psub_phone, " uname : ", uname, " uphonic : ", uphonic)
                # print("jerro wicker distance names : ", name_diff)
                # print("jerro wicker distance phone : ", phone_diff)
        # Keep this pokemon's top five scores above the threshold.
        list_keys = list(best_sub_rep.keys())
        list_keys = sorted(list_keys, reverse=True)[:5]
        for key in list_keys:
            if key > 1.35:  # Threshold match of phonic and text diff
                if key in best_rep:
                    best_rep[key].append(best_sub_rep[key])
                else:
                    best_rep[key] = [best_sub_rep[key]]
    # Getting final best pokemon names matching, and using user's name
    list_keys = list(best_rep.keys())
    list_keys = sorted(list_keys, reverse=True)
    output_res = {}
    for list_key in list_keys:
        for rep in best_rep[list_key]:
            # Splice the matched user substring into the pokemon name.
            pokemonified_name = rep[1].replace(rep[0], rep[4])
            # this is done to avoid results matching the pokmon name exactly,
            # to take the longest string, and only one username for each pokemon
            if pokemonified_name != rep[1]:
                if (rep[1] not in output_res) or (len(pokemonified_name) > len(output_res[rep[1]]['updated_name'])):
                    output_res[rep[1]] = {'updated_name': pokemonified_name.capitalize(),
                                          'url': pokemon_url_mapping[rep[1]],
                                          'pokemon_name': rep[1].capitalize(),
                                          'similarity': rep[3]}
        if len(list(output_res.keys())) > 5:  # break if we have more than 5 results
            break
    # Now, no need of pokemon name hashing.
    output_res = list(output_res.values())
    # based on similarity, sorting the values to get most relevent result on top
    output_res = sorted(output_res, key=itemgetter('similarity'), reverse = True)[:6]
    if print_output:
        print("Our best suggestions results ::: ")
        for res in output_res:
            print("Nikname : ", res['updated_name'], " Based on pokemon : ", res['pokemon_name'], " rep : ", res)
        print()
    return output_res

# while(1):
#     uname = input("Enter username : ")
#     getInputAndSuggest(uname)
import sys

import jellyfish

if __name__ == "__main__":
    # CLI: compare exactly two words.
    if len(sys.argv) != 3:
        print("Please provide two words as arguments.")
        exit()
    word1, word2 = sys.argv[1], sys.argv[2]
    print(word1, word2)
    print("Edit distance: {0}".format(
        jellyfish.levenshtein_distance(word1, word2)))
    print("Phonetic Encodings")
    for word in (word1, word2):
        print("{0}: {1}".format(word, jellyfish.nysiis(word)))
def main():
    """Demonstrate jellyfish phonetic encoding, stemming, and string
    comparison functions on a fixed pair of example strings."""
    # declare test strings
    # rem: u prefix is required jellyfish convention
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'

    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')
    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)
    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)
    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)
    # Match Rating Codex
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)

    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    # BUG FIX: previously stemmed str1/str2, leaving pStr1/pStr2 unused;
    # the suffixed strings were clearly the intended stemming inputs.
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)

    # test String Comparison
    print('\nString Comparisons ---------------------------')
    # BUG FIX: Hamming, Jaro and Jaro-Winkler previously assigned to
    # `result` but printed the stale `r` from the line above; every
    # comparison now assigns and prints the same variable.
    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)
    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)
    # Hamming Distance
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)
    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)
    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)
    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)

    # end program
    print('Done.')
# Python 2 demo of the jellyfish API (print statements, no parentheses).
# NOTE(review): the entire block is pasted twice, except the second copy
# drops the trailing comment on the last line -- likely a paste accident.
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')  #'JLLFSH'

# Second, duplicated copy (re-import is a no-op; prints run again).
import jellyfish
print jellyfish.levenshtein_distance('jellyfish', 'smellyfish')  #2
print jellyfish.jaro_distance('jellyfish', 'smellyfish')  #0.89629629629629637
print jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')  #1
print jellyfish.metaphone('Jellyfish')  #'JLFX'
print jellyfish.soundex('Jellyfish')  #'J412'
print jellyfish.nysiis('Jellyfish')  #'JALYF'
print jellyfish.match_rating_codex('Jellyfish')
def nysiis(s):
    """Return the NYSIIS encoding of `s`, or None when `s` is None.

    Thin None-tolerant wrapper around J.nysiis so callers can pass
    missing values straight through.
    """
    # `is None` is the correct identity test; `== None` invokes __eq__
    # and can misbehave for objects with custom equality.
    return None if s is None else J.nysiis(s)
def apply(self, s):
    """Encode every non-null value of Series `s` with NYSIIS; nulls map
    to NaN."""
    def encode(value):
        return nysiis(value) if pd.notnull(value) else np.nan
    return s.apply(encode)
def transform(self, data):
    """Return the NYSIIS encoding of `data` when it is a string.

    Python 2 code (`basestring`, `unicode`).
    NOTE(review): implicitly returns None for non-string input -- confirm
    callers expect that rather than an error.
    """
    if isinstance(data, basestring):
        return nysiis(unicode(data))
# Jaro Distance # Jaro-Winkler Distance # Match Rating Approach Comparison # Hamming Distance # Phonetic encoding: # American Soundex # Metaphone # NYSIIS (New York State Identification and Intelligence System) # Match Rating Codex import jellyfish print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish')) # 2; 编辑距离 print(jellyfish.jaro_distance('jellyfish', 'smellyfish')) # 0.89629629629629637 print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; 编辑距离, 带翻转的 print(jellyfish.metaphone('Jellyfish')) # 'JLFX' print(jellyfish.soundex('Jellyfish')) # 'J412' print(jellyfish.nysiis('Jellyfish')) # 'JALYF' print(jellyfish.match_rating_codex('Jellyfish')) # 'JLLFSH' ################################################################## ## Lenvenshtein import Levenshtein print(Levenshtein.hamming('hello', 'helol')) # 2; 计算汉明距离; 要求 str1 和 str2 必须长度一致; 是描述两个等长字串之间对应位置上不同字符的个数 print(Levenshtein.distance('hello', 'helol')) # 2; 计算编辑距离(也成 Levenshtein 距离); 是描述由一个字串转化成另一个字串最少的操作次数, 在其中的操作包括插入 & 删除 & 替换 print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5 print(Levenshtein.ratio('hello', 'helol')) # 0.8; 计算莱文斯坦比; 计算公式 r = (sum - ldist) / sum, 其中 sum 是指 str1 和 str2 字串的长度总和, ldist 是类编辑距离 # 注意: 这里的类编辑距离不是 2 中所说的编辑距离, 2 中三种操作中每个操作+1, 而在此处, 删除、插入依然+1, 但是替换+2 # 这样设计的目的: ratio('a', 'c'), sum=2, 按 2 中计算为(2-1)/2 = 0.5,' a','c'没有重合, 显然不合算, 但是替换操作+2, 就可以解决这个问题 print(Levenshtein.jaro('hello', 'helol')) # 0.9333333333333332; 计算 jaro 距离; 用于健康普查 print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; 计算 Jaro – Winkler 距离
def nysiis(s):
    """Thin wrapper: return the NYSIIS phonetic encoding of `s` via
    jellyfish (raises on non-string input as jellyfish does)."""
    return jellyfish.nysiis(s)