def compare_two_phrases(self, f_phrase, s_phrase, compare_from_end=False):
    """Decide whether two phrases are similar enough at one of their ends.

    Both phrases are concatenated with ``"".join``.  A window is then cut
    from the start of each text (or from the end when ``compare_from_end``
    is true).  The window is as long as the first phrase when that phrase is
    shorter than ``self.small_phrase_border``, otherwise it is
    ``self.small_phrase_border`` characters long.  The Jaro-Winkler
    similarity of the two windows is stored in ``self.last_result``.

    Args:
        f_phrase: Iterable of strings forming the first phrase.
        s_phrase: Iterable of strings forming the second phrase.
        compare_from_end: Compare trailing characters instead of leading
            ones.

    Returns:
        bool: True when the similarity is strictly greater than
        ``self.mistakes_border``.
    """
    first_text = "".join(f_phrase)
    second_text = "".join(s_phrase)

    # Short phrases are compared over the length of the first phrase, longer
    # ones over a fixed-size chunk.
    window = min(len(first_text), self.small_phrase_border)

    if compare_from_end:
        # A zero-width window gives ``[-0:]``, which is the whole string.
        first_part = first_text[-window:]
        second_part = second_text[-window:]
    else:
        first_part = first_text[:window]
        second_part = second_text[:window]

    self.last_result = jellyfish.jaro_winkler_similarity(
        first_part, second_part)
    return bool(self.last_result > self.mistakes_border)
def name_matcher(original_matriz, matriz_to_merge, column_with_nan_spaces, n):
    """Repair municipality names in ``matriz_to_merge`` by fuzzy matching.

    The two frames are outer-merged on ``'Municipality'``.  Rows of the
    merged frame from position ``n`` onwards are treated as names that found
    no exact match; rows whose ``column_with_nan_spaces`` value is NaN are
    the candidates they may be renamed to.  Every unmatched name is replaced,
    in place, by the candidate with the highest Jaro-Winkler similarity (the
    last one wins on ties).

    Args:
        original_matriz: Reference DataFrame with a ``'Municipality'`` column.
        matriz_to_merge: DataFrame whose ``'Municipality'`` values are fixed
            in place.
        column_with_nan_spaces: Name of the merged column whose NaN values
            mark candidate rows (must be numeric, ``np.isnan`` is applied).
        n: Position of the first merged row considered unmatched.

    Returns:
        None.  ``matriz_to_merge`` is modified in place and progress is
        printed to stdout.
    """
    # Combine the dataframes on the municipality name.
    final_with_errors = pd.merge(original_matriz, matriz_to_merge,
                                 on='Municipality', how='outer')

    # Take the municipalities that found no match by name.
    matriz_with_wrong_names = final_with_errors.iloc[n:, :]
    print("==================================================")
    print("Matriz con nombres equivocados")
    print("==================================================")
    print(matriz_with_wrong_names)

    matriz_with_blanks = final_with_errors[np.isnan(
        final_with_errors[column_with_nan_spaces])]
    print("==================================================")
    print("Matriz con espacios vacíos")
    print("==================================================")
    print(matriz_with_blanks)

    candidates = list(matriz_with_blanks['Municipality'])
    for wrong_name in matriz_with_wrong_names['Municipality']:
        if not candidates:
            # Nothing to match against: keep the name instead of blanking it.
            continue
        score = 0
        winner = ''
        for candidate in candidates:
            # Compute the similarity once per pair.
            current = jf.jaro_winkler_similarity(wrong_name, candidate)
            if current >= score:
                score = current
                winner = candidate
        print(f'{wrong_name} was replaced for {winner}')
        matriz_to_merge.loc[
            matriz_to_merge['Municipality'] == wrong_name,
            'Municipality'] = winner
def __query(self, index: Index, text: str, domains: Set[str]) -> pd.DataFrame:
    """Search ``index`` for ``text`` and rank the hits by string similarity.

    At most six hits are fetched.  Each hit is re-scored with the
    Jaro-Winkler similarity between its lower-cased name and the lower-cased
    ``text``, raised to the power 1.5.  Hits sharing no domain with
    ``domains`` are penalised by ``1 / self.matching_domains_boost``.

    Args:
        index: Index object providing ``searcher()``.
        text: Free-text query.
        domains: Domains of interest; a hit's ``'domains'`` field is read as
            a comma-separated string.

    Returns:
        DataFrame with columns ``raw_score``, ``name``, ``domains_boost`` and
        ``score``, sorted by descending ``score`` with a fresh integer index
        (the ``id`` index is dropped).  An empty DataFrame when there are no
        hits.
    """
    parsed_query = self.name_parser.parse(text)

    with index.searcher() as searcher:
        records = []
        for hit in searcher.search(parsed_query, limit=6):
            hit_domains = set((hit.get('domains') or '').split(','))
            if hit_domains & domains:
                boost = self.matching_domains_boost
            else:
                boost = 1
            records.append({
                'raw_score': hit.score,
                'id': hit['id'],
                'name': hit['name'],
                'domains_boost': boost,
            })

    if not records:
        return pd.DataFrame()

    frame = pd.DataFrame.from_records(records, index='id')
    needle = text.lower()

    def sharpened_similarity(name):
        # "Sharpen" the similarity to make it more intuitive.
        return jellyfish.jaro_winkler_similarity(name.lower(), needle)**1.5

    # Compute an accurate score based on string similarity (lowercased).
    frame['score'] = frame['name'].apply(sharpened_similarity)
    frame['score'] = (frame['score'] * frame['domains_boost']
                      / self.matching_domains_boost)
    ranked = frame.sort_values(by='score', ascending=False)
    return ranked.reset_index(drop=True)
def jaro_winkler_sim(self):
    """Collect every pair of group members that is similar enough.

    Each unordered pair of entries of ``self.group`` is compared, as strings,
    with the Jaro-Winkler similarity.  Pairs scoring at least
    ``self.threshold`` are stored in ``self.cluster`` as two-element lists.

    Returns:
        list: ``self.cluster``, rebuilt from scratch on every call.
    """
    self.cluster = pairs = []
    size = len(self.group)
    for left in range(size):
        first = str(self.group[left])
        for right in range(left + 1, size):
            second = str(self.group[right])
            if jf.jaro_winkler_similarity(first, second) >= self.threshold:
                pairs.append([first, second])
    return self.cluster
def simple_example():
    """Print the result of jellyfish's comparison and phonetic functions.

    The comparison functions are applied to ``'jellyfish'`` and
    ``'smellyfish'``; the phonetic encoders to ``'Jellyfish'``.

    NOTE(review): ``jaro_distance`` and ``jaro_winkler`` look like legacy
    names of ``jaro_similarity`` / ``jaro_winkler_similarity``; confirm that
    the installed jellyfish release still provides them.
    """
    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'
    comparisons = (
        ("jellyfish.levenshtein_distance({}, {}) = {}.",
         "levenshtein_distance"),
        ("jellyfish.damerau_levenshtein_distance({}, {}) = {}.",
         "damerau_levenshtein_distance"),
        ("jellyfish.hamming_distance({}, {}) = {}.",
         "hamming_distance"),
        ("jellyfish.jaro_distance({}, {}) = {}.",
         "jaro_distance"),
        ("jellyfish.jaro_similarity({}, {}) = {}.",
         "jaro_similarity"),
        ("jellyfish.jaro_winkler({}, {}) = {}.",
         "jaro_winkler"),
        ("jellyfish.jaro_winkler_similarity({}, {}) = {}.",
         "jaro_winkler_similarity"),
        ("jellyfish.match_rating_comparison({}, {}) = {}.",
         "match_rating_comparison"),
    )
    for template, func_name in comparisons:
        # The function is looked up only when its line is about to print.
        print(template.format(
            str1, str2, getattr(jellyfish, func_name)(str1, str2)))

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'
    encoders = (
        ("jellyfish.metaphone({}) = {}.", "metaphone"),
        ("jellyfish.soundex({}) = {}.", "soundex"),
        ("jellyfish.nysiis({}) = {}.", "nysiis"),
        ("jellyfish.match_rating_codex({}) = {}.", "match_rating_codex"),
    )
    for template, func_name in encoders:
        print(template.format(ss, getattr(jellyfish, func_name)(ss)))
def fuzzy_contact_name_match(self, search_name, monica_contact_list, my_name, benchmark=0.85):
    """Find the contact whose name best matches ``search_name``.

    Args:
        search_name: Name to look for.
        monica_contact_list: Candidate contact names.
        my_name: The user's own name; it is never matched against contacts.
        benchmark: Threshold handed to ``self.find_max_score_name``.

    Returns:
        Whatever ``self.find_max_score_name`` returns for the computed
        Jaro-Winkler scores, or ``None`` when ``search_name`` equals
        ``my_name``.
    """
    if search_name == my_name:
        # Explicit result for the "it is me" case instead of falling off the
        # end of the function.
        return None

    all_score = [
        jellyfish.jaro_winkler_similarity(search_name, monica_contact)
        for monica_contact in monica_contact_list
    ]
    return self.find_max_score_name(monica_contact_list, all_score, benchmark)
def jaro_winkler_apply(x):
    """Row-wise Jaro-Winkler similarity that tolerates missing values.

    Args:
        x: Indexable pair; ``x[0]`` and ``x[1]`` are the values to compare.

    Returns:
        The similarity of ``x[0]`` and ``x[1]``, or ``np.nan`` when the
        comparison fails and at least one of the two values is null
        according to ``pandas.isnull``.

    Raises:
        Exception: Any comparison error is re-raised when neither value is
            null.
    """
    try:
        return jaro_winkler_similarity(x[0], x[1])
    except Exception:
        # Only a failure caused by missing data is turned into NaN.
        if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
            raise
        return np.nan
def alternative_search(element, script_list, best_match, best_match_script, i):
    '''
    Look for a better match for ``element`` around position ``i``.

    This function, using Jaro-Winkler similarity, is used if NLTK doesn't
    find a sufficiently good match. Overall this improves accuracy.

    Entries of ``script_list`` at distance 0 to 14 from ``i`` are examined,
    nearest first and the lower index before the higher one. Positions
    outside the list are skipped, so an out-of-range ``i`` neither raises
    nor wraps around to the other end of the list.

    Args:
        element: String to match.
        script_list: Sequence of candidate strings.
        best_match: Best similarity found so far; only strictly higher
            scores replace it.
        best_match_script: Candidate belonging to ``best_match``.
        i: Index around which to search.

    Returns:
        tuple: ``(best similarity, matching candidate)``; the inputs are
        returned unchanged when nothing scores higher.
    '''
    bm = best_match
    total = len(script_list)
    for offset in range(15):
        # At offset 0 both directions point at the same entry: visit it once.
        positions = (i,) if offset == 0 else (i - offset, i + offset)
        for position in positions:
            if not 0 <= position < total:
                continue
            score = jellyfish.jaro_winkler_similarity(
                element, script_list[position])
            if score > bm:
                bm = score
                best_match_script = script_list[position]
    return bm, best_match_script
def lemmatize(tokens: List[str]):
    """
    Accepts a list of tokens and pairs every token with its lemma.

    Input(s):
    1) tokens - A list containing the tokens to be lemmatized.

    Output(s):
    1) A list of ``(token, lemma)`` tuples in input order. The lemma is the
       token itself when no lemma could be found, and also for empty tokens.
    """
    lemma_list = []
    for token in tokens:
        if not token:
            # An empty token has no first letter to look up; keep it as is
            # instead of failing on ``token[0]``.
            lemma_list.append(token)
            continue

        bigrams = get_character_ngrams(generate_stem_words(token), 2)
        # The database is queried by the token's first letter; only the first
        # matching record is used.
        options = db.search(where('letter') == token[0])
        options = options[0] if options else options

        if not options:
            lemma_list.append(token)
            continue

        if token in options["words"]:
            # The token is already a known word.
            lemma_list.append(token)
            continue

        # First pass: keep the words whose ``get_distance`` value does not
        # exceed the best one seen so far (initially the token length).
        # NOTE(review): -1 appears to mean "not comparable" -- confirm.
        similarity_score = len(token)
        candidates = []
        for lemma in options["words"]:
            temp = get_distance(token, lemma)
            if (temp != -1) and (temp <= similarity_score):
                similarity_score = temp
                candidates.append(lemma)

        # Second pass: pick the candidate that improves both the n-gram
        # similarity and the Jaro-Winkler similarity.
        similarity_score = 0.0
        jw_similarity_score = 0.0
        add = ""
        for candidate in candidates:
            # NOTE(review): presumably the candidate's stored bigrams.
            cand_big = options["words"][candidate]
            temp = similarity(bigrams, cand_big)
            temp_jw = jaro_winkler_similarity(token, candidate)
            if (temp > similarity_score) and (temp_jw > jw_similarity_score):
                similarity_score = temp
                jw_similarity_score = temp_jw
                add = candidate

        # Accept the candidate only when its n-gram similarity rounds to 1.
        if round(similarity_score) == 1:
            lemma_list.append(add)
        else:
            lemma_list.append(token)

    return list(zip(tokens, lemma_list))
def mra_1_to_all(word, all_words, threshold):
    """Return the words of ``all_words`` that are similar to ``word``.

    A word qualifies when it differs from ``word``, is accepted by the Match
    Rating Comparison (similarity on MRA hashes) and has a Jaro-Winkler
    similarity of at least ``threshold``.

    Args:
        word: Reference word.
        all_words: Iterable of candidate words.
        threshold: Minimum Jaro-Winkler similarity (inclusive).

    Returns:
        list: Qualifying candidates in input order (duplicates are kept).
    """
    return [
        candidate
        for candidate in all_words
        if not word == candidate  # skip -- same word
        and jellyfish.match_rating_comparison(word, candidate)
        and jellyfish.jaro_winkler_similarity(word, candidate) >= threshold
    ]
def measure_distance(word1, word2, distance_type):
    """Measure how far apart two words are with the selected metric.

    Args:
        word1: First word.
        word2: Second word.
        distance_type: One of ``'lv'`` (Levenshtein), ``'dlv'``
            (Damerau-Levenshtein), ``'jw'`` (Jaro-Winkler), ``'j'`` (Jaro)
            or ``'hm'`` (Hamming).

    Returns:
        The metric value. For ``'jw'`` and ``'j'`` the similarity is negated
        so that, as with the true distances, a smaller value means closer
        words.

    Raises:
        ValueError: If ``distance_type`` is not one of the supported codes.
    """
    if distance_type == 'lv':
        return Levenshtein.eval(word1, word2)
    if distance_type == 'dlv':
        return jellyfish.damerau_levenshtein_distance(word1, word2)
    if distance_type == 'jw':
        # Jaro-Winkler is a similarity, so its sign is flipped.
        return -jellyfish.jaro_winkler_similarity(word1, word2)
    if distance_type == 'j':
        return -jellyfish.jaro_similarity(word1, word2)
    if distance_type == 'hm':
        return jellyfish.hamming_distance(word1, word2)
    raise ValueError(
        "Unknown distance_type {!r}; expected one of "
        "'lv', 'dlv', 'jw', 'j', 'hm'".format(distance_type))
def fuzzy_match(s1, s2, max_dist=.8):
    '''
    Tell whether two strings are a fuzzy match.

    Args:
        s1: string: First string
        s2: string: Second string
        max_dist: float: Despite its name, the minimum Jaro-Winkler
            similarity required for a match - default: 0.8

    Returns:
        bool: True when the jellyfish jaro_winkler_similarity of the two
        strings is at least ``max_dist``; see
        https://en.wikipedia.org/wiki/Jaro-Winkler_distance
    '''
    similarity = jellyfish.jaro_winkler_similarity(s1, s2)
    return similarity >= max_dist
def get_nearest_string_from_list(string, string_list, threshold=0.75):
    """Return the entry of ``string_list`` that is most similar to ``string``.

    The comparison is case-insensitive and uses the Jaro-Winkler similarity.
    Every comparison and the final match are printed to stdout.

    Args:
        string: Text to look for.
        string_list: Iterable of candidate strings.
        threshold: Similarity a candidate has to exceed (strictly).

    Returns:
        tuple: ``(best candidate, its similarity)``.

    Raises:
        ValueError: If no candidate scores above ``threshold``.
    """
    matching_item = None
    closest_dist = threshold
    needle = string.lower()  # loop-invariant, lower-case it once
    for list_item in string_list:
        dist = jellyfish.jaro_winkler_similarity(needle, list_item.lower())
        if dist > closest_dist:
            matching_item = list_item
            closest_dist = dist
        print(f" {string} {list_item} {dist}")

    if matching_item is None:
        print("No Match Found.")
        raise ValueError(
            f"no match found for {string!r} above threshold {threshold}")

    print(f"Match: {string} {matching_item} {closest_dist}")
    return matching_item, closest_dist
def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
    """
    Compare two input texts using the chosen distance or similarity measure.

    :param texto1: First text to compare.
    :type texto1: str
    :param texto2: Second text to compare.
    :type texto2: str
    :param tipo: Comparison criterion. It is matched case-insensitively as \
        a substring, so e.g. `'winkler'` also selects Jaro-Winkler. \
        Default value `'levenshtein'`.
    :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming', \
        'jaro_winkler', 'jaro'}, optional
    :param norm: Normalises distance results by text length. With \
        `norm = 1` the result is divided by the length of the shorter \
        text, with `norm = 2` by the length of the longer one. The Jaro and \
        Jaro-Winkler similarities are never normalised.
    :type norm: {1,2}, optional
    :return: (float) Result of comparing `texto1` and `texto2`, or `None` \
        (after printing a message) when `tipo` matches no known criterion.
    :raises ZeroDivisionError: When normalisation divides by the length of \
        an empty text.
    """
    tipo = tipo.lower()
    # Order matters because of the substring matching: 'damerau' must be
    # tested before 'levenshtein' and 'winkler' before 'jaro'.
    is_similarity = False
    if "damerau" in tipo:
        salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
    elif "levenshtein" in tipo:
        salida = jellyfish.levenshtein_distance(texto1, texto2)
    elif "hamming" in tipo:
        salida = jellyfish.hamming_distance(texto1, texto2)
    elif "winkler" in tipo:
        salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
        is_similarity = True
    elif "jaro" in tipo:
        salida = jellyfish.jaro_similarity(texto1, texto2)
        is_similarity = True
    else:
        print(
            (
                "Por favor seleccione un criterio válido "
                "para comparar los strings."
            )
        )
        return None

    # Decide on the metric actually used, not on the spelling of `tipo`, so
    # that a similarity is never divided by a length.
    if norm in [1, 2] and not is_similarity:
        lengths = (len(texto1), len(texto2))
        salida /= min(lengths) if norm == 1 else max(lengths)
    return salida
def get_noise_results(self, orig_word, list_of_similar):
    """Select candidate words that are close enough to ``orig_word``.

    A candidate is kept when it differs from ``orig_word``, has not been
    kept already and its Jaro-Winkler similarity is at least
    ``self.threshold``.

    Args:
        orig_word: Reference word.
        list_of_similar: Iterable of candidate words.

    Returns:
        list: ``(word, score)`` tuples in input order, cut to the first
        ``self.cnt_error_samples`` entries. The list is not sorted by score.
    """
    similar = []
    seen = set()
    for candidate in list_of_similar:
        if orig_word == candidate or candidate in seen:
            continue
        score = jellyfish.jaro_winkler_similarity(orig_word, candidate)
        if score >= self.threshold:
            similar.append((candidate, score))  # word, score
            seen.add(candidate)

    limit = self.cnt_error_samples
    # Few or zero results are returned as they are; otherwise only the first
    # ``limit`` entries are kept.
    return similar if len(similar) < limit else similar[:limit]
def string_similarity(string1, string2):
    """
    Compute the Jaro-Winkler similarity of two values compared as strings.

    Args:
        string1 (str): First string to compare
        string2 (str): Second string to compare

    Returns:
        float: How similar the strings are, from 0.0 to 1.0

    Examples:
        >>> string_similarity('string 1', 'string 1')
        1.0
        >>> string_similarity('string 1', 'string 2')
        0.95
        >>> string_similarity('abc', 'bcd')
        0.0
        >>> string_similarity('apple', 'appel')
        0.9533333333333333
    """
    first = str(string1)
    second = str(string2)
    return jaro_winkler_similarity(first, second)
def _similarity_compare(words: list, compare_to): if type(compare_to) != list: compare_to = [compare_to] points = 0 running_total = 0 for index, item in enumerate(compare_to): if len(item.split(" ")) > 1: compare_to.pop(index) [compare_to.append(c) for c in item.split(" ")] for c in compare_to: for w in words: score = jellyfish.jaro_winkler_similarity(w.lower(), c.lower()) if score != 0.0: running_total += score points += 1 if points != 0: running_total /= points return running_total
def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
    """
    Compare two input texts using the chosen distance or similarity measure.

    :param texto1: (str) First text to compare.
    :param texto2: (str) Second text to compare.
    :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming', \
        'jaro_winkler', 'jaro'} Default value: 'levenshtein'. Comparison \
        criterion. It is matched case-insensitively as a substring, so \
        e.g. 'winkler' also selects Jaro-Winkler.
    :param norm: (int) {1, 2} Default value: None. Normalises distance \
        results by text length. With norm=1 the result is divided by the \
        length of the shorter text, with norm=2 by the length of the longer \
        one. The Jaro and Jaro-Winkler similarities are never normalised.
    :return: (float or int) Result of the comparison, or None (after \
        printing a message) when `tipo` matches no known criterion.
    :raises ZeroDivisionError: When normalisation divides by the length of \
        an empty text.
    """
    tipo = tipo.lower()
    # (keyword, jellyfish function name, result is a distance). Keywords are
    # matched as substrings in this order, so 'damerau' has to come before
    # 'levenshtein' and 'winkler' before 'jaro'.
    criteria = (
        ('damerau', 'damerau_levenshtein_distance', True),
        ('levenshtein', 'levenshtein_distance', True),
        ('hamming', 'hamming_distance', True),
        ('winkler', 'jaro_winkler_similarity', False),
        ('jaro', 'jaro_similarity', False),
    )
    for keyword, function_name, is_distance in criteria:
        if keyword in tipo:
            salida = getattr(jellyfish, function_name)(texto1, texto2)
            break
    else:
        print(
            'Por favor seleccione un criterio válido para comparar los strings.'
        )
        return None

    # Decide on the metric actually used, not on the spelling of `tipo`, so
    # that a similarity is never divided by a length.
    if is_distance and norm in [1, 2]:
        if norm == 1:
            salida /= min(len(texto1), len(texto2))
        else:
            salida /= max(len(texto1), len(texto2))
    return salida
def jaro_winkler_distance(A, B):
    """Return one minus the Jaro-Winkler similarity of ``A`` and ``B``."""
    similarity = jaro_winkler_similarity(A, B)
    return 1 - similarity
def jaroDistance(x, y):
    """Return one minus the Jaro-Winkler similarity of ``x`` and ``y``.

    Despite the name, the Winkler-adjusted similarity is used, not the plain
    Jaro similarity.
    """
    return 1 - jellyfish.jaro_winkler_similarity(x, y)
print(f"Node {node1} is not found in network `{network}`.") try: node2_neighbors.extend( [x for x in networks_dict[network].neighbors(node2)]) except nx.exception.NetworkXError: print(f"Node {node2} is not found in network `{network}`.") return (sorted(list(set(node1_neighbors))), sorted(list(set(node2_neighbors)))) similar_names = [] for name in performer_names: for cmp in [ x for x in performer_names if not x == name and not "unnamed" in x.lower() ]: fsh = jellyfish.jaro_winkler_similarity(name, cmp) if fsh > THRESHOLD: if (not (name, cmp, fsh) in similar_names and not (cmp, name, fsh) in similar_names): neighbors1, neighbors2 = compare_neighbors( name, cmp, networks) similar_names.append( (name, cmp, fsh, neighbors1, neighbors2)) file_name = f"{PREFIX}-report-similar-names.json" if similar_names: with open("network-app/data/" + file_name, "w+") as fp: json.dump(obj=similar_names, fp=fp) print(
def getTermScore(node):
    """Score how closely a node's term matches the searched term.

    The entries of ``node.cleanSearchedTerm`` and ``node.cleanTermTokens``
    are each joined with single spaces and the two resulting strings are
    compared with the Jaro-Winkler similarity.
    """
    separator = ' '
    searched = separator.join(node.cleanSearchedTerm)
    candidate = separator.join(node.cleanTermTokens)
    return jaro_winkler_similarity(searched, candidate)
def jar(self):
    """Return the Jaro-Winkler similarity between the English word of
    ``self.translatable_word`` and ``self.readable_word``."""
    english = self.translatable_word.english_word
    return jellyfish.jaro_winkler_similarity(english, self.readable_word)
def jaro_winkler(a, b):
    """Return the jellyfish Jaro-Winkler similarity of ``a`` and ``b``."""
    score = jf.jaro_winkler_similarity(a, b)
    return score
return float(numerator) / denominator def textToVector(text): words = WORD.findall(text) return Counter(words) second = open("reinterpreted_file_1.txt", "r") first = open("reinterpreted_file_2.txt", "r") text1 = second.read() text2 = first.read() vector1 = textToVector(text1) vector2 = textToVector(text2) cosine = calculateCosineSimilarity(vector1, vector2) data = { 'cosine': cosine, 'jaro_similarity': jellyfish.jaro_similarity(text1, text2), 'jaro_winkler_similarity': jellyfish.jaro_winkler_similarity(text1, text2), 'levenshtein_distance': jellyfish.levenshtein_distance(text1, text2), 'damerau_levenshtein_distance': jellyfish.damerau_levenshtein_distance(text1, text2), 'hamming_distance': jellyfish.hamming_distance(text1, text2) } with open('results.txt', 'w') as outfile: json.dump(data, outfile)
def submit_answers(request, langy_session_id):
    """Grade the answers of a LangySession and record learning traces.

    Only POST is accepted. The request body is JSON with an ``'answers'``
    list whose items carry ``'translation_id'`` and ``'user_english'``.
    The session's ``end_time`` is set to the current time before the body is
    read.

    An answer is graded case-insensitively against the translation's English
    word or, when synonyms exist, against whichever of them is closest to
    the user's input (Jaro-Winkler). A missing or additional trailing 's'
    and a Damerau-Levenshtein distance of exactly 1 are accepted as typos.

    For every graded answer that has a previous LearningTrace for the user's
    active foreign language, a new LearningTrace is created.

    Returns:
        JsonResponse ``{'results': [...]}`` with ``translation_id``,
        ``true_english``, ``correct`` and ``typo`` per answer, or
        HttpResponseBadRequest for a non-POST request or a body without
        usable answers. Unknown session or translation ids are handled by
        ``get_object_or_404``.
    """
    if request.method != 'POST':
        return HttpResponseBadRequest('Invalid request method')

    # Get, update and save LangySession
    langy_session = get_object_or_404(LangySession, pk=langy_session_id)
    langy_session.end_time = timezone.now()
    langy_session.save()

    # Get data from the request; a malformed body or a missing 'answers' key
    # is a client error, not a server error.
    try:
        answers = json.loads(request.body)['answers']
    except (ValueError, KeyError, TypeError):
        return HttpResponseBadRequest('No answers received in request')
    if not answers:
        return HttpResponseBadRequest('No answers received in request')

    # Prepare to create a response with results and create new LearningTraces
    response_results = []
    for answer in answers:
        translation = get_object_or_404(Translation, pk=answer['translation_id'])

        # Get user answer, ignore capitalisation
        user_english = answer['user_english'].lower()

        # Get correct answer(s). Synonyms are lower-cased like the user's
        # answer; otherwise a capitalised synonym could never match exactly.
        true_english = translation.translatable_word.english_word.lower()
        synonyms = [
            syn.english_word.lower()
            for syn in translation.translatable_word.synonyms.all()
        ]
        if synonyms:
            # Find the closest word to the user's input
            max_sim = jellyfish.jaro_winkler_similarity(user_english, true_english)
            for syn in synonyms:
                sim = jellyfish.jaro_winkler_similarity(user_english, syn)
                if sim > max_sim:
                    max_sim = sim
                    true_english = syn

        # Evaluate user answer
        correct = user_english == true_english
        typo = False

        # Typos: plurals
        # Allow missing or additional 's'
        # Some foreign words e.g. Swedish "djur" (animal/animals) are the
        # same for singular/plural
        if (user_english == true_english + 's'
                or user_english + 's' == true_english):
            correct = True
            typo = True

        # Typos: typing error tolerance
        # Allow one accidental character insertion, deletion, substitution
        # or transposition
        if jellyfish.damerau_levenshtein_distance(user_english, true_english) == 1:
            correct = True
            typo = True

        # Add result to list for response
        response_results.append({
            'translation_id': answer['translation_id'],
            'true_english': true_english,
            'correct': correct,
            'typo': typo,
        })

        # Prepare to create a new LearningTrace
        # Find previous LearningTrace object for this Translation
        prev = (request.user.traces
                .filter(translation=translation)
                .filter(translation__foreign_language=request.user.active_language.foreign_language)
                .last())
        if prev is None:
            continue  # next answer

        LearningTrace.objects.create(
            session=langy_session,
            user=request.user,
            # Tracing
            translation=translation,
            prev=prev,
            # Statistics
            seen=prev.seen + 1,
            interacted=prev.interacted,
            tested=prev.tested + 1,
            correct=prev.correct + 1 if correct else prev.correct,
        )

    return JsonResponse({
        'results': response_results
    })
# Report likely duplicate author names found in 'dump-sorted-uniq.txt'.
# The file is read as a whole and split on ", "; every pair of authors whose
# Jaro-Winkler similarity exceeds 0.94 is printed.
with open('dump-sorted-uniq.txt', mode='r') as inFile:
    lines = inFile.read()
print(lines)

myAuthors = []
count = 0

# split the input on ", "
currLineCommaSep = lines.split(", ")
print("Quote separated: "+str(currLineCommaSep))

# parse input -- every non-empty piece without a stray comma is a new author
# to add to the list
for newAuthor in currLineCommaSep:
    # Compare by value: `is not` against a string literal tests identity and
    # only works by accident of string interning.
    if newAuthor != "" and newAuthor != "\n" and "," not in newAuthor:
        print("Found author #"+str(count)+" : "+str(newAuthor))
        # add each author to list
        myAuthors.append(newAuthor.strip())
        count = count + 1

numAuthors = len(myAuthors)

# now that we have all authors, run deduplication -- print out likely matches
for currAuthor in range(numAuthors):
    # compare author with all subsequent authors, searching for close matches
    for restOfAuthors in range(currAuthor + 1, numAuthors):
        similarity = jellyfish.jaro_winkler_similarity(
            myAuthors[currAuthor], myAuthors[restOfAuthors])
        # arbitrary threshold at the moment -- anecdotally, anything less
        # than this leads to a large number of false positives
        if similarity > 0.94:
            print("Similarity ("+str(similarity)+"): "+myAuthors[currAuthor]+", "+myAuthors[restOfAuthors])
def jaro_winkler_similarity(s1, s2):
    """Jaro-Winkler similarity that passes missing values through.

    Args:
        s1: First string, or ``None``.
        s2: Second string, or ``None``.

    Returns:
        ``None`` when either argument is ``None``, otherwise the result of
        ``J.jaro_winkler_similarity(s1, s2)``.
    """
    # Identity test: ``== None`` would call a custom ``__eq__`` on the
    # arguments and can give a wrong or non-boolean answer.
    if s1 is None or s2 is None:
        return None
    return J.jaro_winkler_similarity(s1, s2)