def three_recommended_items(request, id):
    """
    Builds the list of products recommended to the user, taking care not to
    recommend the product itself. Uses the textdistance package and
    Levenshtein similarity.
    :param id: id of the product
    :return: queryset of the recommended products
    """
    import textdistance
    from django.db.models import Q

    all_products = Product.objects.all()
    nome = Product.objects.get(id=id).name
    user_products = Product.objects.filter(user__email=request.user.email)
    all_products = all_products.difference(user_products)

    all_products_names = [p.name for p in all_products]
    if nome in all_products_names:
        all_products_names.remove(nome)
    # names of the user's own products, used as the similarity reference
    user_products_names = [p.name for p in user_products]

    if len(all_products_names) < 3 or not user_products_names:
        return 0

    # rank the candidates by Levenshtein distance to the user's first
    # product and keep the three closest names
    reference = user_products_names[0]
    closest = sorted(
        all_products_names,
        key=lambda name: textdistance.levenshtein(reference, name))[:3]
    return Product.objects.filter(
        Q(name=closest[0]) | Q(name=closest[1]) | Q(name=closest[2]))
def dictionary_searcher(query, dictionary):
    # Let's find the most similar disease to what we've got
    keys = list(dictionary.keys())
    nearest_key = keys[0]
    min_distance = textdistance.levenshtein(query, nearest_key)
    for dict_key in keys[1:]:
        distance = textdistance.levenshtein(query, dict_key)
        if distance < min_distance:
            min_distance = distance
            nearest_key = dict_key
    return nearest_key, dictionary[nearest_key]
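# Hedged usage sketch for dictionary_searcher (the dictionary below is
# hypothetical): a misspelled query resolves to the closest key.
diseases = {'influenza': 'viral', 'pneumonia': 'bacterial'}
print(dictionary_searcher('influensa', diseases))  # ('influenza', 'viral')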
def levenshtein_accuracy(input, output, token):
    if token:
        gap = len(output) - len(input)
        if gap == 0:
            # same number of tokens: average the per-token edit distances
            total = 0
            for i in range(len(input)):
                total += td.levenshtein(input[i], output[i])
            accuracy = total / len(input)
        else:
            # token counts differ: fall back to the joined-string comparison
            # used in the non-token path below (assumption; the source left
            # the unequal-length branches unfinished)
            accuracy = td.levenshtein.normalized_similarity(
                ' '.join(input), ' '.join(output))
    else:
        input_string = ' '.join(input)
        output_string = ' '.join(output)
        accuracy = td.levenshtein.normalized_similarity(
            input_string, output_string)
    return accuracy
def merge_substrings(entities):
    """
    This function eliminates entities which are already substrings of other
    entities.

    e.g.:
    input:  ['Ana Lourenço', 'Ana Dias Lourenço', 'Ana Afonso Dias Lourenço']
    output: ['Ana Afonso Dias Lourenço']

    Based on the principle that if a polysemous word appears two or more
    times in a written discourse, it is extremely likely that they will all
    share the same sense. (see: https://www.aclweb.org/anthology/H92-1045.pdf)
    """
    new_entities = []
    # sort the entities by length
    entities_sorted = sorted(
        [EntityLinking.clean_entity(x) for x in entities], key=len)
    # starting with the shortest one, see if it's a substring (or within
    # edit distance 3) of any of the longer ones
    for idx, x in enumerate(entities_sorted):
        found = False
        for other in entities_sorted[idx + 1:]:
            if x in other or textdistance.levenshtein(x, other) <= 3:
                found = True
                break
        if not found and x not in new_entities:
            new_entities.append(x)
    return new_entities
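# A minimal, standalone illustration of the matching rule in merge_substrings:
# an entity is merged away when it is a contiguous substring of a longer one,
# or within Levenshtein distance 3 of it.
import textdistance

longer = 'Ana Afonso Dias Lourenço'
print('Dias Lourenço' in longer)                                   # True: merged
print(textdistance.levenshtein('Ana Dias Lourenço', longer) <= 3)  # False (distance 7)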
def getRegionTypoRecommendation(s):
    r = getRegionNames()
    s2 = s.lower() if s.isupper() else s
    return min(r, key=lambda x: levenshtein(s2, x))
def BFS(self, letters, words, index):
    # Recursively checks whether the remaining letters can be matched
    # against the remaining words.
    if len(letters) == 0:
        return index
    if letters[0] != words[0][0]:
        return -1
    if len(letters) == 1:
        return index + 1
    if len(words) == 1:
        # the remaining letters must all be deletions from the last word
        if textdistance.levenshtein(
                words[0], letters) == len(words[0]) - len(letters):
            return index + 1
        else:
            return -1
    queue = []
    queue.append((letters[1:], words[1:]))
    j = 1
    for i in range(1, len(letters)):
        if j != len(words[0]) and letters[i] in words[0][j:]:
            queue.append((letters[i + 1:], words[1:]))
            j += 1
    while len(queue) != 0:
        (l, w) = queue.pop(0)
        temp = self.BFS(l, w, index + 1)
        if temp == -1 and len(queue) == 0:
            return temp
        elif temp != -1:
            return temp
def distanceLevenhstein(centroid: str,
                        mutants: List[str],
                        latent_mutants: np.ndarray = None,
                        latent_mutants_encoder: np.ndarray = None) -> Dict:
    lev = [levenshtein(centroid, mutant) for mutant in mutants]
    lev_min = float(np.min(lev))
    lev_max = float(np.max(lev))
    avg = float(np.mean(lev))
    var = float(np.var(lev))
    rang = lev_max - lev_min
    if latent_mutants is not None:
        latent_distances = pdist(latent_mutants, 'euclidean')
        latent_distances_encoder = pdist(latent_mutants_encoder, 'euclidean')
        corrOuts = corrLevenshtein(mutants, latent_distances,
                                   latent_distances_encoder)
    else:
        corrOuts = {}
    return {
        **corrOuts,
        'avg_l': avg,
        'var_l': var,
        'max_l': lev_max,
        'min_l': lev_min,
        'rang_l': rang,
    }
def ref_levenshtein(parser_N, src_name, ld_max):
    """
    Info:
        Calculates the adjacency matrix with the plain textdistance
        Levenshtein distance, to verify that our algorithm is correct.
    Args:
        parser_N: number of sequences to load
        src_name: name of the sequence source
        ld_max: distance threshold below which two sequences count as adjacent
    Returns:
        - (writes the adjacency matrix to two_number_indices_ref.txt)
    """
    plotData.clusterMinLen = 1
    plotData.N = parser_N  # 10**6 / 4*10**3
    plotData.min_ldVal = -1
    plotData.maxVal = 3
    plotData.max_ldVal = plotData.maxVal
    gpu_l = 8000  # 5000
    step = 0
    _, _, seq, _, _ = dic.loadSequence(step, plotData, isExtractNum=False,
                                       src=src_name)
    a = np.zeros([len(seq), len(seq)])
    for i in range(len(seq)):
        for j in range(len(seq)):
            dist = td.levenshtein(seq[i], seq[j])
            a[i][j] = 1 if dist < ld_max else 0
    name = "two_number_indices_ref.txt"
    with open(name, 'w') as f:
        np.savetxt(f, a, fmt='%i')
    print("\n saved under: ", name)
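# Tiny self-contained illustration of the thresholding in ref_levenshtein:
# sequences with distance below ld_max are marked adjacent (1), others 0.
import numpy as np
import textdistance as td

seq = ['ACGT', 'ACGA', 'TTTT']
ld_max = 2
a = np.array([[1 if td.levenshtein(s, t) < ld_max else 0 for t in seq]
              for s in seq])
print(a)  # [[1 1 0] [1 1 0] [0 0 1]]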
def find_route_list(list_eng, route, check=0):
    # check determines whether the function counts the matched destinations
    # (check == 0) or collects the list of matched destinations (check == 1)
    count = 0
    index = 0
    for i in range(len(list_eng)):
        ratio = 0
        for j in range(len(route)):
            # similarity ratio between the strings
            test_ratio = sm(None, list_eng[i], route[j]).ratio()
            # Levenshtein distance between the strings
            val = td.levenshtein(list_eng[i], route[j])
            if test_ratio > 0.73 and test_ratio > ratio and val < 5:
                ratio = test_ratio
                index = j  # index of the most probable destination
        if ratio > 0.73:
            if check == 0:
                count = count + 1
            elif check == 1:
                list_final.append(route[index])
    if check == 0:
        count_final.append(count)
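# Standalone sketch of the dual test in find_route_list, assuming sm is
# difflib.SequenceMatcher as in the snippet (hypothetical stop names): a
# candidate matches only when the similarity ratio exceeds 0.73 AND the
# Levenshtein distance is under 5.
from difflib import SequenceMatcher as sm
import textdistance as td

a, b = 'Majestic Bus Stand', 'Majestic Bus Stop'
print(sm(None, a, b).ratio() > 0.73, td.levenshtein(a, b) < 5)  # True True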
def levenshtein(c0, centres, dim):
    # note: dim is not used
    c0 = tuple(c0)
    distances = np.empty(len(centres))
    for idx, c1 in enumerate(centres):
        c1 = tuple(c1)
        distances[idx] = textdistance.levenshtein(c0, c1)
    return distances
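# textdistance.levenshtein accepts arbitrary hashable sequences, not just
# strings, so the helper above can compare cluster centres as tuples of
# numbers (hypothetical data).
import numpy as np

centres = np.array([(1, 2, 4), (1, 2, 3)])
print(levenshtein((1, 2, 3), centres, dim=3))  # [1. 0.]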
def levenshtein(string1: str, string2: str) -> int:
    """The minimum number of single-character edits (insertions, deletions
    or substitutions) required to change one word into the other.

    https://en.wikipedia.org/wiki/Levenshtein_distance
    """
    dist: int = textdistance.levenshtein(string1, string2)
    return dist
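# Quick check of the definition above: 'kitten' -> 'sitting' needs three
# edits (substitute k->s, substitute e->i, insert g), so the distance is 3.
print(levenshtein('kitten', 'sitting'))  # 3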
def get_recruit_tags(img):
    import textdistance
    vw, vh = common.get_vwvh(img)
    # crop the five tag regions (three on the top row, two on the bottom)
    tagimgs = [
        img.crop((50 * vw - 36.481 * vh, 50.185 * vh, 50 * vw - 17.315 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw - 13.241 * vh, 50.185 * vh, 50 * vw + 6.111 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw + 10.000 * vh, 50.185 * vh, 50 * vw + 29.259 * vh, 56.111 * vh)).convert('L'),
        img.crop((50 * vw - 36.481 * vh, 60.278 * vh, 50 * vw - 17.315 * vh, 66.019 * vh)).convert('L'),
        img.crop((50 * vw - 13.241 * vh, 60.278 * vh, 50 * vw + 6.111 * vh, 66.019 * vh)).convert('L'),
    ]
    tagimgs = [
        Image.fromarray(
            cv2.threshold(img.array, 127, 255, cv2.THRESH_BINARY_INV)[1])
        for img in tagimgs
    ]
    eng = ocr.acquire_engine_global_cached('zh-cn')
    recognize = lambda img: eng.recognize(
        img, int(vh * 20), hints=[ocr.OcrHint.SINGLE_LINE],
        char_whitelist=known_tagchars).text.replace(' ', '')
    cookedtags = []
    for img in tagimgs:
        logger.logimage(img)
        tag = recognize(img)
        logger.logtext(tag)
        if not tag:
            continue
        if tag in known_tags:
            cookedtags.append(tag)
            continue
        # autocorrect the OCR output to the nearest known tag
        distances = [(target, textdistance.levenshtein(tag, target))
                     for target in known_tags.difference(cookedtags)]
        distances.sort(key=lambda x: x[1])
        mindistance = distances[0][1]
        matches = [x[0] for x in distances if x[1] == mindistance]
        if mindistance > 2:
            logger.logtext('autocorrect: minimum distance %d too large' % mindistance)
            cookedtags.append(tag)
        elif len(matches) == 1:
            logger.logtext('autocorrect to %s, distance %d' % (matches[0], mindistance))
            cookedtags.append(matches[0])
        else:
            logger.logtext('autocorrect: failed to match in %s with distance %d'
                           % (','.join(matches), mindistance))
            cookedtags.append(tag)
    return cookedtags
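# Standalone sketch of the autocorrect rule in get_recruit_tags (hypothetical
# tag set): accept the unique nearest known tag when its distance is at most
# 2, otherwise keep the raw OCR output.
import textdistance

known = {'治疗', '输出', '生存'}
tag = '冶疗'  # hypothetical OCR misread of '治疗'
distances = sorted((textdistance.levenshtein(tag, t), t) for t in known)
d0 = distances[0][0]
matches = [t for d, t in distances if d == d0]
print(matches[0] if d0 <= 2 and len(matches) == 1 else tag)  # '治疗'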
def get_reward(reference, hypothesis, source):
    # return gleu_calc.sentence_gleu(hypothesis, reference, source)
    try:
        reward = reward_function((reference, hypothesis))
    except Exception:
        # fall back to negative edit distance if the reward function fails
        return -textdistance.levenshtein(reference, hypothesis)
    return reward
def motSimilaire(motCle, Dico):
    liste = []
    if len(motCle) > 2:
        for key in Dico.keys():
            if textdistance.levenshtein(motCle, key) < 2:
                liste.append(key)
    else:
        liste.append(motCle)
    return liste
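# Hedged usage sketch for motSimilaire (hypothetical dictionary): the < 2
# threshold keeps only keys within one edit of the keyword.
import textdistance

Dico = {'chien': 1, 'chat': 2, 'chats': 3}
print(motSimilaire('chat', Dico))  # ['chat', 'chats'] ('chien' is 3 edits away)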
def Levenshtein(str1, match_against):
    best_match = ['', float('inf')]
    str_comparison = [[x, textdistance.levenshtein(str1, x)]
                      for x in match_against]
    for item in str_comparison:
        if item[1] < best_match[1]:
            best_match = item
    return best_match
def findClosestStopName(self, input):
    stoplist = self.stopslist
    input = input.lower().replace(" ", "_")
    scores = [
        textdistance.levenshtein(input, x[0].lower().replace(" ", "_"))
        for x in stoplist
    ]
    best_score = min(scores)
    best_name = stoplist[scores.index(best_score)]
    return [best_name, best_score]
def get_winner(self, notes):
    top_chords = self.top_chords.copy()
    query = notes_to_chroma(librosa.midi_to_note(notes))
    top_chords['chroma_dist'] = np.array([
        textdistance.levenshtein(query, chord)
        for chord in top_chords.chroma.values
    ])
    top_chords['dist'] = np.array([
        textdistance.levenshtein(notes, chord)
        for chord in top_chords.midi_notes
    ])
    # invert so that a larger chroma_dist means a closer chroma match
    top_chords['chroma_dist'] = (top_chords.chroma_dist.max()
                                 - top_chords.chroma_dist)
    candidates = top_chords[
        top_chords.dist == top_chords.dist.min()].sort_values(
            ['dist', 'chroma_dist'], ascending=False)
    winner = candidates.iloc[0]
    return winner.midi_notes
def _check_lieferando(entry_name: str, city: str) -> Optional[float]:
    probabilities = list()
    for e in restaurants:
        levenshtein_distance = textdistance.levenshtein(entry_name.lower(),
                                                        e.lower())
        # normalize into [0, 1]: 1.0 is an exact match
        score = max(1 - (levenshtein_distance / len(e)), 0)
        probabilities.append(score)
    return max(probabilities)
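# Worked example of the score above (hypothetical restaurant names): one edit
# against a nine-character name gives max(1 - 1/9, 0) ≈ 0.889.
import textdistance

d = textdistance.levenshtein('pizza hut', 'pizza hat')  # 1
print(max(1 - d / len('pizza hat'), 0))                 # 0.888...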
def distance(ground_truth, recognised_text):
    """Return the normalized number of edits between tokens.

    Edits are removals, insertions and replacements, weighted equally.
    The number of edits is divided by the number of tokens (in the
    ground_truth). A lower number is better.
    """
    yield textdistance.levenshtein(ground_truth,
                                   recognised_text) / len(ground_truth)
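# Hedged usage sketch of the generator above: textdistance.levenshtein also
# accepts token lists, so the edits counted here are whole-token removals,
# insertions and replacements (hypothetical transcripts).
gt = ['the', 'quick', 'brown', 'fox']
rec = ['the', 'quick', 'browm', 'fox', 'jumps']
print(next(distance(gt, rec)))  # 2 edits / 4 tokens = 0.5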
def compare(self, str1, str2):
    if self.debug:
        self.log("levenshtein comparison")
    self.start_time()
    self.result.distance = levenshtein(str1, str2)
    self.end_time()
    self.result.nos = max(len(str1), len(str2))
    self.result.threshold = 90
    # convert the edit distance into a percentage similarity
    self.result.similarity = (100.0 / float(self.result.nos)) * (
        self.result.nos - self.result.distance)
    return self.result
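# Worked example of the percentage formula in compare(): 'sunday' vs
# 'saturday' has distance 3 and longer length 8, so the similarity is
# (100 / 8) * (8 - 3) = 62.5.
import textdistance

d = textdistance.levenshtein('sunday', 'saturday')  # 3
nos = max(len('sunday'), len('saturday'))           # 8
print((100.0 / nos) * (nos - d))                    # 62.5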
def rank_hypotheses(toks, hypotheses, grams, leven_penalty=0.2, order=5):
    scores = []
    for hypo in hypotheses:
        scores.append([
            ' '.join(hypo),
            get_log_proba(hypo, grams=grams, order=order),
            len(hypo),
        ])
    d = pd.DataFrame(scores, columns=['text', 'lm', 'n'])
    d['leven'] = d.text.apply(
        lambda x: textdistance.levenshtein(x, ' '.join(toks)))
    # each edit adds log(leven_penalty) (a negative value) to the LM score
    d['penalty'] = np.log(leven_penalty) * d.leven
    d['score'] = d.lm + d.penalty
    d.sort_values('score', ascending=False, inplace=True)
    return d
def assignRemoveAttributesTitles(self):
    """
    If there are columns in the csv files that are attributes of a larger
    class, this function will group them in the appropriate dictionary.

    A column is considered an attribute of another column if its title has
    the following structure: columnname_attributename
    E.g. verb_translation, given that there is a separate column entitled
    'verb'.
    """
    primary = set()
    attribute = {}
    headers = set()
    for col_name in self.headers:
        if self.delimiter not in col_name:
            headers.add(col_name)
    if self.order:
        primary = set(self.order)
        for o in primary:
            if o not in headers:
                print("------------------")
                print("[FATAL ERROR]\n{} - Invalid column name. "
                      "Check order.csv file.".format(o))
                # suggest near-miss column names (within two edits)
                misspelt = [h for h in headers
                            if textdistance.levenshtein(o, h) <= 2]
                if misspelt:
                    print("Did you mean: {}?\n".format(", ".join(misspelt)))
                print("------------------")
                sys.exit()
    else:
        primary = headers
    for col_name in self.headers:
        if self.delimiter in col_name:
            prim_attr = col_name.split(self.delimiter)
            if prim_attr[0] in primary:
                attribute[col_name] = {
                    "primary": prim_attr[0],
                    "attribute": prim_attr[1],
                }
    return primary, attribute
def heuristic_renames(vcs_system_id, revision_hash):
    """Return the most probable rename from all FileActions; the rest count as DEL/NEW.

    There may be multiple renames of the same file in the same commit, e.g.,
    A->B, A->C. This is due to pygit2 and the Git heuristic for rename
    detection. This function uses another heuristic to detect renames by
    employing a string distance metric on the file name. This captures things
    like commons-math renames org.apache.math -> org.apache.math3.

    :param vcs_system_id: vcs system of the commit
    :param revision_hash: revision hash of the commit for which the renames are determined
    :return: Tuple of renames and added files. The renames are a list of
        tuples, where the first element in the tuple is the old name and the
        second element is the new name. The added files are a list.
    """
    renames = {}
    commit = Commit.objects(vcs_system_id=vcs_system_id,
                            revision_hash=revision_hash).only('id').get()
    for fa in FileAction.objects(commit_id=commit.id, mode='R'):
        new_file = File.objects.get(id=fa.file_id)
        old_file = File.objects.get(id=fa.old_file_id)
        if old_file.path not in renames.keys():
            renames[old_file.path] = []
        renames[old_file.path].append(new_file.path)

    true_renames = []
    added_files = []
    for old_file, new_files in renames.items():
        # only one file, easy
        if len(new_files) == 1:
            true_renames.append((old_file, new_files[0]))
            continue
        # multiple files, find the best match by edit distance
        min_dist = float('inf')
        probable_file = None
        for new_file in new_files:
            d = levenshtein(old_file, new_file)
            if d < min_dist:
                min_dist = d
                probable_file = new_file
        true_renames.append((old_file, probable_file))
        for new_file in new_files:
            if new_file == probable_file:
                continue
            added_files.append(new_file)
    return true_renames, added_files
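# Standalone sketch of the tie-breaking rule in heuristic_renames
# (hypothetical paths): among the candidate new paths, pick the one with the
# smallest Levenshtein distance to the old path.
from textdistance import levenshtein

old = 'src/org/apache/math/Solver.java'
candidates = ['src/org/apache/math3/Solver.java',
              'src/org/apache/util/Helper.java']
print(min(candidates, key=lambda new: levenshtein(old, new)))
# -> 'src/org/apache/math3/Solver.java'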
def listeSimilaire(request, Dico):
    liste = []
    taille = len(request)
    compteur = 1
    for motCle in request:
        # progress bar
        progress_bar_test.print_progress_bar(
            compteur, taille,
            prefix='Search word processed: ' + str(compteur) + '/' + str(taille),
            suffix='')
        compteur = compteur + 1
        if len(motCle) > 2:
            for key in Dico.keys():
                if textdistance.levenshtein(motCle, key) < 2:
                    liste.append(key)
        else:
            liste.append(motCle)
    return liste
def findClosest(txt, collection, collection_transformer=None, max_chars=None):
    best_score = float('inf')
    best_item = None
    for item in collection:
        if collection_transformer is not None:
            item = collection_transformer(item)
        item_comp = item
        if max_chars is not None:
            item_comp = item[:max_chars]
        d = textdistance.levenshtein(txt, item_comp)
        if d < best_score:
            best_score = d
            best_item = item
    return best_item, best_score
def sequenceDistance(dfEnsp, ref_dic, newcolresult, hamming, hammingNorm,
                     levenshtein, levenshteinNorm):
    res = []
    ham = []
    hamnorm = []
    lev = []
    levnorm = []
    serSeq = dfEnsp['proSequence'].copy()
    serID = dfEnsp['stableID_key'].copy()
    for inx, val in serSeq.items():
        p = str(val).strip()
        idd = str(serID[inx])
        # look up the canonical peptide sequence in the reference dict
        mypep = ref_dic[idd]
        if mypep == p:
            # identical to canonical
            res.append('True')
            ham.append('identical')
            hamnorm.append('identical')
            lev.append('identical')
            levnorm.append('identical')
        else:
            # not identical to canonical
            res.append('False')
            # Hamming distance penalizes positional differences (edit based)
            ham.append(textdistance.hamming(mypep, p))
            # normalized Hamming = mismatched positions / length of the longer sequence
            hamnorm.append(textdistance.hamming.normalized_distance(mypep, p))
            # Levenshtein is edit based but does not penalize position:
            # an insertion at position 1 is just 1 difference
            lev.append(textdistance.levenshtein(mypep, p))
            levnorm.append(
                textdistance.levenshtein.normalized_distance(mypep, p))
    dfEnsp.loc[:, newcolresult] = res
    dfEnsp.loc[:, hamming] = ham
    dfEnsp.loc[:, hammingNorm] = hamnorm
    dfEnsp.loc[:, levenshtein] = lev
    dfEnsp.loc[:, levenshteinNorm] = levnorm
    return dfEnsp
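# Why sequenceDistance records both metrics: Hamming penalizes every shifted
# position, while Levenshtein charges only for the single insertion
# (hypothetical peptide fragments).
import textdistance

print(textdistance.hamming('MKWVT', 'AMKWVT'))      # 6: every position shifted
print(textdistance.levenshtein('MKWVT', 'AMKWVT'))  # 1: one insertion at the front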
def inner_duplicates(hList):
    # map each holiday to its lowercase word set
    set_list = {h: set(h.lower().split()) for h in hList}
    new_list = []  # output list
    banned = set()
    for i, h1 in enumerate(hList):
        if h1 in banned:
            continue
        s1 = set_list[h1]
        matches = []
        for h2 in hList[i + 1:]:
            s2 = set_list[h2]
            intersection = s1.intersection(s2)
            difference = s1.symmetric_difference(s2)
            # accounts for single-word differences ("National" added or not)
            # and spelling differences (apostrophes)
            if len(intersection) > 1 and 1 <= len(difference) <= 2:
                if len(difference) > 1:
                    # checking edit distance changed the dataset from 5366
                    # entries to 5588 entries; only a match if the two
                    # strings are very similar
                    if levenshtein(h1.lower(), h2.lower()) < 3:
                        matches.append(h2)
                else:
                    # difference == 1 means just one word was added or
                    # removed, so they match
                    matches.append(h2)
        if matches:
            matches.append(h1)
            matches.sort(key=lambda x: len(x), reverse=True)  # longest first
            new_list.append(matches[0])  # the longest title is kept
            for m in matches:
                banned.add(m)
        else:
            new_list.append(h1)  # no matches, h1 is unique, keep it
    return new_list
def rank_primers(df):
    df['levenshtein'] = df.apply(lambda x: levenshtein(x.primer, x.parent),
                                 axis=1)
    df['gc_content'] = df.apply(lambda x: gc_content(x.primer), axis=1)
    df['gc_clamp'] = df.apply(lambda x: gc_clamp(x.primer), axis=1)
    df['gc_balance'] = abs(df.gc_content - 0.5)
    df.sort_values(['gc_balance', 'levenshtein', 'gc_clamp'],
                   ascending=[True, True, False], inplace=True)
    df.drop('gc_balance', axis='columns', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
def Seq_StringDistance(str_seq, str_ref, method="hamming"):
    # Note: the method names must be compared with ==, not `is`
    # (identity comparison of strings is unreliable).
    methods = {
        "hamming": textdistance.hamming,
        "levenshtein": textdistance.levenshtein,
        "damerau_lev": textdistance.damerau_levenshtein,
        "j-winkler": textdistance.jaro_winkler,
        "smith-waterman": textdistance.smith_waterman,
        "jaccard": textdistance.jaccard,
        "sorensen-dice": textdistance.sorensen_dice,
        "tversky": textdistance.tversky,
        "tanimoto": textdistance.tanimoto,
        "cosine": textdistance.cosine,
        "ratcliff": textdistance.ratcliff_obershelp,
        "bwt": textdistance.bwtrle_ncd,
    }
    dist = methods[method]
    return [dist(str_seq_i, str_ref) for str_seq_i in str_seq]
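# Hedged usage sketch of the dispatch above (hypothetical sequences).
seqs = ['ACGT', 'AGGT', 'TTTT']
print(Seq_StringDistance(seqs, 'ACGT', method='levenshtein'))  # [0, 1, 3]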
# In[9]:

hamming = textdistance.Hamming(external=False)
hamming('text', 'testit')

# # Levenshtein
#
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[10]:

textdistance.levenshtein('arrow', 'arow')

# In[11]:

textdistance.levenshtein.normalized_similarity('arrow', 'arow')

# # Jaro Winkler
#
# https://itnext.io/string-similarity-the-basic-know-your-algorithms-guide-3de3d7346227

# In[12]: