def group_clone_VJ_cdr3(dico_same_VJ, dicoSeq, Clone_threshold):
    VJ_ID_diff_CDR3 = {}
    for VJ_ID in dico_same_VJ.keys():
        VJ_ID_diff_CDR3[VJ_ID] = {}
        for seq in dico_same_VJ[VJ_ID]:
            sub_group_dist = {}
            CDR3_seq = dicoSeq[seq.rstrip()][2]
            if len(VJ_ID_diff_CDR3[VJ_ID]) != 0:
                for g in VJ_ID_diff_CDR3[VJ_ID].keys():
                    if len(CDR3_seq) == len(g):
                        # Same length: normalised Hamming similarity.
                        similarity = 1 - hamming_distance(CDR3_seq, g) / float(len(g))
                        if similarity >= Clone_threshold:
                            sub_group_dist[g] = ('+', similarity)
                    elif dicoSeq[seq.rstrip()][1].split("*")[0][-1] == "6":
                        # Different lengths on a *6 gene: normalised Levenshtein similarity.
                        length = max(len(CDR3_seq), len(g))
                        similarity = 1 - levenshtein_distance(CDR3_seq, g) / float(length)
                        if similarity >= Clone_threshold:
                            sub_group_dist[g] = ('+', similarity)
                if not sub_group_dist:
                    # No existing sub-group is close enough: start a new one.
                    VJ_ID_diff_CDR3[VJ_ID][CDR3_seq] = [seq]
                else:
                    # Assign the sequence to the best-scoring sub-group,
                    # ranked by global pairwise protein alignment score.
                    dist_loc = {}
                    for key in sub_group_dist.keys():
                        seqs = [skbio.Protein(CDR3_seq, metadata={'id': "CDR3_seq"}),
                                skbio.Protein(key, metadata={'id': "key"})]
                        msa = skbio.alignment.global_pairwise_align_protein(
                            seqs[0], seqs[1], 25)
                        dist_loc[key] = float(msa[1])
                    best_corresp = max(dist_loc.items(), key=operator.itemgetter(1))[0]
                    VJ_ID_diff_CDR3[VJ_ID][best_corresp].append(seq)
            else:
                VJ_ID_diff_CDR3[VJ_ID][CDR3_seq] = [seq]
    return VJ_ID_diff_CDR3

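# `hamming_distance` is called above but not defined in this snippet. A
# minimal sketch, assuming the conventional definition (number of
# mismatching positions between two equal-length strings):
def hamming_distance(s1, s2):
    if len(s1) != len(s2):
        raise ValueError("hamming_distance requires equal-length strings")
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))
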
def find_city(city1, city2):
    # 'U' marks an unknown value: neither a match nor a mismatch.
    if city1 == 'U' or city2 == 'U':
        return 1
    elif levenshtein_distance(city1, city2) <= 1:
        return 0
    else:
        return -1

def find_zipcode(zip1, zip2):
    if zip1 == 'U' or zip2 == 'U':
        return 1
    elif levenshtein_distance(str(zip1), str(zip2)) <= 1:
        return 0
    else:
        return -1

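# These find_* comparators share a tri-state convention: 1 = unknown ('U' on
# either side), 0 = match (edit distance <= 1), -1 = mismatch. A usage sketch,
# assuming `levenshtein_distance` from e.g. the `jellyfish` package:
# >>> find_city('Boston', 'Bostin')
# 0
# >>> find_zipcode('02139', 'U')
# 1
# >>> find_city('Boston', 'Chicago')
# -1
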
def function_for_remove_duplicates(self, similar_column='full_name', threshold=2):
    """
    Remove duplicate people records from the data.

    :param similar_column: column in which to look for duplicates
    :param threshold: maximum Levenshtein distance between two values
        for them to count as duplicates
    :return: data without duplicates in similar_column
    """
    dupl_indexes = []
    rows_number = self.unique_data.shape[0]
    for i in tqdm(range(rows_number - 1)):
        distances = np.array([
            levenshtein_distance(self.unique_data[similar_column].values[i],
                                 self.unique_data[similar_column].values[j])
            for j in range(i + 1, rows_number)
        ])
        matching_indexes = np.where(distances <= threshold)[0] + i + 1
        d_b1 = self.unique_data['date_of_birth'].iloc[i]
        # Only treat rows as duplicates when the dates of birth also agree.
        dupl_indexes += [
            self.unique_data.index[match] for match in matching_indexes
            if self.unique_data['date_of_birth'].iloc[match] == d_b1
        ]
    # Deduplicate the collected labels first; dropping the same label twice,
    # as the original row-by-row loop could, raises a KeyError.
    return self.unique_data.drop(index=set(dupl_indexes))

def create_random_pairs(positive_instances, positive_pairs_all_datasets, existing_negatives):
    random.seed(42)
    # holds the Levenshtein distance of each concept pair
    distances = []
    # tracks already created negative pairs as tuples, i.e. (l1, l2),
    # to avoid duplicate creation
    new_negative_pairs = []
    for i, row in tqdm(positive_instances.iterrows(), total=positive_instances.shape[0]):
        label1 = row['source']
        # Initialise the random index to i so the loop body runs at least once;
        # the short-circuiting `or` guarantees label2 is assigned before it is read.
        random_index = i
        # make sure that no term pair duplicates or reverse duplicates are
        # created, comparing against both positive and negative concept pairs
        while random_index == i or \
                is_existing_pair(positive_pairs_all_datasets, label1, label2) or \
                is_existing_pair(existing_negatives, label1, label2) or \
                (label1, label2) in new_negative_pairs or \
                (label2, label1) in new_negative_pairs or \
                label1.lower() == label2.lower():
            # choose a new random index and source vs target and get a new pairing term
            random_index = random.randint(0, positive_instances.shape[0] - 1)
            source_or_target = random.choice(['source', 'target'])
            label2 = positive_instances.loc[random_index][source_or_target]
        distances.append(levenshtein_distance(label1.lower(), label2.lower()))
        new_negative_pairs.append((label1, label2))
    return new_negative_pairs, distances

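# `is_existing_pair` is referenced above but not defined in this snippet. A
# minimal sketch, under the assumption that `pairs` is a DataFrame with
# 'source' and 'target' columns, checking both orientations of the pair:
def is_existing_pair(pairs, label1, label2):
    forward = ((pairs['source'] == label1) & (pairs['target'] == label2)).any()
    backward = ((pairs['source'] == label2) & (pairs['target'] == label1)).any()
    return forward or backward
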
def lambda_handler(event, context):
    """Get Levenshtein Distance Lambda Handler

    Parameters
    ----------
    event: dict, required
        API Gateway Lambda Proxy Input Format

        Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    -------
    API Gateway Lambda Proxy Output Format: dict

        Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html
    """
    return {
        "statusCode": 200,
        "body": json.dumps(
            {"result": levenshtein_distance(event['nombre'], event['x'])}),
    }

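# A quick local invocation sketch; the event keys 'nombre' and 'x' come from
# the handler above, and `context` is unused, so None stands in for it:
# >>> lambda_handler({'nombre': 'kitten', 'x': 'sitting'}, None)
# {'statusCode': 200, 'body': '{"result": 3}'}
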
def levenshtein_distance_norm(str1, str2):
    '''
    Normalised edit distance; lower is better.
    @return [0, 1]
    '''
    max_len = max(len(str1), len(str2), 1)
    return levenshtein_distance(str1, str2) / max_len

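# Usage sketch for the normalised distance above; identical strings score 0,
# completely different equal-length strings score 1:
# >>> levenshtein_distance_norm('abc', 'abc')
# 0.0
# >>> levenshtein_distance_norm('abc', 'xyz')
# 1.0
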
def _parse_comorbs_sql(self, comorbs: pd.DataFrame, conflicts: str, edit_threshold: int):
    for x in comorbs.comorb_name.unique():
        if x not in self._comorb_keys:
            # A key is "similar" when its edit distance to x is at most the
            # threshold (the original compared with >=, which selected the
            # dissimilar keys instead).
            similar = list(
                filter(lambda k: levenshtein_distance(k, x) <= edit_threshold,
                       self._comorb_keys))
            if len(similar) > 1:
                err = f"{x} conflicts with more than one comorbidity key: {similar}"
                if conflicts == "raise":
                    raise ValueError(err)
                warn(f"{err}; ignoring conflict, comorbidity {x} will be skipped")
                comorbs = comorbs[comorbs.comorb_name != x]
            elif len(similar) == 1:
                err = f"{x} conflicts with existing comorbidity {similar[0]}"
                if conflicts == "merge":
                    warn(f"{err}; merging conflict with existing value")
                    # Only remap the conflicting name, not every row.
                    comorbs["comorb_name"] = comorbs["comorb_name"].apply(
                        lambda name: similar[0] if name == x else name)
                elif conflicts == "raise":
                    raise ValueError(err)
                else:
                    warn(f"{err}; ignoring conflict, comorbidity {x} will be skipped")
            else:
                self._comorb_keys.append(x)
                curr = self._config.db_connection.cursor()
                # DB-API parameters must be passed as a sequence.
                curr.execute("INSERT INTO ComorbKey (comorb_name) VALUES (?)", (x,))
                self._config.db_connection.commit()
    return comorbs

def calc_batch_levenshtein(predicted_word_idx, targets, vocab: Vocabulary, verbose=False):
    predicted_word_idx = list(predicted_word_idx.cpu().numpy())
    targets = list(targets.cpu().numpy())
    distances = []
    for index, predicted_sentence_word_idx in enumerate(predicted_word_idx):
        sentence_to_str = TrainingUtils.word_idx_to_caption_sentence(
            predicted_sentence_word_idx, vocab)
        target_sentence = TrainingUtils.word_idx_to_caption_sentence(
            targets[index], vocab)
        levenshtein_metric = levenshtein_distance(target_sentence, sentence_to_str)
        distances.append(levenshtein_metric)
        if verbose:
            print(f'\nPredicted: {sentence_to_str}')
            print(f'Target: {target_sentence}')
            print(f'Levenshtein distance: {levenshtein_metric}\n')
            # Only print the first example of the batch.
            verbose = False
    return np.mean(distances)

def find_DOB(dob1, dob2):
    # Same tri-state convention as find_city/find_zipcode above.
    if dob1 == 'U' or dob2 == 'U':
        return 1
    elif levenshtein_distance(dob1, dob2) <= 1:
        return 0
    else:
        return -1

def address(a1, a2):
    if a1 == a2:
        return True
    if not a1 or not a2:
        return False
    a1 = a1.lower()
    a2 = a2.lower()
    return levenshtein_distance(a1, a2) <= 3

def vendor(v1, v2):
    if v1 == v2:
        return True
    if not v1 or not v2:
        return False
    v1 = v1.lower()
    v2 = v2.lower()
    # A threshold of 0 makes this an exact (case-insensitive) match.
    return levenshtein_distance(v1, v2) <= 0

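# The two matchers differ only in tolerance: addresses allow up to three
# edits, vendor names must match exactly after lowercasing.
# >>> address('12 Main St.', '12 main st')
# True
# >>> vendor('ACME Corp', 'Acme Corp.')
# False
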
def img2txt2(imgName, textFile, outfile):
    # read image
    img = cv2.imread(imgName)
    # ground-truth text
    realText = ' '.join(getText(textFile))
    # segment characters from the image
    print("Segmenting image to characters ...")
    segmentedChars = img2Chars(img)
    # load the model
    model = loadModel(MODEL_NAME)
    f = open(outfile, 'w')
    allText = ""
    for i in tqdm(range(len(segmentedChars))):
        currentWord = ""
        for c in segmentedChars[i][2]:
            prediction = model.predict([prepareCharImg(c)])
            char = chars_decode[prediction[0]]
            currentWord += char
            allText += char
        allText += ' '
        f.write(currentWord)
        f.write(' ')
    f.close()
    with open("hello.txt", 'w') as f:
        f.write(realText)
        f.write("\n\n\n\n")
        f.write(allText)
    print("\n\n accuracy:")
    print(len(allText), len(realText))
    # character error rate over the whole text
    print(levenshtein_distance(realText, allText) / len(realText))
    print("another accuracy")
    realList = realText.split(' ')
    allList = allText.split(' ')
    errors = 0
    # iterate over the shorter list so a length mismatch cannot raise IndexError
    for i in range(min(len(realList), len(allList))):
        errors += levenshtein_distance(realList[i], allList[i])
    print(errors / len(realText))

def _get_similarity_score_levenshtein(self, new_words):
    # Levenshtein-based similarity: total edits across word pairs,
    # normalised by the combined length of both word lists.
    d = sum([
        levenshtein_distance(self.words[idx], new_words[idx])
        if self.words[idx] != '' else 0
        for idx in range(self.nwords)
    ])
    levenshtein_score = 1 - d / (sum(self._wordlens) +
                                 sum([len(w) for w in new_words]))
    return levenshtein_score

def getTranslationNear(msgid_to_search, percent):
    max_word_diff = 1 + int(percent * len(msgid_to_search))
    possible_solutions = list()
    for msgid in datastore:
        dist = levenshtein_distance(msgid, msgid_to_search)
        if dist < max_word_diff:
            possible_solutions.append((dist, datastore[msgid]))
    # closest matches first
    possible_solutions.sort(key=lambda x: x[0])
    return possible_solutions

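# Usage sketch, under the assumption that `datastore` is a module-level dict
# mapping msgids to translations (the entries below are illustrative):
# datastore = {'Open file': 'Ouvrir le fichier', 'Open folder': 'Ouvrir le dossier'}
# >>> getTranslationNear('Open fil', 0.3)
# [(1, 'Ouvrir le fichier')]
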
def cer(self, s1, s2):
    """
    Computes the Character Error Rate, defined here as the edit distance
    between the two sentences with all spaces removed.

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    """
    s1, s2 = s1.replace(' ', ''), s2.replace(' ', '')
    return levenshtein_distance(s1, s2)

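# Note that this returns the raw edit distance, not a rate; callers are
# expected to normalise by reference length. A usage sketch, with `decoder`
# as a hypothetical instance of the enclosing class:
# >>> decoder.cer('the cat', 'the bat')   # compares 'thecat' vs 'thebat'
# 1
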
def products(p1, p2):
    n1 = p1['name'].lower()
    n2 = p2['name'].lower()
    if levenshtein_distance(n1, n2) <= 0:
        # names match exactly: compare price and amount
        try:
            price1 = float(p1['price'].replace(',', '.'))
        except Exception:
            return False
        if util.floatCompare(price1, float(p2['price'])):
            a1 = p1['amount']
            a2 = p2['amount']
            return a1 == a2
        return False
    # The original repeated the distance check here and returned True, but
    # that branch was unreachable: the block above always returns on a match.
    return False

def get_distance(DataLoaderContainer, y_pred, y):
    # greedy decoding: take the most likely character at each step
    y_greedy = torch.max(y_pred, dim=1)[1]
    y_pred_char = ''.join([
        DataLoaderContainer.index_to_char[idx]
        for idx in y_greedy.detach().cpu()
    ])
    y_true_char = ''.join(
        [DataLoaderContainer.index_to_char[idx] for idx in y.detach().cpu()])
    return levenshtein_distance(y_pred_char, y_true_char)

def evaluate_text(message, goal_text, verbose=VERBOSE):
    """Given a Message and a goal_text string, return the Levenshtein distance
    between the Message and the goal_text as a length-1 tuple.

    If verbose is True, print each Message as it is evaluated.
    """
    distance = levenshtein_distance(message.get_text(), goal_text)
    if verbose:
        print("{msg!s}\t[Distance: {dst!s}]".format(msg=message, dst=distance))
    return (distance, )  # length-1 tuple, required by DEAP

def getNearestWord(word, wordsList):
    # linear scan for the entry in wordsList with the smallest edit distance
    bestWord = word
    distance = 1000000000
    for w in wordsList:
        d = levenshtein_distance(word, w)
        if d < distance:
            distance = d
            bestWord = w
    return bestWord

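# Usage sketch, e.g. for correcting a word against a small dictionary:
# >>> getNearestWord('kitten', ['mitten', 'sitting', 'smitten'])
# 'mitten'
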
def getDistance(self):
    with open(self.ipFile) as f:
        lines = f.readlines()
    print(lines[0])
    print(lines[1])
    self.distance = levenshtein_distance(lines[0].strip(), lines[1].strip())
    with open(self.opFile, 'w') as fout:
        fout.write(str(self.distance))

def calculateDistance(row1, row2):
    total = 0  # renamed from `sum`, which shadowed the builtin
    for var in explVariables:
        if var not in stringFields:
            total += (row1[var] - row2[var]) ** 2
        elif var == 'Review Title':
            # normalised string similarity for the title field
            total += (1 - levenshtein_distance(row1[var], row2[var]) /
                      max(len(row1[var]), len(row2[var])))
        elif var == 'titleSentiment' and row1[var] == row2[var]:
            total += 1
    return math.sqrt(total)

def find_diff(line, input):
    # scan the remaining lines for one exactly one edit away from `line`
    next_line = 1
    while next_line < len(input):
        if levenshtein_distance(line, input[next_line]) == 1:
            return line
        next_line += 1
    return "None"

def process_message(msg, tabs, min_levenshtein_ratio, test_mode_prefix=False):
    for code in tabs:
        msgtr = translate(code, msg)
        dist = levenshtein_distance(msg, msgtr)
        ratio = dist / len(msg)
        if ratio > min_levenshtein_ratio:
            print(" code=%s ratio=%lf => %s" % (code, ratio, msgtr))
            if test_mode_prefix:
                msgtr = test_mode_prefix + msgtr
            return msgtr
    return None

def crossref_is_similar(cr_info, bib_info, max_levenshtein_distance):
    is_similar = False
    if cr_parser.has_title(cr_info):
        entry_title = bib_parser.get_title(bib_info)
        entry_title = cleaner.clean_braces(entry_title)
        crossref_title = cr_parser.get_title(cr_info)
        lev_distance = levenshtein_distance(crossref_title, entry_title)
        if lev_distance <= max_levenshtein_distance:
            is_similar = True
    return is_similar

def compare_address(address1, address2):
    if address1 == 'U' or address2 == 'U':
        return 1
    levenshtein_sum = 0
    min_street = min(len(address1), len(address2))
    for x in range(min_street):
        # if we are comparing the last word, compare only the shared prefix
        if x == min_street - 1:
            min_word_length = min(len(address1[x]), len(address2[x]))
            temp1 = address1[x][:min_word_length]
            temp2 = address2[x][:min_word_length]
            # the original computed this distance but discarded it; accumulate it
            levenshtein_sum += levenshtein_distance(temp1, temp2)
        else:
            levenshtein_sum += levenshtein_distance(address1[x], address2[x])
    if levenshtein_sum == 0:
        return 0
    if min_street / levenshtein_sum < 1:
        return -1
    else:
        return 0

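# Usage sketch, assuming the addresses arrive as lists of words to match the
# word-by-word comparison above:
# >>> compare_address(['12', 'main', 'street'], ['12', 'main', 'st'])
# 0
# >>> compare_address(['12', 'main', 'street'], 'U')
# 1
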
def filter_name(raw_name, list_name):
    ld_list = [levenshtein_distance(e, raw_name) for e in list_name]
    min_ld = min(ld_list)
    filtered_name = "?"
    if min_ld > 2:
        name_length = len(raw_name)
        if name_length >= 30:
            print(colored(
                "[1] %s (= %s) min_ld = %d" %
                (raw_name, list_name[ld_list.index(min_ld)], min_ld), 'yellow'))
            # the name might be truncated: compare against prefixes instead
            ld_list = [
                levenshtein_distance(e[0:name_length], raw_name)
                for e in list_name
            ]
            min_ld = min(ld_list)
            if min_ld > 4:
                print(colored(
                    "[2] %s (= %s) min_ld = %d" %
                    (raw_name, list_name[ld_list.index(min_ld)], min_ld), 'red'))
            else:
                print(colored(
                    "[2] %s (= %s) min_ld = %d" %
                    (raw_name, list_name[ld_list.index(min_ld)], min_ld), 'green'))
                filtered_name = list_name[ld_list.index(min_ld)]
        else:
            print(colored(
                "[1] %s (= %s) min_ld = %d" %
                (raw_name, list_name[ld_list.index(min_ld)], min_ld), 'red'))
    else:
        filtered_name = list_name[ld_list.index(min_ld)]
    return filtered_name

def calculateErrorRate(groundTruthPath="NewDataset/text/",
                       predictedPath="OutputTextFiles/",
                       statsFile="CER.txt"):
    with open(statsFile, 'w') as f:
        files = os.listdir(predictedPath)
        totalError = 0
        for file in files:
            realText = ' '.join(getText(groundTruthPath + file))
            predictedText = ' '.join(getText(predictedPath + file))
            # character error rate: edit distance normalised by reference length
            error = levenshtein_distance(realText, predictedText) / len(realText)
            totalError += error
            f.write(f"file: {file}\t\tCER: {error}\n")
        totalError /= len(files)
        f.write(f"Total CER: {totalError}")

def is_similar(value, strings, settings):
    """
    Checks whether a string is similar to one in a set of strings.

    :param value: string to test
    :param strings: iterable of strings to compare against
    :param settings: dict whose "s" entry divides len(value) to give the
        distance threshold
    :return: True if any string is within the threshold
    """
    for s in strings:
        if levenshtein_distance(value, s) < (len(value) / settings["s"]):
            return True
    return False

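# Usage sketch: with settings["s"] == 3, a 9-character value tolerates edit
# distances below 3:
# >>> is_similar('catalogue', ['catalog', 'dialogue'], {"s": 3})
# True
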
def filtered_similars(wordlist, forbidden):
    unfiltered = similar_list(wordlist)
    result = []
    for s, dist in unfiltered:
        ok = True
        for parent in forbidden:
            # drop candidates too close to a forbidden word, or containing one
            if levenshtein_distance(parent, s) <= LEVENSHTEIN_THRESHOLD or parent in s:
                ok = False
        if ok:
            result.append((s, dist))
    return result

def levenshtein(a, b):
    # normalised similarity in [0, 1]; 1 means identical
    # (the extra 1 in max() guards against two empty strings)
    d = levenshtein_distance(a, b) / max(len(a), len(b), 1)
    return 1 - d

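# Usage sketch for the similarity wrapper above:
# >>> levenshtein('kitten', 'sitting')   # 1 - 3/7
# 0.5714285714285714
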
def _execute(self, str1, str2):
    LDAlgorithm._execute(self, str1, str2)
    return levenshtein_distance(str1, str2)