def get_celldifference(self):
    """Compare each stored line with later lines of the same script in other cells.

    Reads ``(script, cell, line_content)`` rows from the ``ipython`` table of
    ``ipython.db``. Every row is paired with each row that follows it; a pair
    is scored when both rows share the script but come from different cells.
    The score is whatever ``self.cell_analysis`` returns for the two content
    lengths and their Levenshtein distance.

    Returns:
        dict mapping ``"<script>: <cell>"`` to the list of scores collected
        for rows with that script/cell (an empty list when nothing matched).
    """
    conn = sqlite3.connect('ipython.db')
    try:
        cursor = conn.cursor()
        rows = list(cursor.execute(
            'SELECT script, cell, line_content FROM ipython '))
    finally:
        # The original never closed the connection.
        conn.close()

    cell_differences = {}
    for position, (script, cell, content) in enumerate(rows):
        line_diff = []
        # Only rows after the current one are visited, so a row is never
        # compared with itself and each pair is scored once.
        for other_script, other_cell, other_content in rows[position + 1:]:
            if script == other_script and cell != other_cell:
                difference = distance.levenshtein(content, other_content)
                line_diff.append(
                    self.cell_analysis(len(content), len(other_content),
                                       difference))
        repo_cell = str(script) + ": " + str(cell)
        cell_differences.setdefault(repo_cell, []).extend(line_diff)
    return cell_differences
def _mean_max_levenshtein(example_items, answer_items):
    """Average, over example items, of the largest distance to any answer item.

    Returns None when there are no example items (nothing to average).
    """
    if not example_items:
        return None
    total = 0
    for example_item in example_items:
        # 0 is the floor, matching the original accumulator's start value.
        total += max([0] + [distance.levenshtein(example_item, answer_item)
                            for answer_item in answer_items])
    return total / len(example_items)


def kskip_ngram_cuoccurrence(answer, example, ng, kskip, question_index):
    """Accumulate a Levenshtein-based score over unigrams and k-skip n-grams.

    For every n from 1 to ``ng`` the items of ``example`` (words for n == 1,
    skip-grams otherwise) are each scored with the largest Levenshtein
    distance to any item of ``answer``; the mean of those scores is added to
    the result.

    Args:
        answer: sequence of answer tokens.
        example: sequence of example tokens.
        ng: highest n-gram order to include.
        kskip: skip distance passed to ``skipgrams``.
        question_index: printed for diagnostics when ``example`` is empty.

    Returns:
        The accumulated score (0 when nothing could be scored).
    """
    distance_over_ngrams = 0

    # nltk skipgrams function doesn't include unigrams, so they are
    # calculated separately.
    unigram_score = _mean_max_levenshtein(example, answer)
    if unigram_score is None:
        # Explicit empty check replaces the original bare ``except``.
        print("zero")
        print(question_index)
    else:
        distance_over_ngrams += unigram_score

    for n in range(2, ng + 1):
        answer_ngramed = list(skipgrams(answer, n, kskip))
        example_ngramed = list(skipgrams(example, n, kskip))
        ngram_score = _mean_max_levenshtein(example_ngramed, answer_ngramed)
        if ngram_score is not None:
            distance_over_ngrams += ngram_score
    return distance_over_ngrams
def run_cal():
    """Fill ``movies_moviesimilar`` with pairwise similarity scores.

    Loads every row of ``movies_movieinfo`` and, for the first 1% of rows,
    compares each against all later rows. The distance of a pair is the sum
    of the Levenshtein distances of columns 1, 3, 4, 5 and 6; the stored
    similarity is the reciprocal of that sum.

    Identical rows (distance 0) are stored with similarity 1.0 instead of
    raising ZeroDivisionError, and the division is always floating point.
    """
    db = MySQLdb.connect(host="localhost",
                         port=3308,
                         user="******",
                         passwd="你的密码",
                         db="MovieSizer",
                         charset="utf8")
    try:
        cursor = db.cursor()
        cursor.execute("select * from movies_movieinfo")
        res = cursor.fetchall()
        row_id = 1
        print('\n\n cal_similar_gry.py\n\n')
        insert_sql = "INSERT INTO movies_moviesimilar VALUES (%s, %s, %s, %s)"
        compared_columns = (1, 3, 4, 5, 6)
        # Limit the matrix size: only the first 1% of rows start a comparison.
        for i in range(0, int(len(res) / 100)):
            for j in range(i + 1, len(res)):
                length = sum(
                    distance.levenshtein(res[i][col], res[j][col])
                    for col in compared_columns)
                similar = 1.0 / max(length, 1)
                # Let the driver quote the values instead of formatting SQL.
                cursor.execute(insert_sql,
                               (row_id, res[i][0], res[j][0], similar))
                db.commit()
                row_id += 1
            print('current : ', i)
    finally:
        # Close the connection even when a query fails.
        db.close()
    print('DONE !')
def testLD(self):
    """Check Levenshtein distance on near-identical sentences and its symmetry."""
    str1 = "hello my name is jose"
    str2 = "hello m yname is jose"
    str3 = "hlelo my name is jose"
    # assertEqual replaces the assertEquals alias removed in Python 3.12.
    self.assertEqual(distance.levenshtein(str1, str2), 2)
    self.assertEqual(distance.levenshtein(str1, str3), 2)
    self.assertEqual(distance.levenshtein(str1, str2),
                     distance.levenshtein(str2, str1))
def valid_text(text):
    """Extract a recognised code from ``text`` after cleaning it.

    Tries, in order:
      1. a ``ddd-ddd-dd`` pattern anywhere in the text;
      2. ``LPN`` followed by eight digits at the start of the text;
      3. a repaired ``LPN`` code: the first ``L``, ``P`` or ``N`` is located,
         the characters around it are compared with ``'LPN'``, and when the
         Levenshtein distance is at most 2 the prefix is replaced by a clean
         ``'LPN'`` before matching again.

    Returns:
        ``(True, matched_text)`` on success, ``(False, '')`` otherwise.
    """
    text = clean_text(text)
    if text == "":
        return False, ''

    serial = re.search(r'((\d{3})-(\d{3})-(\d{2}))', text)
    if serial is not None:
        found = serial.group()
        print(f"Matched regex 1: {found}")
        return True, found

    lpn_pattern = re.compile(r'(LPN\d{8})')
    direct = lpn_pattern.match(text)
    if direct is not None:
        return True, direct.group()

    if len(text) > 3:
        min_distance_levenshtein = 2
        # (anchor letter, chars kept before it, chars kept from it onward,
        #  chars skipped from the anchor before the digits start)
        repairs = (('L', 0, 2, 3), ('P', 1, 1, 2), ('N', 2, 0, 1))
        for letter, before, after, skip in repairs:
            index = text.find(letter)
            if index < 0:
                continue
            window = text[max(index - before, 0):index + after]
            if distance.levenshtein('LPN',
                                    window) > min_distance_levenshtein:
                continue
            # NOTE(review): the trailing character is dropped ([:-1]) exactly
            # as in the original; confirm that this is intended.
            repaired = lpn_pattern.match('LPN' + text[index + skip:-1])
            if repaired is not None:
                return True, repaired.group()
    return False, ''
def predict_oov_v2(self, n):
    """Replace each masked token with the best of the model's top ``n`` guesses.

    For every index in ``self.masked_index`` the ``n`` highest-scoring
    predicted tokens are compared (case-insensitively) with the word at the
    same index of ``self.original_sentence``; the one with the smallest
    Levenshtein distance is written into the sentence.

    Args:
        n: number of top predictions considered per masked position.

    Returns:
        ``(sentence, score)`` where ``sentence`` is the space-joined token
        list and ``score`` is ``self.get_score`` of that token list. When
        nothing is masked the tokenized text is returned unchanged.
    """
    if len(self.masked_index) == 0:
        return ' '.join(self.tokenized_text), self.get_score(
            self.tokenized_text)

    print('Masked sentence ' + ' '.join(self.tokenized_text) +
          ' and the masked index is ' + str(self.masked_index))
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenized_text)
    segments_ids = [0] * len(self.tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = self.model(tokens_tensor, token_type_ids=segments_tensors)
    predictions_1 = outputs[0]

    # NOTE: this aliases self.tokenized_text, so the replacements below also
    # modify the instance's token list (same as the original behaviour).
    bestsent = self.tokenized_text
    for mei in self.masked_index:
        sorted_index = torch.argsort(predictions_1[0, mei],
                                     descending=True).tolist()
        candidates = [
            self.tokenizer.convert_ids_to_tokens([sorted_index[rank]])[0]
            for rank in range(n)
        ]
        original_word = self.original_sentence[mei].lower()
        bestd = 100
        bestw = ''
        for candidate in candidates:
            # Compute the distance once per candidate.
            candidate_distance = distance.levenshtein(candidate.lower(),
                                                      original_word)
            if candidate_distance < bestd:
                bestd = candidate_distance
                bestw = candidate
        bestsent[mei] = bestw
    print('Recovered sentence ' + ' '.join(bestsent))
    return ' '.join(bestsent), self.get_score(bestsent)
def _token_ratio(reference_tokens, candidate_tokens):
    """Token-level Levenshtein ratio in [0, 1]; 1.0 when both lists are empty."""
    lensum = len(reference_tokens) + len(candidate_tokens)
    if lensum == 0:
        return 1.0
    edits = distance.levenshtein(reference_tokens, candidate_tokens)
    # float() keeps the ratio fractional under Python 2 integer division.
    return (lensum - edits) / float(lensum)


def compare_mt(usertrans, referencetrans, machinetrans):
    """Tell the user whether they beat the machine translation.

    The user and machine translations are each scored against the reference
    with a character-level ratio (``lev.ratio``) and a token-level
    Levenshtein ratio; the better of the two is kept per translation.

    Args:
        usertrans: the user's translation.
        referencetrans: the reference translation.
        machinetrans: the machine translation.

    Returns:
        A randomly chosen message from the 'better', 'worst' or 'same' group.
    """
    evaluation = {'better': ['Congratulations, you did better than the machine translation!',
                             'Be proud, you were better than the machine translation!',
                             'You are the best, even better than the machine translation!'],
                  'same': ['You did as good as the machine translation!',
                           'This is a tie between you and the machine translation!',
                           'The machine translation was as good as you!'],
                  'worst': ["The machine translation beat you, let's try to do better!",
                            "What a shame, you were defeated by the machine translation.",
                            "Next time, you will beat the machine translation, but not this time!"]}

    # encode sentences to UTF-8
    ut = usertrans.encode('utf8')
    tt = referencetrans.encode('utf8')
    mt = machinetrans.encode('utf8')

    # remove punctuation
    # NOTE(review): string.maketrans exists only on Python 2; kept as-is.
    replace_punctuation = string.maketrans(string.punctuation,
                                           ' ' * len(string.punctuation))
    ut = ut.translate(replace_punctuation).lower()
    tt = tt.translate(replace_punctuation).lower()
    mt = mt.translate(replace_punctuation).lower()

    # Levenshtein character ratio
    lev_let_ut = lev.ratio(tt, ut)
    lev_let_mt = lev.ratio(tt, mt)

    # Levenshtein token ratio, both measured against the reference. The
    # original compared the machine output with the user's text instead.
    user_tokens = ut.split()
    reference_tokens = tt.split()
    machine_tokens = mt.split()
    lev_tok_ut = _token_ratio(reference_tokens, user_tokens)
    lev_tok_mt = _token_ratio(reference_tokens, machine_tokens)

    # keep the best ratio for each translation
    ratio_ut = max(lev_let_ut, lev_tok_ut)
    ratio_mt = max(lev_let_mt, lev_tok_mt)

    if ratio_ut > ratio_mt:
        return random.choice(evaluation['better'])
    elif ratio_ut < ratio_mt:
        return random.choice(evaluation['worst'])
    else:
        return random.choice(evaluation['same'])
def compare_sent(self, sent_true: str, sent_pred: str):
    '''
    Measure how far a predicted sentence is from the expected one.

    :param sent_true: expected sentence
    :param sent_pred: predicted sentence
    :return: tuple ``(distance, normalized_distance)`` as computed by
        ``distance.levenshtein`` without and with ``normalized=True``
    '''
    absolute = distance.levenshtein(sent_true, sent_pred)
    relative = distance.levenshtein(sent_true, sent_pred, normalized=True)
    return absolute, relative
def get_closest_word(ww, wset):
    """Return the word of ``wset`` closest to ``ww`` (case-insensitive).

    Closeness is the Levenshtein distance between the lower-cased words. Only
    candidates with a distance below 100 qualify; ties keep the first
    candidate seen.

    Returns:
        The best candidate, or '' when ``wset`` is empty or nothing qualifies.
    """
    bestd = 100
    bestw = ''
    for candidate in wset:
        # Compute the distance once per candidate.
        candidate_distance = distance.levenshtein(candidate.lower(),
                                                  ww.lower())
        if candidate_distance < bestd:
            bestd = candidate_distance
            bestw = candidate
    return bestw
def compare_lex(item1, item2):
    """Comparator ordering items by how well their headline matches ``search_word``.

    Reads the module-level ``search_word``. Returns 1 when ``item1``'s
    "point headline" equals the search word, -1 when ``item2``'s does;
    otherwise 1 if ``item1``'s headline has the strictly smaller Levenshtein
    distance to the search word and -1 if not. It never returns 0.
    """
    first_headline = item1["point headline"]
    if search_word == first_headline:
        return 1
    second_headline = item2["point headline"]
    if search_word == second_headline:
        return -1
    first_is_closer = (levenshtein(first_headline, search_word) <
                       levenshtein(second_headline, search_word))
    return 1 if first_is_closer else -1
def getIDs(name, players, ids):
    """Return the id of the player whose name is closest to ``name``.

    Args:
        name: name to look up.
        players: candidate player names.
        ids: ids aligned index-by-index with ``players``.

    Returns:
        The entry of ``ids`` at the position of the best match (the first
        one on ties). When the best distance exceeds 5, the name and the
        chosen id are printed so the weak match can be reviewed.

    Raises:
        ValueError: if ``players`` is empty. The original silently returned
            ``ids[-1]`` in that case.
    """
    best = 100000
    bestIndex = -1
    for position, player in enumerate(players):
        # One distance call per player, and no O(n) players.index() lookup.
        current = distance.levenshtein(name, player)
        if current < best:
            best = current
            bestIndex = position
    if bestIndex < 0:
        raise ValueError("no players to match %r against" % (name,))
    if best > 5:
        print(name)
        print(ids[bestIndex])
    return ids[bestIndex]
def test_levenshteinBase(self):
    """Check ``distance.levenshtein`` against a table of expected values."""
    # (first string, second string, expected value), checked to 3 places
    cases = [
        ('', '', 0),
        ('abc', 'abc', 0),
        ('abcd', 'abc', 0.25),
        ('abc', 'abd', 0.33333),
        ('ab', 'abc', 0.33333),
        ('', 'abc', 1),
        ('abc', '', 1),
        ('abc', 'xyz', 1),
    ]
    for a, b, expected in cases:
        self.assertAlmostEqual(distance.levenshtein(a, b), expected, places=3)
def calculateSimilarity_WithDistancePackage(createdSentence):
    """Find the closest non-identical training sentence to ``createdSentence``.

    Every sentence of the module-level ``X_train`` is converted to words via
    ``index_to_word`` (its first and last tokens are dropped) and compared
    with ``createdSentence`` using the Levenshtein and Jaccard distances.
    Sentences with a Levenshtein distance of 0 are ignored.

    Args:
        createdSentence: list of words.

    Returns:
        dict with the smallest recorded distances under "Jaccard" and "Levin".

    Raises:
        ValueError: if no training sentence has a non-zero Levenshtein
            distance (``min`` of an empty collection).
    """
    levinDist = {}
    jaccardDist = {}
    bestValues = {}
    for i in range(len(X_train)):
        currSentence = X_train[i]
        # sentence_str is a list of words
        sentence_str = [index_to_word[x] for x in currSentence[1:-1]]
        dist = distance.levenshtein(createdSentence, sentence_str)
        dist2 = distance.jaccard(createdSentence, sentence_str)
        if dist > 0:
            levinDist[i] = dist
            jaccardDist[i] = dist2
    # take the best value; .values() works on Python 2 and 3, unlike
    # the original itervalues()
    levinMin = min(levinDist.values())
    jaccardMin = min(jaccardDist.values())
    print("Best Distance Levinshtein: %f" % (levinMin))
    print("Best Distance Jaccard: %f" % (jaccardMin))
    bestValues["Jaccard"] = jaccardMin
    bestValues["Levin"] = levinMin
    return bestValues
def main(args):
    """Print the edit-distance accuracy of predicted formulas.

    Reads the target and predicted files named by ``process_args(args)``
    line by line, splits each stripped line on single spaces, and reports
    ``1 - total_edit_distance / total_length``, where the length of a pair is
    the longer of its two token lists.
    """
    parameters = process_args(args)
    # Context managers close the files; the original left both open.
    with open(parameters.target_file) as target_handle:
        target_formulas = target_handle.readlines()
    with open(parameters.predicted_file) as predicted_handle:
        predicted_formulas = predicted_handle.readlines()

    if len(target_formulas) != len(predicted_formulas):
        print("number of formulas doesn't match")
        return

    n = len(target_formulas)
    total_len = 0
    total_edit_distance = 0
    for i, (tf, pf) in enumerate(zip(target_formulas, predicted_formulas), 1):
        if i % 2000 == 0:
            print("{}/{}".format(i, n))
        true_token = tf.strip().split(' ')
        predicted_tokens = pf.strip().split(' ')
        total_len += max(len(true_token), len(predicted_tokens))
        total_edit_distance += distance.levenshtein(true_token,
                                                    predicted_tokens)
    print("{}/{}".format(n, n))
    if total_len == 0:
        # Empty input files: nothing to score, avoid dividing by zero.
        print("no formulas to score")
        return
    print('Edit Distance Accuracy: %f' %
          (1. - float(total_edit_distance) / total_len))
def cluster_similar_responses(output_path):
    """Cluster logged responses by Levenshtein similarity and print the clusters.

    Reads every ``<output_path>*-<max_count>.log`` file, reduces each to the
    description returned by ``extract_description_from_message``, and runs
    affinity propagation on the negated pairwise Levenshtein distances.

    Args:
        output_path: path prefix of the log files.
    """
    max_count = get_max_socket_message_count(output_path)
    listing = glob.glob(output_path + '*-%s.log' % max_count)
    messages = []
    for filename in listing:
        # open() in a with-block replaces the Python-2-only file() builtin
        # and closes each handle.
        with open(filename) as handle:
            raw_message = handle.read()
        messages.append(extract_description_from_message(raw_message))
    messages = np.asarray(messages)

    print()
    print('Clustering %s responses...(this might take a while)' %
          len(messages))
    print()

    # Affinity propagation expects similarities, hence the negated distances.
    lev_similarity = -1 * np.array(
        [[distance.levenshtein(m1, m2) for m1 in messages]
         for m2 in messages])
    affprop = sklearn.cluster.AffinityPropagation(affinity='precomputed',
                                                  damping=0.5)
    affprop.fit(lev_similarity)

    print('Generated clusters:')
    print()
    for cluster_id in np.unique(affprop.labels_):
        exemplar = messages[affprop.cluster_centers_indices_[cluster_id]]
        cluster = np.unique(
            messages[np.nonzero(affprop.labels_ == cluster_id)])
        cluster_str = ', '.join(cluster)
        print('-' * 80)
        print(' - *%s:* %s' % (exemplar, cluster_str))
        print('-' * 80)
        print()
def dist(word1, word2):
    """Return the normalized Levenshtein distance between two track names.

    Levenshtein distance is the minimum number of single-character
    insertions, deletions and substitutions needed to turn one string into
    the other. Hamming distance is not used because it is only defined for
    strings of equal length.
    """
    import distance

    normalized_distance = distance.levenshtein(word1, word2, normalized=True)
    return normalized_distance
def similarity(self, word1, word2):
    """Score two words by their Levenshtein distance (lower means closer).

    The score is the distance with weight 1. A Hamming term is not included
    because it would require equal-length (e.g. padded) inputs.
    """
    levenshtein_weight = 1
    return distance.levenshtein(word1, word2) * levenshtein_weight
def initUI(self):
    """Build the evaluation dialog comparing typed input with the solution.

    Both texts are split on whitespace. Each solution group is compared with
    the input group at the same position (missing input groups count as
    empty strings) and the summed Levenshtein distance is reported as a
    percentage of the solution's letters. An empty solution yields 0%
    instead of raising ZeroDivisionError.
    """
    inputGroups = self.inputText.split()
    solutionGroups = self.solutionText.split()

    # Pad the input so every solution group has a counterpart.
    missing = len(solutionGroups) - len(inputGroups)
    if missing > 0:
        inputGroups.extend([""] * missing)

    numLetters = float(sum(len(group) for group in solutionGroups))
    numErrors = float(
        sum(levenshtein(solution, typed)
            for solution, typed in zip(solutionGroups, inputGroups)))
    percentage = numErrors / numLetters * 100.0 if numLetters else 0.0

    solutionLabel = QLabel(
        self.createEvaluationRichText(solutionGroups, inputGroups))
    errorLabel = QLabel("Error count (Levenshtein): %02.2f%%" % percentage)

    layout = QVBoxLayout()
    layout.addWidget(solutionLabel)
    layout.addWidget(errorLabel)
    if percentage < 10.0:
        successLabel = QLabel(
            "Error rate lower than 10%! <br/> Proceed to next lesson!")
        layout.addWidget(successLabel)

    self.setLayout(layout)
    self.setWindowTitle('Evaluation')
def similarity(self, word1, word2):
    """Return the Levenshtein distance of the two words as their score.

    Only the Levenshtein term contributes (weight 1). Hamming distance is
    left out because it needs inputs of equal length.
    """
    edit_distance = distance.levenshtein(word1, word2)
    return edit_distance * 1
def checkNames(df):
    """Print pairwise Levenshtein similarities between participant names.

    Rows with fewer than 15 minutes are dropped and hours are summed per
    name. The first 100 names of the grouped index are then compared
    pairwise, and every (negated distance, name, name) triple is printed in
    ascending order of the negated distance.

    Args:
        df: DataFrame with at least the 'mins' and 'name' columns.
    """
    # .copy() so the new column is not written to a view of the caller's
    # frame (avoids pandas' SettingWithCopyWarning).
    df = df[df['mins'] >= 15].copy()
    df['hours'] = 1. / 60. * df['mins']
    res = df.groupby('name')['hours'].agg(np.sum)
    print('Helenka')
    names = res.index.tolist()

    import distance

    # An array, so that indexing with a list works.
    words = np.asarray(names[0:100])
    lev_similarity = -1 * np.array(
        [[distance.levenshtein(w1, w2) for w1 in words] for w2 in words])
    print(lev_similarity)

    vals = sorted(([x, idx] for idx, x in np.ndenumerate(lev_similarity)),
                  key=lambda pair: pair[0])
    for value, position in vals:
        print(value, words[position[0]], ' , ', words[position[1]])
def ubica(bot, update, user_data):
    """Resolve the municipality named in the user's message.

    Looks for an exact match of the title-cased message in the ``municipio``
    table of ``aemet.db``. Without one, the table is scanned for the first
    row whose accent-stripped, lower-cased name has a normalized Levenshtein
    distance below 0.2 to the input.

    Returns:
        ``LUGAR`` when no municipality matches (the user is asked again),
        otherwise ``CHOOSING`` after recording the row's second column in
        ``cod`` and ``place`` for this chat.
    """
    conn = sqlite3.connect('aemet.db')
    try:
        c = conn.cursor()
        sitio = update.message.text.title()
        c.execute('SELECT * FROM municipio WHERE nombre=?', (sitio, ))
        base = c.fetchone()
        if base is None:
            update.message.reply_text(
                'Parece que no está bien escrito, déjame ver...')
            c.execute('SELECT * FROM municipio')
            wanted = unidecode(sitio.lower())
            # Iterate over the actual rows instead of a hard-coded count of
            # 9208, which crashed on None when the table was shorter.
            for base in c:
                if distance.levenshtein(wanted,
                                        unidecode(base[0].lower()),
                                        normalized=True) < 0.2:
                    break
            else:
                update.message.reply_text(
                    'Ubicación no encontrada. Por favor, indica el municipio que quieres buscar:'
                )
                return LUGAR
    finally:
        # The original never closed the connection.
        conn.close()

    cod[update.message.chat.id] = base[1]
    update.message.reply_text("Se ha elegido el municipio %s, en %s" %
                              (base[0], base[2]),
                              reply_markup=markup)
    place[update.message.chat.id] = base[1]
    return CHOOSING
def inlevenshtein(seq1, seqs, max_dist=0.1):
    """Yield ``(normalized_distance, seq)`` for sequences close to ``seq1``.

    A candidate is first screened with ``distance.levenshtein(...,
    max_dist=2)``; candidates for which that call returns -1 are skipped.
    The rest are yielded when their ``distance.nlevenshtein`` value is at
    most ``max_dist``.
    """
    for candidate in seqs:
        screened = distance.levenshtein(seq1, candidate, max_dist=2)
        if screened == -1:
            continue
        normalized = distance.nlevenshtein(seq1, candidate)
        if normalized <= max_dist:
            yield normalized, candidate
def distance_calculating(train_tuple, test_tuple):
    """
    Find, for every test item, the training items at minimal Levenshtein
    distance.

    The last element of each item (its class label) is dropped before the
    comparison. Scanning stops at the first exact match (distance 0).

    Returns a dictionary of the form:
    n_test: [distance, [n_train...n_train]]
    where distance is -1 and the list is empty if there were no training
    items.
    """
    data = {}
    for n_test, test_item in enumerate(test_tuple):
        data[n_test] = [-1, []]
        test_features = test_item[:-1]
        for n_train, train_item in enumerate(train_tuple):
            current = distance.levenshtein(test_features, train_item[:-1])
            best_so_far = data[n_test][0]
            if current == 0:
                # exact match: keep only this one and stop looking
                data[n_test] = [current, [n_train]]
                break
            if current == best_so_far:
                data[n_test][1].append(n_train)
            elif current < best_so_far or best_so_far == -1:
                data[n_test] = [current, [n_train]]
    return data
def knn(test, orig, k):
    """Classify ``test`` by majority vote among its ``k`` nearest records.

    Each record holds eight feature values followed by a class label at
    index 8. Distances are Levenshtein distances between feature slices; the
    original compared the whole ``test`` record (label included) against the
    training features and left its own feature slice unused.

    Args:
        test: record to classify.
        orig: iterable of labelled records.
        k: number of neighbours that vote.

    Returns:
        ``(predicted_label, actual_label)`` where the actual label is
        ``test[8]``.

    Raises:
        ValueError: if there are no neighbours to vote (empty ``orig`` or
            ``k`` < 1).
    """
    features = test[0:8]
    oc = test[8]

    dist = []
    index = []
    for record in orig:
        dist.append(distance.levenshtein(features, record[0:8]))
        index.append(record[8])
    dist, index = pairsort(dist, index)

    votes = {}
    for label in index[:max(k, 0)]:
        votes[label] = votes.get(label, 0) + 1
    predicted = max(votes.items(), key=operator.itemgetter(1))[0]
    return predicted, oc
def calcDist(Mots, AllWordsList, nbSyll):
    """Select pseudo-words that share the most common minimal distance.

    For every pseudo-word the minimal Levenshtein distance to the word base
    is computed, along with one closest word and the number of base words at
    that distance. The pseudo-words are then viewed as three consecutive
    groups of ``nbSyll // 9`` items.

    Args:
        Mots: pseudo-words.
        AllWordsList: word base to compare against.
        nbSyll: syllable count; one ninth of it is the group size.

    Returns:
        The ``[mot, closest_word, min_distance, count]`` entries of the three
        groups whose distance equals the most common one, or -1 when that
        distance occurs fewer than 12 times overall or fewer than 4 times in
        any of the three groups.
    """
    # Floor division: "/" gives a float on Python 3, which cannot be used as
    # a slice bound.
    nb9 = nbSyll // 9

    # minimal Levenshtein distance of each pseudo-word to the base
    WordDist = []
    for mot in Mots:
        minD = 1000000
        minWord = ''
        nbD = 0
        for word in AllWordsList:
            d = distance.levenshtein(mot, word)
            if d < minD:
                minD = d
                minWord = word
                nbD = 1
            elif d == minD:
                nbD += 1
        WordDist.append([mot, minWord, minD, nbD])

    distList = [entry[2] for entry in WordDist]
    # most common distance and its number of occurrences
    dist, nb = Counter(distList).most_common(1)[0]
    if nb < 12:
        return -1

    bounds = [(0, nb9), (nb9, 2 * nb9), (2 * nb9, 3 * nb9)]
    if any(distList[start:stop].count(dist) < 4 for start, stop in bounds):
        return -1

    selected = []
    for start, stop in bounds:
        selected += [entry for entry in WordDist[start:stop]
                     if entry[2] == dist]
    return selected
def calculateSimilarity_WithDistancePackage(createdSentence):
    """Return the best Levenshtein and Jaccard distances to the training set.

    Each sentence of the module-level ``X_train`` is turned into a word list
    through ``index_to_word`` (without its first and last tokens) and
    compared with ``createdSentence``. Sentences at Levenshtein distance 0
    are skipped.

    Args:
        createdSentence: list of words.

    Returns:
        dict with the minimum recorded distances under "Jaccard" and "Levin".

    Raises:
        ValueError: if every training sentence is at Levenshtein distance 0
            or ``X_train`` is empty.
    """
    levinDist = {}
    jaccardDist = {}
    for i, currSentence in enumerate(X_train):
        # sentence_str is a list of words
        sentence_str = [index_to_word[x] for x in currSentence[1:-1]]
        dist = distance.levenshtein(createdSentence, sentence_str)
        dist2 = distance.jaccard(createdSentence, sentence_str)
        if dist > 0:
            levinDist[i] = dist
            jaccardDist[i] = dist2

    # .values() replaces the Python-2-only itervalues().
    levinMin = min(levinDist.values())
    jaccardMin = min(jaccardDist.values())
    print("Best Distance Levinshtein: %f" % (levinMin))
    print("Best Distance Jaccard: %f" % (jaccardMin))
    bestValues = {}
    bestValues["Jaccard"] = jaccardMin
    bestValues["Levin"] = levinMin
    return bestValues
def get_conceptual_independence(data):
    """Average name similarity inside each cluster of ``data``.

    For every cluster, the nodes accepted by ``find_node_type`` contribute
    the last dot-separated part of their ``find_node`` name. A cluster's
    score is the mean, over all ordered name pairs (self-pairs included), of
    ``1 - levenshtein / longer_length``.

    Clusters without any accepted node are skipped, and two empty names
    count as identical, so neither case divides by zero.

    Args:
        data: mapping with a "clusters" list whose items have a "nodes" list.

    Returns:
        The mean cluster score as a float, or 0.0 when no cluster can be
        scored.
    """
    communities = []
    for cluster in data["clusters"]:
        names = []
        for node in cluster["nodes"]:
            if find_node_type(data, node):
                names.append(find_node(data, node).split(".")[-1])
        communities.append(names)

    community_score = []
    for names in communities:
        if not names:
            continue
        score = 0.0
        for left in names:
            for right in names:
                longest = max(len(left), len(right))
                if longest:
                    score += 1 - distance.levenshtein(left, right) / float(
                        longest)
                else:
                    score += 1
        community_score.append(score / len(names) ** 2)

    if not community_score:
        return 0.0
    return sum(community_score) / len(community_score)
def get_repodifference(self):
    """Compare each stored line with later lines from other repositories.

    Reads ``(repository, script, line_content)`` rows from the ``ipython``
    table of ``ipython.db``. Every row is paired with each row that follows
    it; a pair is scored when both the repository and the script differ. The
    score is whatever ``self.cell_analysis`` returns for the two content
    lengths and their Levenshtein distance.

    Returns:
        dict mapping ``"<repository>: <script>"`` to the list of scores
        collected for rows with that repository/script (possibly empty).
    """
    conn = sqlite3.connect('ipython.db')
    try:
        cursor = conn.cursor()
        rows = list(cursor.execute(
            'SELECT repository, script, line_content FROM ipython '))
    finally:
        # The original never closed the connection.
        conn.close()

    repo_differences = {}
    for position, (repository, script, content) in enumerate(rows):
        line_diff = []
        # Later rows only: no self-comparison, each pair scored once.
        for other_repo, other_script, other_content in rows[position + 1:]:
            if repository != other_repo and script != other_script:
                difference = distance.levenshtein(content, other_content)
                line_diff.append(
                    self.cell_analysis(len(content), len(other_content),
                                       difference))
        repo_cell = str(repository) + ": " + str(script)
        repo_differences.setdefault(repo_cell, []).extend(line_diff)
    return repo_differences
def get_title(author, talk_names):
    """Return the talk title belonging to ``author``.

    The first entry of ``talk_names`` that starts with ``author`` wins: its
    quoted part is returned when it looks like ``<something> "<title>"``,
    otherwise the whole entry. If no entry starts with the author, the
    entries are parsed with ``get_author`` and the title whose author has
    the smallest Levenshtein distance to ``author`` is used.

    Returns:
        The title, or a "not found" message when nothing matched.
    """
    for name in talk_names:
        if not name.startswith(author):
            continue
        quoted = re.search(r'(.*) "(.*)"', name)
        return quoted.group(2) if quoted else name

    # Use Levenshtein distance to get the closest title.
    min_title = ''
    min_distance = float('inf')
    parsed = (get_author(name) for name in talk_names)
    for subauth, subtitle in (pair for pair in parsed if pair[0]):
        cur_dis = distance.levenshtein(subauth, author)
        if cur_dis < min_distance:
            min_distance, min_title = cur_dis, subtitle

    if not min_title:
        return 'not_found. Is it no educativa?'
    return min_title
def assign_third(first, second, tags):
    """Build triplets by pairing each (first, second) item with a third item.

    For every position ``i`` up to ``min(len(first) - 1, ATTEMPTS)`` distinct
    random positions ``j`` are tried. A candidate qualifies when the
    Levenshtein distance between ``tags[i][0]`` and ``tags[j][0]`` exceeds
    ``MIN_LEV_DIST``; then ``first[j]`` or ``second[j]`` is picked at random
    and accepted if its id (``tags[j][1]`` or ``tags[j][2]``) differs from
    both ids of position ``i``. Positions without an accepted candidate are
    left out of the output.

    Returns:
        ``(first_out, second_out, third_out)``, three lists of equal length.
    """
    first_out = []
    second_out = []
    third_out = []
    B = len(first)
    attempts = min(B - 1, ATTEMPTS)
    for i in range(B):
        ti = first[i]
        si = second[i]
        hi, tidi, sidi = tags[i]
        remaining = attempts
        checked = set()  # set instead of list: O(1) membership test
        # "> 0" also stops when the budget is negative (ATTEMPTS < 0).
        while remaining > 0:
            j = random.randrange(B)
            if j in checked:
                continue
            checked.add(j)
            remaining -= 1
            hj = tags[j][0]
            if distance.levenshtein(hi, hj) <= MIN_LEV_DIST:
                continue
            if random.getrandbits(1):
                tj = first[j]
                newdi = tags[j][1]
            else:
                tj = second[j]
                newdi = tags[j][2]
            # Logical "and" instead of bitwise "&" on the comparisons.
            if newdi != tidi and newdi != sidi:
                first_out.append(ti)
                second_out.append(si)
                third_out.append(tj)
                remaining = 0
    return (first_out, second_out, third_out)
def RatingLevenshtein(revista):
    """Rank the journals of ``JCR2018.xlsx`` by Levenshtein distance to ``revista``.

    The workbook is read from the directory of this file; column 1 of its
    first sheet holds the values that are compared.

    Returns:
        ``(closest, seconds)`` where ``closest`` is the list of the ten
        ``(cell_value, distance)`` pairs with the smallest distance and
        ``seconds`` is the time spent computing distances, rounded to two
        decimals.
    """
    # Locate and open the spreadsheet next to this module.
    workbook_path = os.path.join(os.path.dirname(__file__), r'JCR2018.xlsx')
    sheet = xlrd.open_workbook(workbook_path).sheet_by_index(0)

    start_time = time()
    tuplas = []
    for row in range(sheet.nrows):
        journal = sheet.cell_value(row, 1)
        tuplas.append((journal, levenshtein(revista, journal)))
    execution_time = round(time() - start_time, 2)

    tuplas.sort(key=lambda pair: pair[1])
    closest = tuplas[:10]
    return (closest, execution_time)
def score(self):
    """Print evaluation metrics of ``self.predict`` over every row of ``self.df``.

    Per row, columns 0-3 are the input features, column 4 the actual path
    (space-separated), column 5 the label and column 6 the decision features.
    Reported means: label accuracy, Jaccard coefficient of the path tokens,
    Levenshtein distance between the paths with spaces removed, Jaccard
    coefficient of the decision features, and the sentence BLEU of the paths.
    """
    label_hits = []
    path_jaccard = []
    feature_jaccard = []
    path_levenshtein = []
    path_bleu = []

    for i in range(self.df.shape[0]):
        curr_feat = np.array([self.df.iloc[i, 0:4]])
        path, label, decision_feature = self.predict(curr_feat)
        print('actual vs predicted: ', self.df.iloc[i, 4], ' vs ',
              ' '.join(path), 'labels: ', self.df.iloc[i, 5], label[0])
        label_hits.append(self.df.iloc[i, 5] == label[0])

        actual_path = self.df.iloc[i, 4].split()
        actual_path_tok = [self.char_indices[char] for char in actual_path]
        pred_path_tok = [self.char_indices[char] for char in path]

        path_bleu.append(sentence_bleu([actual_path], path))
        path_jaccard.append(self.get_j_coeff(actual_path_tok, pred_path_tok))
        feature_jaccard.append(
            self.get_j_coeff(self.df.iloc[i, 6], decision_feature))

        actual_compact = self.df.iloc[i, 4].replace(' ', '')
        path_levenshtein.append(
            distance.levenshtein(actual_compact, ''.join(path)))
        print('Actual vs predicted features: ', self.df.iloc[i, 6], 'vs',
              decision_feature, '\n')

    print('\nLabel accuracy - ', np.mean(label_hits))
    print('Path metric (Jaccard) - ', np.mean(path_jaccard))
    print('Path metric (Levenshtein) - ', np.mean(path_levenshtein))
    print('Decision feature metric (Jaccard) - ', np.mean(feature_jaccard))
    print('Bleu score of paths - ', np.mean(path_bleu))
def edit_distance(references, hypotheses):
    """Return ``1 - total_levenshtein / total_length`` over paired sequences.

    The length of a pair is the longer of its two sequences. Pairs are formed
    with ``zip``, so extra items in the longer input are ignored.

    Args:
        references: iterable of reference sequences.
        hypotheses: iterable of hypothesis sequences.

    Returns:
        The score as a float; 1.0 when there is nothing to compare (no pairs
        or only empty sequences) instead of raising ZeroDivisionError.
    """
    d_leven, len_tot = 0, 0
    for ref, hypo in zip(references, hypotheses):
        d_leven += distance.levenshtein(ref, hypo)
        len_tot += float(max(len(ref), len(hypo)))
    if not len_tot:
        return 1.
    return 1. - d_leven / len_tot
def update1(k1, k2):
    """Scan ``k2`` for an entry whose 'T' value is close to each entry of ``k1``.

    For every item of ``k1`` the items of ``k2`` are visited until one has a
    Levenshtein distance of at most 18 between the two 'T' values.

    NOTE(review): the outcome of the scan is never stored or returned, so
    this function currently has no observable effect and returns None.
    """
    for left in k1:
        t1 = left['T']
        for right in k2:
            if int(distance.levenshtein(t1, right['T'])) <= 18:
                break
def align(sen1, sen2):
    """Find and print the best mapping of words from one sentence to the other.

    Both sentences are split on whitespace and passed through
    ``preprocess_word``. The shorter one is padded with "emptyWord" so that a
    square cost matrix of Levenshtein distances (rows: longer sentence,
    columns: shorter sentence) can be solved with Munkres. Padding columns
    keep a cost of 0.

    The matrix and the resulting ``(shorter_word, longer_word)`` pairs are
    printed; nothing is returned.
    """
    sen1 = list(map(preprocess_word, sen1.split()))
    sen2 = list(map(preprocess_word, sen2.split()))
    if len(sen1) > len(sen2):
        shorter, longer = sen2, sen1
    else:
        shorter, longer = sen1, sen2
    lengthDif = len(longer) - len(shorter)
    shorter = shorter + ["emptyWord"] * lengthDif

    # create the cost matrix
    size = len(longer)
    matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(size - lengthDif):
            matrix[i, j] = distance.levenshtein(longer[i], shorter[j])
    print(matrix)

    # solve the assignment with Munkres
    m = Munkres()
    indexes = m.compute(matrix)
    # Rows index the longer sentence and columns the shorter one; the
    # original printed shorter[row], longer[column], pairing the wrong words.
    print("mapping is:", [(shorter[j], longer[i]) for (i, j) in indexes])
def comp_string(self):
    """Collect normalized word-level Levenshtein distances per sentence.

    Every sentence of ``self.original`` is split into words. For each entry
    of ``self.recognized``, its first 25 sentences are split and compared
    with the original sentence at the same position; the distance divided by
    the longer word count is appended to that sentence's list. The
    per-sentence lists are finally appended to ``s.distances``.

    NOTE(review): ``s`` and ``d`` are names from outside this method (not
    ``self``); confirm that ``s.distances`` is the intended target.
    """
    allOriginalWords = [sentence.split() for sentence in self.original]
    allNLD = [[] for _ in allOriginalWords]

    for entry in self.recognized:
        for position in range(25):
            recognized_words = entry[position].split()
            original_words = allOriginalWords[position]
            ld = d.levenshtein(original_words, recognized_words)
            longest = max(len(original_words), len(recognized_words))
            allNLD[position].append(ld / longest)

    for sentence_distances in allNLD:
        s.distances.append(sentence_distances)
    print("finished distance NLD")
def analyse_titles(author, liste_titles, dict_titles2ark, zipname, outputfile):
    """
    When an author has several associated titles, compare each title with
    every title that follows it in the list, ignoring variants that only
    differ in their digits.

    A pair is reported through ``line2report`` when the Levenshtein distance
    between the raw titles is 1 or 2 and their ``clean_titles_int`` forms
    differ. The title with more ARK identifiers is listed first; on a tie
    the following title comes first.
    """
    for position, title in enumerate(liste_titles):
        title_chars = clean_titles_int(title)
        for foll_title in liste_titles[position + 1:]:
            foll_title_chars = clean_titles_int(foll_title)
            dist = distance.levenshtein(title, foll_title)
            if dist not in (1, 2) or title_chars == foll_title_chars:
                continue

            title_arks = dict_titles2ark[title]
            foll_arks = dict_titles2ark[foll_title]
            current = (" ".join(list(title_arks)), title)
            following = (" ".join(list(foll_arks)), foll_title)
            if len(title_arks) > len(foll_arks):
                leading, trailing = current, following
            else:
                leading, trailing = following, current

            line = [
                str(author), leading[0], leading[1], trailing[0],
                trailing[1],
                str(dist), zipname
            ]
            line2report(line, outputfile)
def assign_third(first, second, tags):
    """Complete each (first, second) pair with a randomly chosen third item.

    For each position ``i`` at most ``min(len(first) - 1, ATTEMPTS)``
    distinct random positions ``j`` are examined. ``j`` is usable when the
    Levenshtein distance between ``tags[i][0]`` and ``tags[j][0]`` is greater
    than ``MIN_LEV_DIST``. A coin flip then selects ``first[j]`` (id
    ``tags[j][1]``) or ``second[j]`` (id ``tags[j][2]``); the item is
    accepted when its id differs from both ids stored in ``tags[i]``.
    Positions for which no item is accepted produce no output.

    Returns:
        ``(first_out, second_out, third_out)``, three aligned lists.
    """
    first_out, second_out, third_out = [], [], []
    B = len(first)
    attempts = min(B - 1, ATTEMPTS)
    for i in range(B):
        hi, tidi, sidi = tags[i]
        budget = attempts
        checked = set()  # O(1) membership instead of scanning a list
        # "> 0" terminates even if ATTEMPTS is configured negative.
        while budget > 0:
            j = random.randrange(B)
            if j in checked:
                continue
            checked.add(j)
            budget -= 1
            if distance.levenshtein(hi, tags[j][0]) <= MIN_LEV_DIST:
                continue
            if random.getrandbits(1):
                tj, newdi = first[j], tags[j][1]
            else:
                tj, newdi = second[j], tags[j][2]
            # Boolean "and" rather than bitwise "&".
            if newdi != tidi and newdi != sidi:
                first_out.append(first[i])
                second_out.append(second[i])
                third_out.append(tj)
                budget = 0
    return (first_out, second_out, third_out)
def __init__(self, mapping=None, weights=None):
    """Set up the per-feature distance functions and weights.

    Args:
        mapping: list with one distance callable per feature index. When
            omitted, the numeric features use the Manhattan metric and the
            set-like features use the Jaccard metric.
        weights: list with one weight per feature; defaults to 1 for each.
    """
    self._distanceMetrics = {
        'euclidean': lambda a, b: distance.euclidean([a], [b]),
        'manhattan': distance.manhattanScalar,
        'levenshtein': lambda a, b: distance.levenshtein([a], [b]),
        'needleman_wunsch': lambda a, b: distance.needleman_wunsch([a], [b]),
        'jaccard': distance.jaccard,
        'dice': distance.dice
    }

    # "is None" so that array-like arguments are not compared element-wise.
    self._mapping = mapping
    if self._mapping is None:
        self._mapping = [None] * NUM_FEATURES
        manhattan = self._distanceMetrics['manhattan']
        jaccard = self._distanceMetrics['jaccard']
        for feature in (STARS, TOTAL_REVIEW_COUNT, AVAILABLE_REVIEW_COUNT,
                        MEAN_REVIEW_LEN, MEAN_WORD_LEN, NUM_WORDS,
                        MEAN_WORD_COUNT, TOTAL_HOURS):
            self._mapping[feature] = manhattan
        for feature in (ATTRIBUTES, CATEGORIES, TOP_WORDS, KEY_WORDS,
                        OPEN_HOURS):
            self._mapping[feature] = jaccard

    self._weights = weights
    if self._weights is None:
        self._weights = [1] * NUM_FEATURES
def ajax():
    """Return the Levenshtein distance between two ingredients as JSON.

    Expects a JSON object with "ingredient1" and "ingredient2". Responds
    with ``{"distance": <value>}``, or aborts with HTTP 400 when the body is
    not a JSON object or one of the two fields is missing.
    """
    payload = request.json
    if not payload or not isinstance(payload, dict):
        abort(400)
    ingredient1 = payload.get("ingredient1")
    ingredient2 = payload.get("ingredient2")
    if ingredient1 is None or ingredient2 is None:
        # A missing field used to reach levenshtein() as None and ended in
        # a server error instead of a client error.
        abort(400)
    edit_distance = distance.levenshtein(ingredient1, ingredient2)
    return jsonify({"distance": edit_distance})
def leven(self):
    """Return the Levenshtein distance of every pair of cleaned strings.

    The strings come from ``Strings_comp(self).clear_data()``. Pairs are
    visited in index order: (0, 1), (0, 2), ..., (1, 2), ...

    Returns:
        List of distances, one per unordered pair.
    """
    all_dis = []
    small_list = Strings_comp(self).clear_data()
    for i in range(0, len(small_list) - 1):
        cursor = small_list[i]
        # The inner range now reaches the last element; the original stopped
        # one short, so the final string was never compared with anything.
        for j in range(i + 1, len(small_list)):
            all_dis.append(distance.levenshtein(cursor, small_list[j]))
    return all_dis
def successful(entry, word_list=word_list):
    """Check that the entry's lexical symbols reproduce ``word_list``.

    ``entry.lexical`` is read as ``(address, subtree)`` pairs and visited in
    address order.

    Args:
        entry: object with a ``lexical`` iterable of pairs.
        word_list: expected words; defaults to the module-level list bound
            when this function was defined.

    Returns:
        False as soon as a symbol is not in ``word_list``; otherwise True
        exactly when the Levenshtein distance between the collected symbols
        and ``word_list`` is 0.
    """
    new_word_list = []
    # Index-based key: tuple-unpacking lambda parameters are a SyntaxError
    # on Python 3.
    for _address, subtree in sorted(entry.lexical, key=lambda pair: pair[0]):
        new_word = subtree.symbol
        if new_word not in word_list:
            return False
        new_word_list.append(new_word)
    return levenshtein(new_word_list, word_list) == 0
def test_similarity(sentence):
    """Return the memoized Levenshtein distance from ``target_sentence``.

    Results are cached per sentence in the module-level ``similarity_memo``.
    """
    if sentence not in similarity_memo:
        similarity_memo[sentence] = distance.levenshtein(
            target_sentence, sentence)
    return similarity_memo[sentence]
def meaning_answer_qual(self, entered):
    """Grade an entered meaning against the known meanings of the word.

    Each meaning in ``word.meaning`` is normalized and compared with
    ``entered``: an exact match grades 2, a Levenshtein distance of at most
    40% of the meaning's length (rounded) grades 1, anything else 0.

    NOTE(review): ``word`` is not an attribute of ``self`` here; confirm it
    is meant to come from the enclosing scope.

    Returns:
        The highest grade over all meanings (0 when there are none).
    """
    best_grade = 0
    for raw_meaning in word.meaning:
        meaning = normalize(raw_meaning)
        tolerance = round(0.4 * len(meaning))  # arbitrary threshold
        dist = distance.levenshtein(entered, meaning)
        if dist == 0:
            grade = 2
        elif dist <= tolerance:
            grade = 1
        else:
            grade = 0
        best_grade = max(best_grade, grade)
    return best_grade
def levenhstein_cluster(words):
    """Cluster words with affinity propagation on Levenshtein similarity.

    The similarity of two words is their negated Levenshtein distance. One
    line is printed per cluster: the exemplar followed by the sorted unique
    members.
    """
    # An array, so that indexing with an index array works below.
    words = np.asarray(words)
    pairwise = [[distance.levenshtein(w1, w2) for w1 in words]
                for w2 in words]
    lev_similarity = -1 * np.array(pairwise)

    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed",
                                                  damping=0.5)
    affprop.fit(lev_similarity)

    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        members = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
        print(" - *%s:* %s" % (exemplar, ", ".join(members)))
def wikiMatches(word, maxDistance=5):
    """Return keyword-search hits whose label is close to ``word``.

    The lower-cased word is sent to ``keywordSearch`` (at most 20 hits). Each
    hit gets a "labelDistance" entry: the Levenshtein distance between the
    word and the hit's label after removing a trailing parenthesised part,
    lower-casing and stripping it.

    Returns:
        The hits whose "labelDistance" is at most ``maxDistance``.
    """
    import distance
    import re

    word = word.lower()
    results = keywordSearch(word, maxHits = 20)["results"]

    close_enough = []
    for result in results:
        label = re.sub(r'\(.*\)$', '', result["label"])
        cleanLabel = label.lower().strip()
        result["labelDistance"] = distance.levenshtein(word, cleanLabel)
    for result in results:
        if result["labelDistance"] <= maxDistance:
            close_enough.append(result)
    return close_enough
def get_close_matches(self, test_string, n=5, cutoff=0.5):
    """Return titles resembling ``test_string`` together with a score.

    Candidate keys are chosen by ``difflib.get_close_matches`` on the
    lower-cased string among the keys of ``self.long_titles``. Each
    candidate's score is ``1.0 - levenshtein(test_string, candidate)``.

    NOTE(review): the distance is not normalized, so scores are at most 1.0
    and usually negative; confirm whether ``normalized=True`` was intended.

    Returns:
        ``zip`` of candidates and their scores.
    """
    matched_keys = difflib.get_close_matches(test_string.lower(),
                                             self.long_titles.keys(),
                                             n=n,
                                             cutoff=cutoff)
    candidates = [self.long_titles[key] for key in matched_keys]
    scores = [
        1.0 - distance.levenshtein(test_string, candidate)
        for candidate in candidates
    ]
    return zip(candidates, scores)
def edit_distance():
    """Return the Levenshtein distance of 'word1' and 'word2' from the JSON body.

    Raises:
        InvalidUsage: if the request carries no JSON.

    Returns:
        JSON response echoing both words and their 'edit_distance'.
    """
    if not request.json:
        raise InvalidUsage('Requires JSON')

    data = request.get_json()
    first, second = data['word1'], data['word2']
    return jsonify(word1=first,
                   word2=second,
                   edit_distance=levenshtein(first, second))
def search(self, term, k, results=None):
    """Collect terms within distance ``k`` of ``term`` from this node downward.

    The node's own term is appended to ``results`` when its Levenshtein
    distance ``d`` to ``term`` is at most ``k``. Children stored under keys
    from ``max(0, d - k)`` to ``d + k`` inclusive are searched recursively.

    Args:
        term: term to look for.
        k: maximum accepted distance.
        results: list receiving the matches; a fresh list is used when
            omitted (it is then not visible to the caller).

    Returns:
        Number of nodes visited, this one included.
    """
    if results is None:
        results = []

    d = distance.levenshtein(self.term, term)
    if d <= k:
        results.append(self.term)

    visited = 1
    lowest_key = max(0, d - k)
    for key in range(lowest_key, d + k + 1):
        child = self.children.get(key)
        if not child:
            continue
        visited += child.search(term, k, results)
    return visited
def build_json(df):
    """Build a node/edge dictionary linking names that are nearly identical.

    Every entry of `df.name` becomes a node. Two different positions are
    joined by an edge when the edit distance between their names is below 3;
    the edge's `value` is that distance plus one, and `source`/`target` are
    the positions in the name list (source < target).

    Returns a dict with the keys 'nodes' and 'edges'.
    """
    names = list(df.name)
    nodes = [{'name': name} for name in names]

    edges = []
    total = len(names)
    for source in range(total):
        for target in range(source, total):
            gap = distance.levenshtein(names[source], names[target])
            if target == source or gap >= 3:
                continue
            edges.append({'source': source, 'target': target, 'value': gap + 1})

    return {'nodes': nodes, 'edges': edges}
def calculateDistance(word_array, vocabulary):
    """Map each vocabulary entry to its smallest edit distance to any word.

    For every item of `vocabulary` the Levenshtein distance to each item of
    `word_array` is computed and the minimum kept. The minimum starts at
    9999, so that value is reported when `word_array` is empty and also acts
    as an upper cap.

    Returns a dict keyed by vocabulary entry.
    """
    closest = {}
    for voc in vocabulary:
        best = 9999
        for word in word_array:
            best = min(best, distance.levenshtein(word, voc))
        closest[voc] = best
    return closest
def extract(self, query, document, params=None):
    '''
    Compute the normalized edit distance between a query and a document.

    Both inputs are passed through `self.tokenize` and the resulting token
    sequences are compared with `distance.levenshtein(..., normalized=True)`.

    Args:
        query(str): The query string
        document(str): The document string
        params: parameters for the feature extractor (not used here)

    Returns
        float -- feature value
    '''
    query_tokens = self.tokenize(query)
    document_tokens = self.tokenize(document)
    return distance.levenshtein(query_tokens, document_tokens, normalized=True)
def align(sen1, sen2, string=True):
    """Find the best one-to-one mapping of words between two sentences.

    The shorter sentence is padded with "emptyWord" placeholders so both
    have the same length, a square cost matrix is built (edit distance plus
    a positional penalty of |i - j| / length), and the assignment is solved
    with Munkres. Placeholder columns keep a cost of zero.

    Args:
        sen1, sen2: the sentences, either as strings or as sequences of
            terminal nodes exposing a `.text` attribute.
        string: True when the sentences are given as strings; False when
            they are given as terminal nodes.

    Returns:
        (mapping, refactored_indexes): `mapping` is a list of word pairs in
        (sen1 word, sen2 word) order and `refactored_indexes` the matching
        list of index pairs. Indexes are 0-based for strings and 1-based
        for terminal nodes. A word matched to padding is paired with
        "emptyWord" and index -1.
    """
    if string:
        sen1 = [preprocess_word(token) for token in sen1.split()]
        sen2 = [preprocess_word(token) for token in sen2.split()]
    else:
        sen1 = [preprocess_word(terminal.text) for terminal in sen1]
        sen2 = [preprocess_word(terminal.text) for terminal in sen2]

    # Decide which sentence needs padding; `switched` records that sen1 is
    # the shorter one so output pairs can be put back in (sen1, sen2) order.
    length_dif = len(sen1) - len(sen2)
    if length_dif > 0:
        shorter, longer, switched = sen2, sen1, False
    else:
        shorter, longer, switched = sen1, sen2, True
    length_dif = abs(length_dif)
    shorter += ["emptyWord"] * length_dif

    # Build the square cost matrix; only real (non-padding) columns get a cost.
    size = len(longer)
    real_columns = size - length_dif
    matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(real_columns):
            matrix[i, j] = (distance.levenshtein(longer[i], shorter[j])
                            + float(abs(i - j)) / size)

    indexes = Munkres().compute(matrix)

    refactored_indexes = []
    mapping = []
    start = 0 if string else 1
    for i, j in indexes:
        if j >= real_columns:
            # Matched to padding. Name the placeholder directly: indexing
            # `shorter` with the shifted sentinel (-1 - start) picked the
            # last real word, or went out of range, when string is False.
            short_word = "emptyWord"
            short_index = -1
        else:
            short_word = shorter[j]
            short_index = j + start
        long_index = i + start

        if switched:
            refactored_indexes.append((short_index, long_index))
            mapping.append((short_word, longer[i]))
        else:
            refactored_indexes.append((long_index, short_index))
            mapping.append((longer[i], short_word))
    return mapping, refactored_indexes
def getSuggestedWords(word, wordFrequencies):
    """Return candidate words within edit distance 2 of `word`.

    Args:
        word: the word to find suggestions for; it is also printed to
            stdout before the search starts.
        wordFrequencies: iterable of items whose first element is the
            candidate word (presumably (word, frequency) pairs -- verify
            against the caller).

    Returns:
        A list of candidate words, in input order, whose Levenshtein
        distance to `word` is at most 2.
    """
    suggestedWords = []
    # Function-call form prints the same text on Python 2 and also parses
    # on Python 3, unlike the bare `print word` statement.
    print(word)
    for item in wordFrequencies:
        possibleWord = item[0]
        # Quick rejection: a length gap above 2 already implies a distance above 2.
        if abs(len(word) - len(possibleWord)) > 2:
            continue
        editDistance = distance.levenshtein(word, possibleWord)
        if editDistance <= 2:
            suggestedWords.append(possibleWord)
    return suggestedWords
def findWordVariants(tset, wordVariants):
    # Return the index of the first entry of `wordVariants` that `tset` is
    # judged to belong to, or -1 when no entry qualifies.
    #
    # An entry qualifies as soon as one candidate from `tset` is a member of
    # it, or as soon as the running distance score drops below 1.3.
    #
    # NOTE(review): the block structure below was reconstructed from a
    # flattened source line; confirm that the `else` pairs with
    # `if cand not in entry` and that the division sits inside the candidate
    # loop.
    dist = 0
    for ind in range(len(wordVariants)):
        #its a set
        entry = wordVariants[ind]
        # The score is reset per entry but carried over between candidates.
        dist = 0
        for cand in tset:
            if cand not in entry:
                for toCheck in entry:
                    dist += levenshtein(cand, toCheck)
                # Scale by the number of (candidate, member) pairs; `* 1.0`
                # forces float division.
                dist /= (len(tset) * len(entry) * 1.0)
                if dist < 1.3:
                    #print dist, len(tset), len(entry)
                    return ind
            else:
                # The candidate is already a member of this entry.
                return ind
    return -1
def testCloseMatch(self):
    """Check `tree.close_match` against a brute-force Levenshtein scan.

    For every key, the tree is queried with a maximum distance of 4 (the
    call is wrapped in the "close_match" timer). Every other key is then
    compared directly: keys within distance 4 must be present in the result
    with the same distance as first element of their value, and keys
    further away must be absent.
    """
    for probe in self.keys.iterkeys():
        timer_start("close_match")
        found = self.tree.close_match(probe, 4, None, DictAction())
        timer_end("close_match")

        for other in self.keys.iterkeys():
            expected = levenshtein(probe, other)

            if expected > 4:
                # Too far away: the tree must not have reported it.
                self.assert_(
                    other not in found,
                    "Mauvaise distance pour %s et %s : %i > 4 mais trouvé %s" % (probe, other, expected, found.get(other)),
                )
                continue

            # Close enough: it must be reported...
            self.assert_(other in found, "Match manquant pour %s : %s (distance = %i)" % (probe, other, expected))
            # ...and with the right distance.
            self.assert_(
                other in found and found[other][0] == expected,
                "Mauvaise distance pour %s et %s : %i != %s" % (probe, other, expected, found.get(other)[0]),
            )
def getMinSequence(string, slist):
    """Return the index of the tuple in `slist` that is closest to `string`.

    The closeness of a tuple is the mean Levenshtein distance between
    `string` and each of the tuple's entries. On ties the earliest tuple
    wins.

    Args:
        string: the string to compare.
        slist: sequence of non-empty tuples of strings.

    Returns:
        The index of the closest tuple when its mean distance is below 3,
        otherwise None (also when `slist` is empty).
    """
    best_index = None
    best_dist = None
    for index, tup in enumerate(slist):
        total = 0
        for entry in tup:
            total += levenshtein(string, entry)
        mean_dist = total / (len(tup) * 1.0)
        # Test for None first: ordering comparisons against None raise
        # TypeError on Python 3.
        if best_dist is None or mean_dist < best_dist:
            best_dist = mean_dist
            best_index = index

    if best_dist is not None and best_dist < 3:
        return best_index
    return None
def compare_ref(usertrans, targettrans):
    """
    Evaluate a user translation against the target translation.

    Both texts are UTF-8 encoded, have punctuation replaced by spaces and
    are lower-cased. Two similarity ratios are computed -- a character-level
    one (`lev.ratio`) and a token-level one based on the edit distance
    between the word lists -- and the higher of the two picks the feedback
    tier: >= 0.9 'very good', >= 0.75 'good', >= 0.6 'fair', else 'average'.

    @ usertrans: translation provided by the user
    @ targettrans: target translation (ideal translation of a text)

    Returns a randomly chosen feedback message from the selected tier.
    """
    evaluation = {
        'very good': ['Superb translation!', 'Great work!', 'Perfect score!', 'High five!'],
        'good': ['Good translation!', 'Nice work!', 'Almost perfect!'],
        'fair': ['Not bad!', 'Almost there!'],
        'average': ['You can do better!', 'Shall we practice a little more?'],
    }

    # encode to UTF-8
    tt = targettrans.encode('utf-8')
    ut = usertrans.encode('utf-8')

    # remove punctuation
    replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
    tt = tt.translate(replace_punctuation).lower()
    ut = ut.translate(replace_punctuation).lower()

    # Character-level similarity.
    lev_let = lev.ratio(tt, ut)

    # Token-level similarity.
    tt = tt.split()
    ut = ut.split()
    lensum = len(tt) + len(ut)
    if lensum:
        # float() forces true division; with two ints Python 2 would floor
        # the ratio to 0 or 1.
        lev_tok = float(lensum - distance.levenshtein(tt, ut)) / lensum
    else:
        # No tokens on either side: let the character-level ratio decide
        # instead of dividing by zero.
        lev_tok = 0.0

    ratio = max(lev_tok, lev_let)
    if ratio >= 0.9:
        return random.choice(evaluation['very good'])
    elif ratio >= 0.75:
        return random.choice(evaluation['good'])
    elif ratio >= 0.6:
        return random.choice(evaluation['fair'])
    else:
        return random.choice(evaluation['average'])
def lexem_best_match(token, lexems):
    """Pick the lexem closest to `token` by edit distance.

    Lexems are indexable items: element 0 is the lexem text and element 1 a
    tag. Items tagged 'P' or 'V' are ignored. A lexem is only accepted when
    its distance to `token` is strictly smaller than len(token); among
    accepted lexems the first one with the smallest distance wins.

    Returns:
        `token` itself when `lexems` is empty; the winning lexem,
        lower-cased and passed through `strip_diacritics_1`; or None when
        no lexem was accepted.
    """
    if not len(lexems):
        return token

    winner = None
    lowest = len(token)
    for candidate in lexems:
        if candidate[1] in ('P', 'V'):
            continue
        text = candidate[0].lower()
        gap = distance.levenshtein(token, text)
        if gap < lowest:
            lowest, winner = gap, text

    if not winner:
        return None
    return strip_diacritics_1(winner.lower())
def main_batches(): g = Graph(is_training=False) # Load data nums, X, ys = load_test_data() pnyn2idx, idx2pnyn, hanzi2idx, idx2hanzi = load_vocab() with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: # Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)); print("Restored!") # Get model mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name with codecs.open('eval/{}_{}.csv'.format(mname, "qwerty" if hp.isqwerty else "nine"), 'w', 'utf-8') as fout: fout.write("NUM,EXPECTED,{}_{},# characters,edit distance\n".format(mname, "qwerty" if hp.isqwerty else "nine")) total_edit_distance, num_chars = 0, 0 for step in range(len(X)//hp.batch_size): num = nums[step*hp.batch_size:(step+1)*hp.batch_size] #number batch x = X[step*hp.batch_size:(step+1)*hp.batch_size] # input batch y = ys[step*hp.batch_size:(step+1)*hp.batch_size] # batch of ground truth strings preds = sess.run(g.preds, {g.x: x}) for n, xx, pred, expected in zip(num, x, preds, y): # sentence-wise #got = "".join(idx2hanzi[str(idx)] for idx in pred)[:np.count_nonzero(xx)].replace("_", "") got = "".join(idx2hanzi[idx] for idx in pred)[:np.count_nonzero(xx)].replace("_", "") edit_distance = distance.levenshtein(expected, got) total_edit_distance += edit_distance num_chars += len(expected) fout.write(u"{},{},{},{},{}\n".format(n, expected, got, len(expected), edit_distance)) fout.write(u"Total CER: {}/{}={},,,,\n".format(total_edit_distance, num_chars, round(float(total_edit_distance)/num_chars, 2)))