def kmedias(data, k, distance, iteration):
    """Cluster single-feature samples with a k-means style loop.

    Centroids start as ``k`` random integers drawn from
    ``[min(data), max(data))`` and are stored as a ``uint8`` array of shape
    ``(k, 1)``. Each pass assigns every sample to its nearest centroid,
    recomputes each centroid as the mean of its members, and prints the total
    centroid shift. The loop ends when that shift drops below 0.001 or the
    pass counter exceeds ``iteration``; the pass count is printed at the end.

    Args:
        data: Indexable collection of samples (``data[i]`` is one sample).
        k: Number of clusters.
        distance: ``'euclidean'`` selects the Euclidean distance; any other
            value selects the Chebyshev distance.
        iteration: Pass limit (the loop stops once the counter exceeds it).

    Returns:
        Tuple ``(points, clusters, C)``: the members of the last cluster
        processed (index ``k - 1``), the per-sample labels reshaped to
        ``(n, 1)``, and the ``uint8`` centroids.
    """
    base = data
    C_1 = np.random.randint(np.min(base), np.max(base), size=k)
    # NOTE(review): uint8 storage truncates centroid means and only suits
    # data in 0..255; kept because callers receive C with this dtype.
    C = np.array(list(zip(C_1)), dtype=np.uint8)
    clusters = np.zeros(len(base))
    points = []  # defined up front so the return works even with no pass
    aux = math.inf
    test = 0
    while aux >= 0.001:
        if test > iteration:
            break
        # Distances are computed in float64 so uint8 subtraction cannot wrap.
        centroids = C.astype(np.float64)
        for i in range(len(base)):
            gap = np.abs(np.asarray(base[i], dtype=np.float64) - centroids)
            if distance == 'euclidean':
                distancias = np.linalg.norm(gap, axis=1)
            else:
                # Chebyshev distance: largest coordinate difference.
                distancias = np.max(gap, axis=1)
            clusters[i] = np.argmin(distancias)
        # Previous centroids, kept to measure how far this pass moved them.
        C_ant = deepcopy(C)
        clusters = np.reshape(clusters, (clusters.shape[0], 1))
        for i in range(k):
            points = [base[j] for j in range(len(base)) if clusters[j] == i]
            # An empty cluster keeps its centroid; the mean of nothing is NaN.
            if points:
                C[i] = np.mean(points, axis=0)
        shift = np.abs(C.astype(np.float64) - C_ant.astype(np.float64))
        if distance == 'euclidean':
            error = np.linalg.norm(shift, axis=1)
        else:
            error = np.max(shift)
        aux = np.sum(error)
        print(aux)
        test = test + 1
    print(test)
    return points, clusters, C
def sortneighbors(self, x, y, X_train, x_test):
    """Order training samples by increasing distance to one query sample.

    Args:
        x: Training samples, one row per sample (converted to float).
        y: Labels aligned with the rows of ``x``.
        X_train: Unused; kept so existing callers keep working.
        x_test: The query sample (converted to float).

    Returns:
        Tuple ``(x_sorted, y_sorted)``: the rows of ``x`` as a float array and
        the labels as a list, both ordered from nearest to farthest.

    Raises:
        ValueError: If ``self.metric`` is not one of ``'cosine'``,
            ``'chebyshev'``, ``'cityblock'``, ``'euclidean'``, ``'minkowski'``.
    """
    metric_functions = {
        'cosine': distance.cosine,
        'chebyshev': distance.chebyshev,
        'cityblock': distance.cityblock,
        'euclidean': distance.euclidean,
        'minkowski': distance.minkowski,
    }
    try:
        metric_fn = metric_functions[self.metric]
    except (KeyError, TypeError):
        # Sorting by distances that were never computed would return garbage.
        raise ValueError(
            'Unsupported distance function %r; expected one of %s'
            % (self.metric, sorted(metric_functions)))

    # ``np.float`` was removed from NumPy; the builtin ``float`` is equivalent.
    x = np.asarray(x, dtype=float)
    x_test = np.asarray(x_test, dtype=float)
    dist = np.array([metric_fn(row, x_test) for row in x], dtype=float)

    # Indices of the samples in ascending order of distance.
    order = np.argsort(dist)
    x_sorted = x[order]
    y_sorted = [y[i] for i in order]
    return x_sorted, y_sorted
def calculateL2(self, feat1, feat2, c_type='euclidean'):
    """Return the distance between two equally shaped feature arrays.

    Args:
        feat1: First feature array.
        feat2: Second feature array; must have the same shape as ``feat1``.
        c_type: One of ``'cosine'``, ``'euclidean'``, ``'correlation'``,
            ``'braycurtis'``, ``'canberra'``, ``'chebyshev'``.

    Returns:
        The value computed by the matching ``distance`` function.

    Raises:
        AssertionError: If the two shapes differ.
        ValueError: If the rank does not match the mode selected by
            ``config.insight`` (1-D when set, 2-D otherwise) or ``c_type``
            is not supported.
    """
    assert np.shape(feat1) == np.shape(feat2)
    # The unpacking doubles as a rank check on the inputs.
    if config.insight:
        (_,) = np.shape(feat1)
    else:
        _, _ = np.shape(feat1)

    metric_functions = {
        "cosine": distance.cosine,
        # Plain Euclidean distance equals the former ``w=1`` call; a scalar
        # weight is rejected by SciPy versions that require 1-D weights.
        "euclidean": distance.euclidean,
        "correlation": distance.correlation,
        "braycurtis": distance.braycurtis,
        "canberra": distance.canberra,
        "chebyshev": distance.chebyshev,
    }
    if c_type not in metric_functions:
        raise ValueError("unsupported c_type: %r" % (c_type,))
    return metric_functions[c_type](feat1, feat2)
def distance(x, y, weights=None, p=3, method="euclidean"):
    '''
    Compute the distance between two vectors with the selected method.

    :param x: X vector
    :param y: Y vector
    :param weights: Unused; kept for backward compatibility
    :param p: Order of the norm for the "minkowski" method
    :param method: One of "euclidean", "minkowski", "cosine", "manhattan",
        "dice", "jaccard", "hamming", "canberra" (the historical spelling
        "canbera" is also accepted) or "chebyshev". Any other value prints a
        notice and falls back to the Euclidean distance.
    :return: The Distance Value
    '''
    # This function is itself named ``distance``, so the SciPy module has to
    # be reached through a private alias instead of the shadowed name.
    from scipy.spatial import distance as _sp_distance

    if method == "euclidean":
        value = _sp_distance.euclidean(x, y)
    elif method == "minkowski":
        value = _sp_distance.minkowski(x, y, p)
    elif method == "cosine":
        value = _sp_distance.cosine(x, y)
    elif method == "manhattan":
        value = _sp_distance.cityblock(x, y)
    elif method == "dice":
        value = _sp_distance.dice(x, y)
    elif method == "jaccard":
        value = _sp_distance.jaccard(x, y)
    elif method == "hamming":
        # Previously ``value == ...`` compared instead of assigning, so the
        # result was always 0.0.
        value = _sp_distance.hamming(x, y)
    elif method in ("canbera", "canberra"):
        value = _sp_distance.canberra(x, y)
    elif method == "chebyshev":
        value = _sp_distance.chebyshev(x, y)
    else:
        print(method, " Not Found! unsing Eclidean Distance!")
        value = _sp_distance.euclidean(x, y)
    return value
def tmpAssignPoints(self, centroids):
    """Pair every row of ``self.df`` with its nearest centroid.

    The metric is chosen by ``self.metric``: ``"chebyshev"`` or
    ``"cityblock"`` select those distances, anything else uses the Euclidean
    distance. The selected metric is printed first.

    Args:
        centroids: Iterable of centroid vectors.

    Returns:
        Tuple ``(labels_centroids, centroids)`` where ``labels_centroids``
        holds, for each row of ``self.df`` in index order, the nearest
        centroid (the first one wins on ties).
    """
    # The parenthesised single-argument form prints the same text under
    # Python 2 and parses under Python 3.
    print("WYBRANA METRYKA: " + str(self.metric))

    distCount = euclidean
    if self.metric == "chebyshev":
        # Chebyshev = L-infinity norm.
        distCount = chebyshev
    if self.metric == "cityblock":
        # Cityblock = L1 norm.
        distCount = cityblock

    labels_centroids = []
    for i in self.df.index:
        point = self.df.loc[i].values
        candidates = []
        disctances = []
        for c in centroids:
            disctances.append(distCount(point, c))
            candidates.append(c)
        nearest = disctances.index(min(disctances))
        labels_centroids.append(candidates[nearest])
    return (labels_centroids, centroids)
def color_histogram_shot_labels(
        config, histograms: Sequence[bytes]) -> Sequence[bytes]:
    """Detect shot boundaries from serialized per-frame colour histograms.

    Each entry is decoded with ``readers.histograms``. The difference between
    adjacent frames is the mean Chebyshev distance over the three histograms
    of each frame. A frame is a positive (negative) boundary when its
    difference deviates from the mean of the surrounding window by more than
    ``POSITIVE_OUTLIER`` (less than ``NEGATIVE_OUTLIER``) standard deviations.

    Args:
        config: Object whose ``protobufs`` attribute is passed to the reader.
        histograms: One serialized histogram record per frame.

    Returns:
        A list whose first element is the pickled tuple
        ``(positive_boundaries, negative_boundaries)`` followed by one
        ``b'\\0'`` placeholder for every remaining input frame.
    """
    hists = [readers.histograms(byts, config.protobufs) for byts in histograms]

    # Mean difference between each pair of adjacent frames; the first frame
    # has no predecessor and gets a difference of 0.
    diffs = np.array([
        np.mean([
            distance.chebyshev(hists[i - 1][j], hists[i][j]) for j in range(3)
        ]) for i in range(1, len(hists))
    ])
    diffs = np.insert(diffs, 0, 0)
    n = len(diffs)

    # Simple outlier detection to find boundaries between shots.
    positive_boundaries = []
    negative_boundaries = []
    for i in range(1, n):
        window = diffs[max(i - WINDOW_SIZE, 0):min(i + WINDOW_SIZE, n)]
        deviation = diffs[i] - np.mean(window)
        spread = np.std(window)
        if deviation > POSITIVE_OUTLIER * spread:
            positive_boundaries.append(i)
        if deviation < NEGATIVE_OUTLIER * spread:
            negative_boundaries.append(i)

    # Placeholders are bytes so every element matches the declared return
    # type (pickle.dumps also returns bytes).
    return [pickle.dumps((positive_boundaries, negative_boundaries))
            ] + [b'\0' for _ in range(len(histograms) - 1)]
def test_standard_chebyshev_call_works(tmpdir, sample_config):
    """Distances written by DyeScore must equal SciPy's Chebyshev distance."""
    sample_config['DYESCORE_DATA_DIR'] = tmpdir.strpath
    ds = DyeScore(write_config_file(tmpdir, sample_config))

    # Store a random 5 x 2 snippet/symbol matrix as the snippets dataset.
    values = np.random.rand(5, 2)
    snippet_ids = [str(position) for position in range(5)]  # 0-indexed ids
    symbols = ['window.navigator', 'canvas.context']
    snippets = xr.DataArray(
        values,
        coords={'snippet': snippet_ids, 'symbol': symbols},
        dims=('snippet', 'symbol'),
    )
    snippet_file = ds.dye_score_data_file('snippets')
    snippets.to_dataset(name='data').to_zarr(
        store=ds.get_zarr_store(snippet_file))

    # Run the computation against a single dye snippet.
    dye = '2'
    result_file = ds.compute_distances_for_dye_snippets([dye], override=True)

    # Every stored distance must match the directly computed one.
    results = xr.open_zarr(store=ds.get_zarr_store(result_file))['data']
    assert results.shape == (5, 1)
    for position, snippet in enumerate(snippet_ids):
        observed = results.sel(snippet=snippet, dye_snippet=dye).values
        expected = chebyshev(values[2], values[position])
        assert observed == expected
def test_chebyshev_func():
    """Check the xarray ufunc helper against SciPy's ``chebyshev``.

    Note the extra dimension injected when the xarray apply ufunc is put
    together. For a (5, 1, 2) input all data looks like:
        [[[0.34180806 0.92010143]],
         [[0.69717685 0.24012436]],
         [[0.3362796  0.08151153]],
         [[0.74861764 0.94125763]],
         [[0.25078923 0.3294995 ]]]
    the dye data is just one row of this:
        [[0.74861764 0.94125763]]
    and the returned data is:
        [[0.64183828], [0.34977852], [0.], [0.63256378], [0.06286615]]
    """
    samples = np.random.rand(5, 1, 2)
    for dye_index in (0, 1, 4):
        dye_row = samples[dye_index]
        distances = get_chebyshev_distances_xarray_ufunc(samples, dye_row)
        assert distances.shape == (5, 1)
        for observed, sample in zip(distances, samples):
            expected = chebyshev(dye_row[0], sample[0])
            assert observed == expected
def color_histogram_shot_labels(histogram,
                                WINDOW_SIZE,
                                POSITIVE_OUTLIER_THRESHOLD,
                                NEGATIVE_OUTLIER_THRESHOLD,
                                dim=3):
    """Locate shot boundaries in a sequence of per-frame histograms.

    Args:
        histogram: Iterable of frames; ``frame[j]`` is the j-th histogram.
        WINDOW_SIZE: Number of frames taken on each side of the window.
        POSITIVE_OUTLIER_THRESHOLD: Standard-deviation multiple above the
            window mean that marks a positive boundary.
        NEGATIVE_OUTLIER_THRESHOLD: Standard-deviation multiple below which a
            frame is marked as a negative boundary.
        dim: Number of histograms compared per frame.

    Returns:
        Tuple ``(positive_boundaries, negative_boundaries)`` of frame indices.
    """
    frames = list(histogram)

    # Mean Chebyshev difference between adjacent frames; frame 0 gets 0.
    frame_diffs = [0]
    for previous, current in zip(frames, frames[1:]):
        per_histogram = [
            distance.chebyshev(previous[j], current[j]) for j in range(dim)
        ]
        frame_diffs.append(np.mean(per_histogram))
    diffs = np.array(frame_diffs, dtype=float)
    n = len(diffs)

    # Simple outlier detection against the surrounding window.
    positive_boundaries, negative_boundaries = [], []
    for i in range(1, n):
        window = diffs[max(i - WINDOW_SIZE, 0):min(i + WINDOW_SIZE, n)]
        deviation = diffs[i] - np.mean(window)
        spread = np.std(window)
        if deviation > POSITIVE_OUTLIER_THRESHOLD * spread:
            positive_boundaries.append(i)
        if deviation < NEGATIVE_OUTLIER_THRESHOLD * spread:
            negative_boundaries.append(i)
    return positive_boundaries, negative_boundaries
def calculate_distance(X, Y, metric='euclidean', *, p=2, w=None, VI=None):
    """Compute the selected distance between two vectors.

    Args:
        X: First vector.
        Y: Second vector.
        metric: One of the ``METRIC_*`` constants.
        p: Order of the norm for the (weighted) Minkowski metrics.
        w: Optional weight vector for ``METRIC_WMINKOWSKI``; it is applied
            the way ``scipy.spatial.distance.minkowski`` applies ``w``.
        VI: Inverse covariance matrix, required for ``METRIC_MAHALANOBIS``.

    Returns:
        The computed value. NOTE(review): ``METRIC_COSINE`` returns the cosine
        *similarity* (dot product over the norms), not a distance; left as is
        because callers may depend on it.

    Raises:
        ValueError: If ``metric`` is unknown, or Mahalanobis is requested
            without ``VI``.
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    if metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    if metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    if metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    if metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y, p)
    if metric == METRIC_WMINKOWSKI:
        # ``wminkowski`` needs p and w (the old two-argument call always
        # failed) and is gone from current SciPy; ``minkowski`` takes weights.
        return distance.minkowski(X, Y, p, w)
    if metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    if metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    if metric == METRIC_MAHALANOBIS:
        if VI is None:
            raise ValueError(
                "the Mahalanobis distance needs the inverse covariance "
                "matrix; pass it as VI=...")
        return distance.mahalanobis(X, Y, VI)
    if metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))
    if metric == METRIC_COSINE:
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
    raise ValueError("unknown metric: %r" % (metric,))
def Chebyshev(y1, y2, verbose=False):
    """
    Chebyshev distance between 2 vectors.

    Input:
        y1 - list-like object
        y2 - list-like object
        verbose - when true, print both vectors and the distance

    Output:
        distance - largest absolute difference between matching elements

    Example:
        y1 = [1,3,5,7]
        y2 = [1,4,7,10]
        Chebyshev(y1, y2, verbose=True)
        =>
        y1: [1 3 5 7]
        y2: [ 1  4  7 10]
        Chebyshev Distance: 3
    """
    first, second = np.asarray(y1), np.asarray(y2)
    result = distance.chebyshev(first, second)
    if not verbose:
        return result
    print("y1:", first)
    print("y2:", second)
    print("Chebyshev Distance:", result)
    return result
def closest_point(point, list, matrix):
    """Pick the candidate closest to ``point`` by Chebyshev distance.

    While the best remaining candidate is at distance 1 and
    ``matrix[point[0]][point[1]]`` is below ``(threshold[j] + 1) * 255 - 1``
    (``threshold`` and ``j`` are module-level names), that candidate is
    discarded and the next closest one is considered.

    Args:
        point: Coordinates; its first two entries index ``matrix``.
        list: Candidate points. (The name shadows the builtin; it is kept
            for callers that pass it by keyword.)
        matrix: 2-D indexable of values.

    Returns:
        The selected candidate, or ``point`` itself when there are no
        candidates or every candidate was discarded.
    """
    if np.shape(list)[0] == 0:
        return point

    # Candidates and distances are kept in lock-step: dropping an entry from
    # only one of them made later indices point at the wrong candidate.
    candidates = [item for item in list]
    distances = [distance.chebyshev(point, item) for item in candidates]

    minimum = min(distances)
    index = distances.index(minimum)
    move = candidates[index]
    while minimum == 1 and matrix[point[0]][point[1]] < (threshold[j] + 1) * 255 - 1:
        distances.pop(index)
        candidates.pop(index)
        if not distances:
            break
        minimum = min(distances)
        index = distances.index(minimum)
        move = candidates[index]

    if not distances:
        return point
    return move
def test_compare_histograms(self):
    """Print several comparison scores for two shifted normal samples."""
    d1 = np.float32(np.random.normal(loc=0.0, scale=1.0, size=20000))
    d2 = np.float32(np.random.normal(loc=5.0, scale=1.0, size=20000))

    # Both histograms share one range so their bins line up.
    maxboth = max(max(d1), max(d2))
    minboth = min(min(d1), min(d2))
    shared_range = (minboth, maxboth)

    counts1, binsout = np.histogram(d1, range=shared_range, bins=40)
    hist1 = cv2.normalize(np.float32(counts1)).flatten()
    counts2, binsout = np.histogram(d2, range=shared_range, bins=40)
    hist2 = cv2.normalize(np.float32(counts2)).flatten()

    rate_fp.display_two_histograms(hist1, hist2, binsout)

    scipy_reports = (
        ('euclidean dist:', dist.euclidean),
        ('cityblock dist:', dist.cityblock),
        ('chebyshev dist:', dist.chebyshev),
    )
    for label, metric in scipy_reports:
        print(label + str(metric(hist1, hist2)))

    opencv_reports = (
        ('correlation:', 'CV_COMP_CORREL'),
        ('chisqr:', 'CV_COMP_CHISQR'),
        ('intersection:', 'CV_COMP_INTERSECT'),
        ('bhatta:', 'CV_COMP_BHATTACHARYYA'),
    )
    for label, flag_name in opencv_reports:
        # The flag is looked up right before use, one report at a time.
        method = getattr(cv2.cv, flag_name)
        print(label + str(cv2.compareHist(hist1, hist2, method)))
def kMeans(self):
    """Search for the cluster count with the best marginal gain.

    For every cluster count from ``self.minClusters`` to ``self.maxClusters``
    and every repetition, the data is clustered with ``KMeans``, centroids
    are accumulated through ``self._solve_centroid`` and the mean
    point-to-centroid distances are stored in ``self.gain``. The search stops
    as soon as ``self.bestMarginalGain`` reports ``False`` in its first slot.

    Returns:
        Tuple ``(cluster_count, labels)``.
    """
    # NOTE(review): ``n`` (repetitions) is not defined in this method and is
    # presumably a module-level name; confirm.
    for numClusters in range(self.minClusters, self.maxClusters + 1):
        self.gain[numClusters] = {}
        self.gain[numClusters]["avg"] = []
        for rep in range(n):
            clustId = numClusters
            # Parenthesised single-argument prints give the same output on
            # Python 2 and also parse on Python 3.
            print("Running on %s clusters, rep %s" % (numClusters, rep + 1))
            self.gain[clustId]["labels"] = list(
                KMeans(numClusters).fit(np.array(self.data)).labels_)
            centroids = [[0 for x in range(len(self.data[0]))]
                         for y in range(numClusters)]
            print("\tFinding Centroids")
            for index, pt in enumerate(self.data):
                cluster = self.gain[clustId]["labels"][index]
                prevCenter = centroids[cluster]
                centroids[cluster] = self._solve_centroid(pt, prevCenter)
            self.gain[clustId]["cosine"] = 0
            self.gain[clustId]["cheby"] = 0
            # NOTE(review): "euclid" is reset here but never accumulated.
            self.gain[clustId]["euclid"] = 0
            self.gain[clustId]["jaccard"] = 0
            for index, pt in enumerate(self.data):
                cluster = self.gain[clustId]["labels"][index]
                centroid = centroids[cluster]
                self.gain[clustId]["cosine"] += distance.cosine(centroid, pt) / len(self.data)
                self.gain[clustId]["cheby"] += distance.chebyshev(centroid, pt) / len(self.data)
                # NOTE(review): the "jaccard" slot accumulates the
                # correlation distance; confirm this is intended.
                self.gain[clustId]["jaccard"] += distance.correlation(centroid, pt) / len(self.data)
            marginGain = self.bestMarginalGain(clustId, rep, centroids)
            if marginGain[0] is False:
                # NOTE(review): this indexes self.gain with ``False``
                # (equal to key 0); ``marginGain[1]`` looks like the intended
                # key. Confirm against bestMarginalGain.
                return marginGain[1], self.gain[marginGain[0]]["labels"]
    print("Max clusters is best marginal gain," +
          "consider rerunning with higher max")
    return self.maxClusters, self.gain[clustId]["labels"]
def dist_chebishev(img_a, img_b):
    """Chebyshev distance between the normalised 8x8x8 colour histograms of
    two images."""

    def _colour_signature(image):
        # 3-channel histogram, 8 bins per channel over 0..256, normalised in
        # place and flattened to one vector.
        histogram = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8],
                                 [0, 256, 0, 256, 0, 256])
        return cv2.normalize(histogram, histogram).flatten()

    signature_a = _colour_signature(img_a)
    signature_b = _colour_signature(img_b)
    return dist.chebyshev(signature_a, signature_b)
def chebyshev_between_parts_and_compound(part1_vecs, part2_vecs, comp_vecs,
                                         true_class):
    """Print the Spearman correlation between ``true_class`` and the
    Chebyshev distances from each averaged part vector (``get_mean``) to the
    matching compound vector."""
    parts_mean = get_mean(part1_vecs, part2_vecs)
    chebyshevs = [
        abs(distance.chebyshev(part, compound))
        for part, compound in zip(parts_mean, comp_vecs)
    ]
    rank_correlation = spearmanr(chebyshevs, true_class)[0]
    print(rank_correlation)
def cacaulcDis(it, tgt, dir, type):
    """Write the distance from every item to ``tgt`` into a text file.

    Args:
        it: Iterable of items whose element ``[1]`` is the feature vector.
        tgt: Target feature vector.
        dir: Path of the output file (one distance per line).
        type: ``"euclidean"``, ``"cosine"``, ``"chebyshev"``, ``"manhattan"``
            or ``"sortNormEuc"`` (Euclidean distance between normalised
            vectors, written in ascending order). Any other value writes
            nothing.

    Returns:
        None.
    """

    def as_object_array(values):
        return numpy.array(values, dtype=object)

    if type == "euclidean":
        ret = [distance.euclidean(as_object_array(i[1]), as_object_array(tgt))
               for i in it]
    elif type == "cosine":
        ret = [distance.cosine(as_object_array(i[1]), as_object_array(tgt))
               for i in it]
    elif type == "chebyshev":
        ret = [distance.chebyshev(as_object_array(i[1]), as_object_array(tgt))
               for i in it]
    elif type == "manhattan":
        ret = [sum(abs(tgt[j] - i[1][j]) for j in range(len(i[1])))
               for i in it]
    elif type == "sortNormEuc":
        ret = sorted(
            distance.euclidean(normalize(as_object_array(i[1]), 1),
                               normalize(as_object_array(tgt), 1))
            for i in it)
    else:
        # Unknown types have never produced an output file.
        return

    # ``with`` closes the file even if a write fails.
    with open(dir, 'w') as f:
        f.writelines(str(value) + "\n" for value in ret)
def linear_kernel(matrix_1, matrix_2):
    """Stack four row-wise distances between two matrices.

    Returns an array with one row per input row pair and the columns cosine,
    Manhattan, Euclidean and Chebyshev distance, in that order.
    """
    cos = paired_cosine_distances(matrix_1, matrix_2)
    man = paired_manhattan_distances(matrix_1, matrix_2)
    euc = paired_euclidean_distances(matrix_1, matrix_2)
    che = np.asarray(
        [chebyshev(row_1, row_2) for row_1, row_2 in zip(matrix_1, matrix_2)])
    stacked = np.vstack((cos, man, euc, che))
    return stacked.T
def pair_coherence(self, word_i, word_j, metric=None):
    """Coherence score of a word pair.

    For ``metric`` in ``"correlation"``, ``"chebyshev"``, ``"euclidean"`` or
    ``"canberra"`` the score is one minus that distance between the two
    vectors looked up in ``self.model``; for any other value it is
    ``self.model.similarity(word_i, word_j)``.
    """
    distance_metrics = ("correlation", "chebyshev", "euclidean", "canberra")
    if metric not in distance_metrics:
        return self.model.similarity(word_i, word_j)
    # The metric name matches the function name in the distance module.
    metric_fn = getattr(distance, metric)
    return 1 - metric_fn(self.model[word_i], self.model[word_j])
def classificationChebyshev(indice):
    """Return ``(index, distance)`` of the entry among the first ten of
    ``DATA.matrice_moyenne`` that is nearest (Chebyshev distance) to the
    sample loaded with ``ldb.getData(indice)``; the first entry wins ties."""
    sample = ldb.getData(indice)
    best_index, best_distance = 0, np.inf
    for candidate in range(10):
        current = distance.chebyshev(sample, DATA.matrice_moyenne[candidate])
        if current < best_distance:
            best_index, best_distance = candidate, current
    return best_index, best_distance
def get_roc_score(X, edges_pos, edges_neg, measure):
    """Score positive and negative edges and return ROC-AUC and AP.

    Args:
        X: 2-D embedding matrix, one row per node.
        edges_pos: Iterable of ``(s, t)`` node pairs that are true edges.
        edges_neg: Iterable of ``(s, t)`` node pairs that are not edges.
        measure: ``'dot'``, ``'cosine'``, ``'hamming'``, ``'euclidean'``,
            ``'chebyshev'`` or ``'dot2'`` (dot product of the first half of
            the source row with the second half of the target row).

    Returns:
        Tuple ``(roc_score, ap_score)``.

    Raises:
        ValueError: If ``measure`` is not supported.
    """
    supported = ('dot', 'cosine', 'hamming', 'euclidean', 'chebyshev', 'dot2')
    if measure not in supported:
        raise ValueError("unsupported measure: %r" % (measure,))

    def sigmoid(x):
        # Identity on purpose: both metrics only depend on the ranking.
        return x

    d = int(X.shape[1] / 2)

    def edge_score(s, t):
        """Similarity-like score of one edge (higher = more likely)."""
        if measure == 'dot':
            return sigmoid(np.dot(X[s], X[t]))
        if measure == 'cosine':
            # Entry [0, 1] is the source/target similarity; [0, 0] is the
            # source compared with itself and is always 1.
            return cosine_similarity([X[s], X[t]])[0, 1]
        if measure == 'hamming':
            return 1 - hamming(X[s], X[t])
        if measure == 'euclidean':
            return -euclidean(X[s], X[t])
        if measure == 'chebyshev':
            return -chebyshev(X[s], X[t])
        return sigmoid(np.dot(X[s, 0:d], X[t, d:]))

    preds = [edge_score(s, t) for (s, t) in edges_pos]
    preds_neg = [edge_score(s, t) for (s, t) in edges_neg]

    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)
    return roc_score, ap_score
def feature_construct(city,
                      model_name,
                      friends,
                      walk_len=100,
                      walk_times=20,
                      num_features=128):
    '''Build the pairwise feature file from a stored embedding.

    Reads ``dataset/<city>/emb/<stem>.emb``, builds user pairs with
    ``pair_construct`` and appends one row per pair (u1, u2, label and eight
    distances between the two embedding vectors) to
    ``dataset/<city>/feature/<stem>.feature``. An existing feature file is
    removed first.

    Args:
        city: city
        model_name: 20_locid
        friends: friends list (asymmetric) [u1, u2]
        walk_len: walk length
        walk_times: walk times
        num_features: dimension for vector
    Returns:
        None
    '''
    stem = city + '_' + model_name + '_' + \
        str(int(walk_len)) + '_' + str(int(walk_times)) + '_' + \
        str(int(num_features))
    feature_path = 'dataset/' + city + '/feature/' + stem + '.feature'
    emb_path = 'dataset/' + city + '/emb/' + stem + '.emb'

    if os.path.exists(feature_path):
        os.remove(feature_path)

    emb = pd.read_csv(emb_path, header=None, skiprows=1, sep=' ')
    emb = emb.rename(columns={0: 'uid'})  # column 0 holds the node id
    emb = emb.loc[emb.uid > 0]  # only take users, no loc_type
    vector_columns = range(1, emb.shape[1])

    pair = pair_construct(emb.uid.unique(), friends)
    for i in range(len(pair)):
        u1 = pair.loc[i, 'u1']
        u2 = pair.loc[i, 'u2']
        label = pair.loc[i, 'label']
        # The row selection is a one-row DataFrame; SciPy's distances need
        # 1-D vectors, so it is flattened first.
        # NOTE(review): assumes each uid occurs once in the embedding.
        u1_vector = emb.loc[emb.uid == u1, vector_columns].values.ravel()
        u2_vector = emb.loc[emb.uid == u2, vector_columns].values.ravel()
        i_feature = pd.DataFrame([[
            u1, u2, label,
            cosine(u1_vector, u2_vector),
            euclidean(u1_vector, u2_vector),
            correlation(u1_vector, u2_vector),
            chebyshev(u1_vector, u2_vector),
            braycurtis(u1_vector, u2_vector),
            canberra(u1_vector, u2_vector),
            cityblock(u1_vector, u2_vector),
            sqeuclidean(u1_vector, u2_vector)
        ]])
        i_feature.to_csv(feature_path, index=False, header=None, mode='a')
def cheb_dist(user_predict, adoptable_dogs, images):
    '''
    Compute the Chebyshev distance between the flattened ``user_predict``
    and every flattened entry of ``adoptable_dogs``, print the largest one
    and return a DataFrame with the columns ``imgFile`` and ``SimScore``.
    '''
    sim_score = [
        distance.chebyshev(user_predict.flatten(),
                           adoptable_dogs[idx].flatten())
        for idx in range(len(adoptable_dogs))
    ]
    print('Maximum SimScore: ' + str(max(sim_score)))
    scores = {'imgFile': images, 'SimScore': sim_score}
    return pd.DataFrame(scores)
def similarity_metrics(vec1, vec2, med='all'):
    """
    Function that computes the similarity/distance between two vectors

    Parameters
    ----------
    vec1 : list numpy array
        the first vector
    vec2 : list numpy array
        the second vector
    med : string
        the metric that will be computed (case-insensitive)
        Minkowski and Standard Measures
            Euclidean Distance : 'ED'
            Cityblock Distance : 'CD'
            Infinity Distance : 'ID'
            Cosine Similarity : 'CS'
        Statistical Measures
            Pearson Correlation Coefficient : 'PCC'
            Chi-Square Dissimilarity : 'CSD'
            Kullback-Liebler Divergence : 'KLD'
            Jeffrey Divergence : 'JD'
            Kolmogorov-Smirnov Divergence : 'KSD'
            Cramer-von Mises Divergence : 'CMD'

    Returns
    -------
    similarity/distance : float
        the similarity/distance between the two vectors; 0 when ``med`` is
        not one of the codes above (including the default 'all')
    """
    # The codes are documented in upper case but used to be matched in lower
    # case only, so both spellings are accepted.
    code = med.lower() if isinstance(med, str) else med

    distance = 0
    if code == 'ed':
        distance = euclidean(vec1, vec2)
    elif code == 'cd':
        distance = cityblock(vec1, vec2)
    elif code == 'id':
        distance = chebyshev(vec1, vec2)
    elif code == 'cs':
        # NOTE(review): calls ``cosine``; if that is SciPy's function this
        # is the cosine *distance*, not the similarity. Confirm.
        distance = cosine(vec1, vec2)
    elif code == 'pcc':
        distance = dist_pearson(vec1, vec2)
    elif code == 'csd':
        distance = chisquare(vec1, vec2)[0]
    elif code == 'kld':
        distance = entropy(vec1, vec2)
    elif code == 'jd':
        distance = dist_jeffrey(vec1, vec2)
    elif code == 'ksd':
        distance = ks_2samp(vec1, vec2)[0]
    elif code == 'cmd':
        distance = dist_cvm(vec1, vec2)
    return distance
def distance(self, x, y):
    """Distance between ``x`` and ``y`` under ``self.metric``.

    Args:
        x: First vector (NumPy array for the ``'euclidean'`` metric).
        y: Second vector.

    Returns:
        The distance for ``'euclidean'``, ``'chebyshev'``, ``'manhetten'``
        (historical spelling; ``'manhattan'`` is accepted as well) or
        ``'minkowski'`` (order 2).

    Raises:
        ValueError: If ``self.metric`` is none of the above.
    """
    # Imported under a private alias so the lookup cannot be captured by
    # another object named ``distance`` (this method carries that name).
    from scipy.spatial import distance as _sp_distance

    metric = self.metric
    if metric == 'euclidean':
        return np.sqrt(np.sum((x - y) ** 2))
    if metric == 'chebyshev':
        return _sp_distance.chebyshev(x, y)
    if metric in ('manhetten', 'manhattan'):
        return _sp_distance.cityblock(x, y)
    if metric == 'minkowski':
        return _sp_distance.minkowski(x, y, 2)
    # Returning None here would only fail later, far from the cause.
    raise ValueError("unsupported metric: %r" % (metric,))
def chebyshev(self, x=None, y=None, w=None):
    """
    Chebyshev distance between two vectors.

    The Chebyshev distance is a metric on a vector space in which the
    distance between two points is the largest absolute difference between
    their coordinates. Mathematically it is the metric induced by the uniform
    norm (also called the supremum norm), and it is an example of an
    injective (hyperconvex) metric space.

    Example inputs:
        x = [1, 2, 0]
        y = [0, 1, 0]

    Any argument left as ``None`` falls back to the attribute of the same
    name on ``self``.

    :param x: first vector, defaults to ``self.x``
    :param y: second vector, defaults to ``self.y``
    :param w: optional weights, defaults to ``self.w``
    :return: the value of ``distance.chebyshev(x, y, w)``
    """
    # Explicit None checks: ``x or self.x`` raises for NumPy arrays and
    # silently replaces falsy but valid arguments.
    if x is None:
        x = self.x
    if y is None:
        y = self.y
    if w is None:
        w = self.w
    return distance.chebyshev(x, y, w)
def calc_chebyshev(query_vec, num_of_docs):
    """Return the ``num_of_docs`` rows of the module-level ``data`` frame
    with the largest Chebyshev distance between ``query_vec.toarray()`` and
    the row's ``'text'`` value (the source comment reads "bigger better")."""
    vec_distances = [
        chebyshev(query_vec.toarray(), row['text'])
        for _, row in data.iterrows()
    ]

    ranked = data.copy()
    ranked['chebyshev'] = vec_distances
    # sort_values defaults to ascending order, hence the explicit flag.
    ranked = ranked.sort_values(by=['chebyshev'], ascending=False)
    ranked = ranked.head(num_of_docs)
    ranked.drop('chebyshev', axis=1, inplace=True)
    return ranked
def cross_channel_distance_features(image):
    """Compute distance features between every pair of image channels.

    Parameters
    ----------
    image : 3D array, shape (M, N, C)
        The input image with multiple channels.

    Returns
    -------
    features : dict
        Maps ``"<metric>_distance_Ch<i>_Ch<j>"`` (1-based channel numbers,
        i < j) to the distance between the two flattened channels.
    """
    # Each name is both the key prefix and the function name in ``dist``.
    metric_names = (
        "braycurtis",
        "canberra",
        "chebyshev",
        "cityblock",
        "correlation",
        "cosine",
        "euclidean",
        "jensenshannon",
        "minkowski",
        "sqeuclidean",
    )
    n_channels = image.shape[2]

    features = dict()
    for ch1 in range(n_channels):
        for ch2 in range(ch1 + 1, n_channels):
            # Flatten both channels to 1-D vectors.
            channel1 = image[:, :, ch1].ravel()
            channel2 = image[:, :, ch2].ravel()
            suffix = "_Ch" + str(ch1 + 1) + "_Ch" + str(ch2 + 1)
            for metric_name in metric_names:
                metric_fn = getattr(dist, metric_name)
                features[metric_name + "_distance" + suffix] = metric_fn(
                    channel1, channel2)
    return features
def get_hist_diffs(hists):
    """Chebyshev distance between each histogram and its predecessor.

    An all-zero histogram is placed before the first one, so the result has
    exactly one entry per input histogram.
    """
    rows = hists.tolist()
    zero_row = [0] * len(rows[0])
    padded = [zero_row] + rows
    return np.array([
        distance.chebyshev(previous, current)
        for previous, current in zip(padded, padded[1:])
    ])
def gen_dist_feats(vect_img_1, vect_img_2):
    """Build a 12-entry feature list for two vectors: eight distances
    (euclidean, braycurtis, canberra, chebyshev, cityblock, cosine,
    jensenshannon, minkowski), then skew and kurtosis of each vector with
    NaNs replaced through ``np.nan_to_num``."""
    pairwise_metrics = (
        distance.euclidean,
        distance.braycurtis,
        distance.canberra,
        distance.chebyshev,
        distance.cityblock,
        distance.cosine,
        distance.jensenshannon,
        distance.minkowski,
    )
    feats = [metric(vect_img_1, vect_img_2) for metric in pairwise_metrics]
    for statistic in (skew, kurtosis):
        for vector in (vect_img_1, vect_img_2):
            feats.append(statistic(np.nan_to_num(vector)))
    return feats
def get_dist_preds(self, predictions, metric):
    """Combine predictions, weighted by their distance to one another.

    Each prediction gets an aggregated distance: the sum of its distances to
    all other predictions. With ``self.sdhw`` set, predictions with smaller
    aggregated distance receive the larger weights; otherwise the weight is
    proportional to the aggregated distance itself.

    Args:
        predictions: Sequence of predictions that support ``*`` and ``+``.
        metric: ``'euclid'``, ``'cosine'``, ``'jaccard'``, ``'chebyshev'``,
            ``'correlation'``, ``'cityblock'``, ``'canberra'``,
            ``'braycurtis'``, ``'hamming'`` or ``'battacharyya'``.

    Returns:
        The weighted sum of the predictions. When every aggregated distance
        is 0 (one prediction, or all identical) the plain average is used.

    Raises:
        ValueError: If ``metric`` is not supported.
    """
    scipy_metrics = {
        'euclid': euclidean,
        'cosine': cosine,
        'jaccard': jaccard,  # meant for boolean vectors
        'chebyshev': chebyshev,
        'correlation': correlation,
        'cityblock': cityblock,
        'canberra': canberra,
        'braycurtis': braycurtis,
        'hamming': hamming,  # meant for boolean vectors
    }
    if metric == 'battacharyya':
        def pair_distance(other, pred):
            return DistanceMetrics.battacharyya(other, pred,
                                                method='continuous')
    elif metric in scipy_metrics:
        pair_distance = scipy_metrics[metric]
    else:
        raise ValueError("unsupported metric: %r" % (metric,))

    # (prediction, aggregated distance to every other prediction)
    new_preds = []
    for j, pred in enumerate(predictions):
        distances = [
            pair_distance(other, pred)
            for position, other in enumerate(predictions) if position != j
        ]
        new_preds.append((pred, sum(distances)))

    weights = [weight for _, weight in new_preds]
    W = sum(weights)  # total weight

    if W == 0:
        # Nothing to normalise by: dividing would fail, so average equally.
        share = 1.0 / len(new_preds) if new_preds else 0
        return sum([pred * share for pred, _ in new_preds])

    if self.sdhw:
        # Smaller aggregated distance -> higher weight: pair the predictions
        # sorted by ascending distance with the weights sorted descending.
        preds_ascending_dist = sorted(new_preds, key=lambda x: x[1])
        weights_descending = sorted(weights, reverse=True)
        return sum([
            pred_tup[0] * (weights_descending[k] / W)
            for k, pred_tup in enumerate(preds_ascending_dist)
        ])

    # Smaller aggregated distance -> lower weight.
    return sum([pred_tup[0] * (pred_tup[1] / W) for pred_tup in new_preds])
def calculate_histogram_overlap(same_distances_arrays,
                                different_distances_arrays,
                                bins=50):
    '''
    Compare the distribution of same-item distances with the distribution of
    cross-item distances.
    see http://www.pyimagesearch.com/2014/07/14/3-ways-compare-histograms-using-opencv-python/#comment-322678
    :param same_distances_arrays: array of distance arrays for same items
    :param different_distances_arrays: array of distance arrays for different items
    :param bins: number of histogram bins
    :return: dictionary of various distance measures. watch out, some increase w. similarity, others decrease
    '''
    # Flatten the arrays of arrays into two plain lists.
    totsame = [
        val for same_distances in same_distances_arrays
        for val in same_distances
    ]
    totdiff = [
        val for different_distances in different_distances_arrays
        for val in different_distances
    ]

    counts_same, binsout = np.histogram(totsame, bins=bins)
    hist1 = cv2.normalize(np.float32(counts_same)).flatten()
    counts_diff, binsout = np.histogram(totdiff, bins=bins)
    hist2 = cv2.normalize(np.float32(counts_diff)).flatten()

    same_item_average = np.mean(totsame)
    cross_item_average = np.mean(totdiff)
    same_item_error = np.std(totsame)
    cross_item_error = np.std(totdiff)

    # Separation of the two means in units of the combined spread.
    numerator = cross_item_average - same_item_average
    combined_error = np.sqrt(same_item_error ** 2 + cross_item_error ** 2)
    mychi = numerator / combined_error

    print('same avg {0} same var {1} '.format(same_item_average, same_item_error))
    print('cross avg {0} cross var {1} '.format(cross_item_average, cross_item_error))

    results = {}
    results["Correlation"] = float(
        cv2.compareHist(hist1, hist2, cv2.cv.CV_COMP_CORREL))
    results["Chi-Squared"] = float(
        cv2.compareHist(hist1, hist2, cv2.cv.CV_COMP_CHISQR))
    results["Intersection"] = float(
        cv2.compareHist(hist1, hist2, cv2.cv.CV_COMP_INTERSECT))
    results["Bhattacharyya"] = float(
        cv2.compareHist(hist1, hist2, cv2.cv.CV_COMP_BHATTACHARYYA))
    results["Euclidean"] = float(dist.euclidean(hist1, hist2))
    results["Manhattan"] = float(dist.cityblock(hist1, hist2))
    results["Chebysev"] = float(dist.chebyshev(hist1, hist2))
    results["mychi"] = float(mychi)
    return results
def compHist(hist1, hist2, method, formula):
    """Compare two histograms with given method and formula.

    Parameters
    ----------
    hist1 : 1D array
        The first histogram
    hist2 : 1D array
        The second histogram
    method : str
        Options for method ('cv_comp', 'scipy_comp', 'kl_div')
    formula : str or cv integer
        Options for formula.
        For method == 'cv_comp' (cv.CV_COMP_CORREL, cv.CV_COMP_CHISQR,
        cv.CV_COMP_INTERSECT, cv.CV_COMP_BHATTACHARYYA)
        For method == 'scipy_comp' ("Euclidean", "Manhattan", "Chebysev")
        Ignored for method == 'kl_div'.

    Returns
    -------
    dis : float
        The dissimilarity. For the correlation formula the OpenCV score is
        turned into ``1 - score``; for 'kl_div' it is the mean of the two
        directed KL sums computed on the histograms plus one.

    Raises
    ------
    ValueError
        If the method, or the formula of 'scipy_comp', is not supported.
    """
    # using opencv
    if method == 'cv_comp':
        dis = cv2.compareHist(np.float32(hist1), np.float32(hist2), formula)
        if formula == cv.CV_COMP_CORREL:
            dis = -dis + 1
        return dis

    # using Scipy distance metrics
    if method == 'scipy_comp':
        if formula == 'Euclidean':
            return dist.euclidean(hist1, hist2)
        if formula == 'Manhattan':
            return dist.cityblock(hist1, hist2)
        if formula == 'Chebysev':
            return dist.chebyshev(hist1, hist2)
        raise ValueError("unsupported scipy_comp formula: %r" % (formula,))

    # using KL divergence
    if method == 'kl_div':
        # Adding one keeps empty bins out of the logarithm and the division.
        shifted1 = np.float32(hist1) + 1
        shifted2 = np.float32(hist2) + 1
        kbp = np.sum(shifted1 * np.log(shifted1 / shifted2), 0)
        kbq = np.sum(shifted2 * np.log(shifted2 / shifted1), 0)
        return np.double(kbp + kbq) / 2

    raise ValueError("unsupported method: %r" % (method,))
def assignPoints(self, centroids):
    """Assign every row of ``self.df`` to its closest centroid.

    For each index label of ``self.df`` the distance to every label in
    ``centroids.index`` is computed with the metric named by ``self.metric``
    ("euclidean", "chebyshev" or "cityblock"), and the label with the
    smallest distance is stored for that row.

    NOTE(review): both operands are read from ``self.df`` (``self.df.loc[c]``
    and ``self.df.loc[i]``), so the centroid labels are presumably row labels
    of ``self.df`` - confirm against the caller.

    Returns:
        Tuple ``(assignedCentroids, changed)``. ``changed`` is never set to
        True in this method; the code that would update it is commented out.
    """
    # centroids
    # l = len(centroids)
    # for i in range(0,l):
    #     centroids[i]+=random.random()
    #set to True when there is a change in assigning points to clusters(centroids)
    changed = False
    assignedCentroids = pd.DataFrame()
    for i in self.df.index:
        distances = {}
        for c in centroids.index:
            # NOTE(review): with any other metric ``x`` is never assigned in
            # this pass (NameError on the first pass, stale value later).
            if self.metric == "euclidean":
                x = self.myEuclidean(self.df.loc[c], self.df.loc[i])
            if self.metric == "chebyshev":
                x = chebyshev(self.df.loc[c], self.df.loc[i])
            if self.metric == "cityblock":
                x = cityblock(self.df.loc[c], self.df.loc[i])
            # print"i: "+str(i)+" c: "+str(c)
            # print "self.df.loc[i]: "+ str(self.df.loc[i])
            # print "self.df.loc[c]: "+ str(self.df.loc[c])
            # print "x: "+str(x)
            #dictionary that stores centroid as a key and distance between point and centroid as a value
            distances[c] = x
        # find the minimum by comparing the second element of each tuple (values)
        m=min(distances.items(), key=lambda x: x[1])
        #m[0] is a key of a min value in a dictionary, so m[0] is centroid
        # point i 'belongs' to centroid m[0]
        # if not assignedCentroids.at[i,'centroids']==m[0]:
        #     #if centroid is changed
        #     changed=True
        # changed=True
        # assignedCentroids.at[i,'centroids']=m[0]
        # NOTE(review): ``.at`` is used with a row label only, on a frame that
        # starts without columns; the commented line above names a
        # 'centroids' column. Confirm this stores what callers expect.
        assignedCentroids.at[i]=m[0]
    # print "centroidyyyyyyyyyy"
    # print assignedCentroids
    return (assignedCentroids, changed)
def _D(self,x,y,metric): """ Calculates the distance between x and y according to metric 'metric' Parameters ---------- x : numpy array 1-d vector of dimension d y : numpy array 1-d vector of dimension d metric: str specify the metric used (default euclidian metric) Returns ------- D(x | y) : Distance between x and y according to metric """ if metric == 'euclid' or metric == 'Euclid': return np.linalg.norm(x-y) if metric == 'kolmogorov' or metric == 'Kolmogorov': #check normalization norm_x = np.around(np.linalg.norm(x),decimals=10) norm_y = np.around(np.linalg.norm(y),decimals=10) if norm_x == 1 and norm_y == 1: return np.sqrt(1 - np.around(np.absolute(np.dot(x,y))),decimals=10) else: raise NameError('%s metric supports only normalized vectors' % metric) if metric == 'chebyshev' or metric == 'Chebyshev': return ssd.chebyshev(x,y) else: raise NameError('%s metric not supported' % metric)
def fit(self):
    """Scan all samples and collect a set of endmembers.

    The data is standardized once (cached in ``self._data_z_``), a random
    sample seeds the lattice independent sources (LIS), and every sample is
    then tested against the current LIS, either replacing an existing
    source or being appended as a new one.

    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks and indentation were lost; confirm block boundaries (in
    particular the two ``continue`` statements) against the original file.

    Returns
    -------
    tuple
        ``(self.em_, self.cnt_)``: the endmembers as an array, and a
        per-sample vector in which selected samples are set to 1.
    """
    from numpy.linalg import norm
    from scipy.spatial.distance import chebyshev

    # Standardize only once; later calls reuse self._data_z_.
    if not self._is_standardized:
        self._data_z_ = _standardize_with_scaler(self._data)
        self._is_standardized = True

    n_samps, n_feats = self.shape()

    em = [] #endmembers
    cnt = np.zeros(n_samps)

    # Initial LIS
    lis = [] #lattice independent sources
    idx = np.random.randint(0, n_samps)
    p = 1 #number of current endmembers

    # Algorithm Initialization
    # Initialize endmembers set and index vector
    cnt[idx] = 1
    samp = self._data_z_[idx, :]
    lis.append(samp)
    is_new_lis = True

    #data signs
    signs = []
    signs.append(np.sign(samp))

    #saving endmembers
    em.append(self._data[idx, :])

    #indicates which samples are identified as endmembers
    idxs = []
    idxs.append(idx)

    #Run over each sample
    for i in range(n_samps):
        #check for LAAM recalculation
        if is_new_lis:
            #recalculate LAAM
            laam = LAM(lis)
            # NOTE(review): mxx is not read again in this method.
            wxx, mxx = laam.fit()
            is_new_lis = False

        #sample
        # NOTE(review): the seed sample above was taken from self._data_z_,
        # but the loop reads self._data - confirm which one is intended.
        samp = self._data[i, :]
        samp_sign = np.sign(samp)

        # Samples that are all zeros are skipped entirely.
        if np.sum(np.abs(samp)) > 0:
            if self._alpha <= 0:
                #check if pixel is lattice dependent
                #y = np.zeros(n_feats)
                #vector version
                samps = np.tile(samp, (n_feats, 1))
                # NOTE(review): y is computed but never used afterwards.
                y = np.max(wxx + samps, axis=1)
                #for loop version
                #for j in range(n_feats):
                #    y[j] = np.max(wxx[:,j] + samp)

                #find the most similar and check the norms
                # "Most similar" = the source sharing the largest number of
                # component signs with the sample.
                sum_signs = 0
                selected_em = 0
                for e in range(p):
                    asigns = np.array(signs)
                    sum_signs_em = np.sum(asigns[e, :] == samp_sign)
                    if sum_signs_em > sum_signs:
                        sum_signs = sum_signs_em
                        selected_em = e

                alis = np.array(lis)
                if norm(alis[selected_em, :]) < norm(samp):
                    #substitute lis
                    # NOTE(review): this sets `new_lis`, which is never read;
                    # the flag tested at the top of the loop is
                    # `is_new_lis` (the branch below sets that one).
                    new_lis = True
                    cnt  [i] = 1
                    # NOTE(review): `idx` is the scalar returned by
                    # np.random.randint above, while `idxs` is the list of
                    # selected indices - these two lines look like they were
                    # meant to use `idxs`.
                    cnt  [idx[selected_em]] = 0
                    idx  [selected_em] = i
                    lis  [selected_em] = samp
                    signs[selected_em] = np.sign(samp)
                    em   [selected_em] = self._data[i, :]
                continue
            #end if self._alpha <= 0
            else:
                # Chebyshev-Best approximation
                x_sharp = np.zeros(n_feats)
                wxx_conj = -wxx
                for j in range(n_feats):
                    x_sharp[j] = np.min(wxx_conj[:, j] + samp)
                # NOTE(review): `j` here is whatever value the loop above
                # left behind; confirm whether these two lines belonged
                # inside that loop.
                mu = np.max(wxx[:, j] + x_sharp)
                mu = np.max(mu + samp)/2

                c1 = np.zeros(n_feats)
                for j in range(n_feats):
                    c1[j] = np.max(wxx[:, j] + mu + x_sharp)

                # A sample closer than alpha (Chebyshev distance) to its
                # approximation c1 can only replace an existing source.
                if chebyshev(c1, samp) < self._alpha:
                    #find the most similar and check the norms
                    sum_signs = 0
                    selected_em = 0
                    for e in range(p):
                        asigns = np.array(signs)
                        sum_signs_em = np.sum(asigns[e, :] == samp_sign)
                        if sum_signs_em > sum_signs:
                            sum_signs = sum_signs_em
                            selected_em = e

                    alis = np.array(lis)
                    # NOTE(review): the comparison sits inside norm() here,
                    # unlike the alpha <= 0 branch, which compares two norms.
                    if norm(alis[selected_em, :] < norm(samp)):
                        # substitute LIS
                        is_new_lis = True
                        cnt  [i] = 1
                        # NOTE(review): same `idx` vs `idxs` concern as in
                        # the alpha <= 0 branch.
                        cnt  [idx[selected_em]] = 0
                        idx  [selected_em] = i
                        lis  [selected_em] = samp
                        signs[selected_em] = np.sign(samp)
                        em   [selected_em] = self._data[i, :]
                    continue

            #Max-Min dominance
            # For each candidate vector (the p sources plus the sample),
            # count in how many pairwise differences each component attains
            # the maximum (s1) or the minimum (s2).
            mu1 = 0
            mu2 = 0
            for j in range(1, p+2):
                s1 = np.zeros(n_feats)
                s2 = np.zeros(n_feats)
                for k in range(1, p+2):
                    if j != k:
                        # NOTE(review): j and k run over 1..p+1 but index
                        # the 0-based list `lis` (length p) directly;
                        # lis[0] is never read and lis[p] is out of range -
                        # verify the intended offset.
                        if j == p+1: vi = samp
                        else:        vi = lis[j]
                        if k == p+1: vk = samp
                        else:        vk = lis[k]

                        d  = vi - vk
                        m1 = np.max(d)
                        m2 = np.min(d)
                        s1 = s1 + (d == m1)
                        s2 = s2 + (d == m2)

                mu1 = mu1 + (np.max(s1) == p)
                mu2 = mu2 + (np.max(s2) == p)

            # The sample becomes a new source only when the dominance test
            # holds for all p + 1 candidate vectors.
            if mu1 == (p+1) or mu2 == (p+1):
                #new lis
                # NOTE(review): is_new_lis is not set here, so the LAM is
                # not recomputed after a source is appended - confirm.
                p += 1
                cnt[i] = 1
                # NOTE(review): appends the literal 1, not the index i.
                idxs.append(1)
                lis.append(samp)
                signs.append(samp_sign)
                em.append(self._data[i, :])

    self.em_  = np.array(em)
    self.cnt_ = cnt

    return self.em_, self.cnt_
#similarity.py
# Demo: compare two random 4-d vectors under three distance metrics.
from scipy.spatial import distance as dist
import numpy as np

# Fixed seed so the two vectors (and the printed distances) are reproducible.
np.random.seed(42)

x = np.random.rand(4)
y = np.random.rand(4)

# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the print statement used originally.
print(x)
print(y)

#Euclidean distance
print("Euclidean Distance {0}".format(dist.euclidean(x,y)))
#City-Block distance
print("City-block Distance {0}".format(dist.cityblock(x,y)))
#Chebyshev distance
print("Chebyshev Distance {0}".format(dist.chebyshev(x,y)))
overlappingVectorAnotherUser.append(userProfileMatrix[anotherUser][1]) if len(overlappingVectorUser) != 0: if distanceMetric == 1: # The first distance metric : Euclidean comparisonDump[user][anotherUser] = (len(overlappingVectorUser) / float(totalMovies - 1)) * (1.0 / (1 + dist.euclidean(overlappingVectorUser, overlappingVectorAnotherUser))) elif distanceMetric == 2: # The second distance metric : Manhattan # The caveat here is that cityblock distance is an int and hence the numerator should be 1.0 and not 1 # Observation: This metric doesn't involve squaring and square root and thus is faster comparisonDump[user][anotherUser] = (len(overlappingVectorUser) / float(totalMovies - 1)) * (1.0 / (1 + dist.cityblock(overlappingVectorUser, overlappingVectorAnotherUser))) elif distanceMetric == 3: # The third distance metric : Chebyshev comparisonDump[user][anotherUser] = (len(overlappingVectorUser) / float(totalMovies - 1)) * (1.0 / (1 + dist.chebyshev(overlappingVectorUser, overlappingVectorAnotherUser))) print "Distance amongst %d and others computed" % (user) nearestNeighbours[user] = [] # Get the top k nearest neighbours for nearest in hq.nlargest(kNeighbours, comparisonDump[user]): nearestNeighbours[user].append([comparisonDump[user].index(nearest), nearest]) # o.write(str(nearestNeighbours[user]) + "\n") ############################################################################################################ # Building suggestions based on k nearest neighbours
def scipy_chebyshev(h1, h2, **kwargs):
    """Return the negated Chebyshev distance between ``h1`` and ``h2``.

    Extra keyword arguments are accepted and ignored.
    """
    gap = sci_dist.chebyshev(h1, h2)
    return -gap
def are_similar(self, first, second):
    """Return the Chebyshev distance between ``first`` and ``second``.

    Despite the predicate-like name, the raw distance is returned rather
    than a boolean.
    """
    separation = dist.chebyshev(first, second)
    return separation
__author__ = 'jheaton'

import os
import sys
from scipy.spatial import distance

# Locate the AIFH core files (../lib/aifh relative to this script) and make
# them importable.
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(os.sep.join([aifh_dir, "..", "lib", "aifh"]))
sys.path.append(aifh_dir)

# Three sample positions.
pos1 = [1.0, 2.0, 3.0]
pos2 = [4.0, 5.0, 6.0]
pos3 = [7.0, 8.0, 9.0]

# Every (label, start, end) pair that is measured under each metric.
_PAIRS = (
    ("pos1->pos2: ", pos1, pos2),
    ("pos2->pos3: ", pos2, pos3),
    ("pos3->pos1: ", pos3, pos1),
)

# Heading text and distance function for each of the three metrics.
_METRICS = (
    ("Euclidean Distance", distance.euclidean),
    ("\nManhattan (city block) Distance\n", distance.cityblock),
    ("\nChebyshev Distance\n", distance.chebyshev),
)

# Print a heading per metric followed by one line per pair of positions.
for _heading, _metric in _METRICS:
    print(_heading)
    for _label, _start, _end in _PAIRS:
        print(_label + str(_metric(_start, _end)))
def chebyshev(pair):
    """Return the Chebyshev distance between the two vectors in ``pair``.

    ``pair`` is a 2-item sequence ``(x, y)``, passed as a single argument
    exactly as before. It is unpacked inside the body because tuple
    parameters in the signature are a syntax error on Python 3.
    """
    x, y = pair
    return distance.chebyshev(x, y)
def test_distance_linf():
    """Check that the L-infinity distance from (0, 0) to (1, 1) is 1."""
    origin = [0, 0]
    corner = [1, 1]
    expected = 1
    measured = chebyshev(origin, corner)
    assert_almost_equal(measured, expected)
def metrykaCzebyszewa(self,array1, array2):
    """Return the Chebyshev distance between ``array1`` and ``array2``."""
    result = chebyshev(array1, array2)
    return result
#print("[+] Matrix in use is: \n", x_matrix)
#print("===================================================================")

# For every sample, find the Chebyshev distance to its nearest and farthest
# other sample, then the ratio nearest / farthest.
# temp_min / temp_max hold the running extremes. They start at +inf and -1
# so the first real distance always replaces them (np.inf replaces the
# Python 2-only sys.maxint sentinel).
temp_max = np.zeros(sample_size)
temp_min = np.zeros(sample_size)
min_array = np.zeros(sample_size)
max_array = np.zeros(sample_size)
ratios = np.zeros(sample_size)

for i in range(sample_size):
    temp_min[i] = np.inf
    temp_max[i] = -1

for i in range(sample_size):
    for j in range(sample_size):
        if i != j:
            # Compute each pairwise distance once; the original evaluated
            # the same call up to four times per pair.
            pair_dist = dist.chebyshev(x_matrix[i],x_matrix[j])
            if (pair_dist < temp_min[i]):
                min_array[i] = pair_dist
                temp_min[i] = min_array[i]
            if (pair_dist > temp_max[i]):
                max_array[i] = pair_dist
                temp_max[i] = max_array[i]

# NOTE(review): max_array[i] stays 0 when a sample has no distinct
# neighbour (single sample, or all samples identical), so this division
# yields nan/inf in that case.
for i in range(sample_size):
    ratios[i] = min_array[i]/max_array[i]

#print("[+] Min distances are: \n", min_array)
#print("===================================================================")
#print("[+] Max distances are: \n", max_array)
#print("===================================================================")
#print("[+] Ratios are: \n", ratios)
# Single-argument print(...) works on both Python 2 and Python 3.
print("*******************************************************************")
def l1_and_lmax_err(guess):
    """Return ``(L1, L-max)`` errors of ``guess`` against ``topics``.

    ``topics`` is read from the enclosing scope. The first element is the
    city-block distance, the second the Chebyshev distance.
    """
    l1_err = distance.cityblock(topics, guess)
    lmax_err = distance.chebyshev(topics, guess)
    return (l1_err, lmax_err)
def chebyshev_distance(a,b):
    """Return the Chebyshev distance between ``a`` and ``b``."""
    largest_gap = distance.chebyshev(a, b)
    return largest_gap
def wvCheb(a):
    """Return the Chebyshev distance for every pair in ``a``.

    Each element of ``a`` is indexed at positions 0 and 1; the result is a
    list with one distance per element, in the same order.
    """
    distances = []
    for pair in a:
        distances.append(distance.chebyshev(pair[0], pair[1]))
    return distances
def chebyshev_distance(a,b):
    """Print the Chebyshev distance between ``a`` and ``b``.

    Nothing is returned; the distance is only written to standard output.
    Single-argument ``print(...)`` is used so the function works on both
    Python 2 and Python 3.
    """
    print(distance.chebyshev(a, b))