def setUpMod(self, mode='rnd'):
    """Populate the fixture attributes used by the MP tests.

    ``mode='rnd'`` draws seeded random vectors and labels and stores their
    Euclidean distance matrix rescaled into [0, 1).  ``mode='toy'`` stores a
    hand-made 5x5 distance matrix together with the expected MP distances
    and sets ``vector``/``label`` to ``None``.  Any other mode only reseeds
    NumPy's global random generator.
    """
    np.random.seed(626)
    if mode == 'toy':
        toy_condensed = [.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]
        # MP empiric ground truth calculated by hand for this toy example,
        # using MP with div/(n-0).
        # Alternative ground truths kept for reference (condensed form):
        #   MP with div/(n-1): [.5, .25, 1., .75, .5, .75, 1., 1., .75, 1.]
        #   MP with div/(n-2): [1/3, 0., 1., 2/3, 1/3, 2/3, 1., 1., 2/3, 1.]
        truth_condensed = [.6, .4, 1., .8, .6, .8, 1., 1., .8, 1.]
        self.dist = squareform(toy_condensed)
        self.mp_dist_truth = squareform(truth_condensed)
        self.vector = None
        self.label = None
    elif mode == 'rnd':
        n_points, n_dims = 50, 500
        self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
        self.label = np.random.randint(0, 5, n_points)
        self.dist = euclidean_distance(self.vector)
        # scale to [0, 1), avoiding 1: otherwise sparseMP != denseMP (by design)
        self.dist /= (self.dist.max() + 1e-12)
def test_hubness_return_values_are_self_consistent(self):
    """Test that the three values returned by ``hubness`` fit together.

    The k-occurrence and the skewness are recomputed here with methods that
    differ from the module's, then compared with the returned values.  Each
    property is asserted separately so that a failure names the value that
    is inconsistent.
    """
    np.random.seed(626)
    points = 200
    dim = 500
    vector = 99. * (np.random.rand(points, dim) - 0.5)
    dist = euclidean_distance(vector)
    k = 10
    Sk10, Dk10, Nk10 = hubness(dist, k=k)

    # Dk is just checked for correct shape
    self.assertEqual(Dk10.shape, (points, k))

    # Count k-occurrence (different method than in module)
    flat_neighbors = Dk10.ravel()
    Nk10_true = np.array(
        [(flat_neighbors == i).sum() for i in range(points)], dtype=int)
    np.testing.assert_array_equal(Nk10, Nk10_true)

    # Calculate skewness (different method than in module)
    x0 = Nk10 - Nk10.mean()
    s2 = (x0**2).mean()
    m3 = (x0**3).mean()
    Sk10_true = m3 / (s2**1.5)
    # Two independent floating-point computations of the same quantity may
    # differ in the last bits, so compare with a tolerance instead of ``==``.
    self.assertAlmostEqual(Sk10, Sk10_true, places=10)
def setUp(self):
    """Create random vectors, labels and their Euclidean distance matrix.

    Also stores ``SEC_DIST``, the set of secondary-distance identifiers the
    tests refer to.  The random generator is not seeded here.
    """
    n_points, n_dims = 100, 10
    self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
    self.label = np.random.randint(0, 5, n_points)
    self.dist = euclidean_distance(self.vector)
    self.SEC_DIST = {
        'mp', 'mp_gaussi', 'mp_gammai',
        'ls', 'nicdm', 'snn',
        'cent', 'wcent', 'lcent',
        'dsg', 'dsl', 'orig',
    }
def calculate_AUC(X, embed, k_high=20, k_low_max=100):
    """Return the area under the averaged precision-recall curve.

    For every sample, the ``k_high`` nearest neighbours in ``X`` are taken as
    the relevant set.  For each neighbourhood size 1..``k_low_max`` in
    ``embed``, precision and recall of the retrieved neighbours are computed,
    averaged over all samples, and integrated with ``metrics.auc``.

    Parameters
    ----------
    X : array with one row per sample (the high-dimensional data).
    embed : array with one row per sample (the low-dimensional embedding).
    k_high : number of neighbours in ``X`` that count as relevant.
    k_low_max : largest neighbourhood size evaluated in ``embed``.

    Returns
    -------
    The value returned by ``metrics.auc(average_recall, average_precision)``.
    """
    n = X.shape[0]

    D_X = euclidean_distance(X)
    D_embed = euclidean_distance(embed)
    # Per the original note, sort_D yields ascending neighbour lists with the
    # zero self-distance removed - NOTE(review): confirm against sort_D.
    _, high_idx = sort_D(D_X, k=k_high)
    _, low_idx = sort_D(D_embed, k=k_low_max)

    # One row per sample, one column per low-dimensional neighbourhood size.
    n_precision = np.zeros((n, k_low_max), dtype=float)
    n_recall = np.zeros((n, k_low_max), dtype=float)

    for i in range(n):
        for j in range(k_low_max):
            # True positives: relevant neighbours among the first j+1
            # neighbours retrieved in the embedding.
            n_hits = len(np.intersect1d(high_idx[i, :], low_idx[i, :j + 1]))
            n_precision[i, j] = n_hits / (j + 1.)
            n_recall[i, j] = n_hits / float(k_high)

    average_precision = np.mean(n_precision, axis=0)
    average_recall = np.mean(n_recall, axis=0)
    return metrics.auc(average_recall, average_precision)
def create_knngraph(X, k):
    """Build a k-nearest-neighbour graph from Euclidean distances.

    Parameters
    ----------
    X : array with one row per sample.
    k : number of neighbours per sample; must satisfy ``0 <= k < n``.

    Returns
    -------
    neigh_dist : (n, k) float array, distances to the neighbours in
        ascending order.
    neigh_idx : (n, k) int array, indices of those neighbours.

    Raises
    ------
    ValueError
        If ``k`` is negative or not smaller than the number of samples.
    """
    n = X.shape[0]
    if k < 0 or k >= n:
        raise ValueError(
            "k must satisfy 0 <= k < n_samples (got k={}, n_samples={})"
            .format(k, n))

    # Work on a copy so the matrix returned by euclidean_distance is untouched.
    D = np.array(euclidean_distance(X), dtype=float)
    # Exclude each point from its own neighbour list explicitly.  Dropping
    # "the first sorted entry" is wrong when duplicate points tie at zero.
    np.fill_diagonal(D, np.inf)

    neigh_idx = np.argsort(D, axis=1)[:, :k].astype(int)
    rows = np.arange(n)[:, None]
    neigh_dist = D[rows, neigh_idx]
    return neigh_dist, neigh_idx
def setUpMod(self, mode='rnd'): np.random.seed(626) if mode == 'rnd': points = 200 dim = 500 self.vector = 99. * (np.random.rand(points, dim) - 0.5) self.label = np.random.randint(0, 5, points) self.dist = euclidean_distance(self.vector) #self.dist /= (self.dist.max() + 1e-12) elif mode == 'toy': # SNN (k=2) ground truth calculated by hand for this toy example self.dist = squareform([.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]) self.snn_dist_truth = squareform( [.5, .5, .5, .5, .5, .5, 0., 0., .5, .5]) self.vector = None self.label = None
def setUpMod(self, mode='rnd'):
    """Populate the fixture attributes used by the LS/NICDM tests.

    ``mode='rnd'`` draws seeded random vectors and labels and stores their
    Euclidean distance matrix.  ``mode='toy'`` stores a hand-made 5x5
    distance matrix with the expected LS and NICDM distances and sets
    ``vector``/``label`` to ``None``.  Any other mode only reseeds NumPy's
    global random generator.
    """
    np.random.seed(626)
    if mode == 'toy':
        # LS/NICDM ground truth calculated in spreadsheet for toy example
        toy_condensed = [.2, .1, .8, .4, .3, .5, .7, 1., .6, .9]
        ls_condensed = [
            0.486582881, 0.1535182751, 0.9816843611, 0.7364028619,
            0.6321205588, 0.6471339185, 0.9342714714, 0.9844961464,
            0.8646647168, 0.8150186001,
        ]
        nicdm_condensed = [
            0.310029690448236, 0.173311865721368, 0.769089007390428,
            0.438448192970227, 0.402740381783397, 0.37233361467179,
            0.594335892341949, 0.832563272714335, 0.569560910033398,
            0.473903322836619,
        ]
        self.dist = squareform(toy_condensed)
        self.ls_dist_truth = squareform(ls_condensed)
        self.nicdm_dist_truth = squareform(nicdm_condensed)
        self.vector = None
        self.label = None
    elif mode == 'rnd':
        n_points, n_dims = 200, 500
        self.vector = 99. * (np.random.rand(n_points, n_dims) - 0.5)
        self.label = np.random.randint(0, 5, n_points)
        self.dist = euclidean_distance(self.vector)
def _euclidean_to_neighbors(X, nbrs):
    """Return the plain Euclidean distance from each point to its listed neighbours."""
    D = euclidean_distance(X)
    rows = np.arange(D.shape[0])[:, None]
    return D[rows, nbrs]


def generate_triplets(X, n_inlier, n_outlier, n_random, fast_trimap=True,
                      weight_adj=True, verbose=True, hub='mp'):
    """Sample (anchor, inlier, outlier) triplets and their weights.

    Neighbours are searched with the strategy selected by ``hub``.  Triplets
    are then sampled from the neighbour lists, weighted, optionally extended
    with random triplets, and the weights are normalised.

    Parameters
    ----------
    X : (n, dim) array.  Reduced to 100 components with ``TruncatedSVD``
        when ``dim > 100``.
    n_inlier, n_outlier : passed to ``sample_knn_triplets``.
    n_random : random triplets are appended when this is > 0.
    fast_trimap : for ``n > 10000`` with no hubness mode, selects between
        the two Annoy-based searches.
    weight_adj : falsy to skip the log transform; a number is used as the
        transform factor; any other truthy value means 400.0.
    verbose : print progress and debug information.
    hub : neighbour / weighting mode:
        ``'mp1'``, ``'ls1'``, ``'dsl'``  hubness-aware neighbour graph,
                                   Euclidean distances for the weights;
        ``'mp4'``                  graph neighbours and graph distances;
        ``'mp2'``, ``'ls2'``, ``'SNN1'``  neighbours from a secondary distance
                                   matrix, weights from ``1 - D``;
        ``'mp3_gauss'``, ``'mp3_emp'``, ``'SNN2'``  neighbours and outlier
                                   distances from a secondary distance matrix;
        ``'mutual'``               exact kNN passed through ``make_mutual``;
        ``'weight'``               plain search, weights rescaled per anchor
                                   by ``exp(-deg / median(deg))``.
        Any other value (including the default ``'mp'``) uses the plain
        Euclidean search only.

    Returns
    -------
    (triplets, weights)
    """
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 150), n)

    # Modes whose triplet weights are read from a full (n, n) matrix.
    matrix_weighted = hub in ('mp2', 'SNN1', 'ls2')
    # Modes that announce the hubness reduction when verbose.
    announced = hub in ('mp1', 'mp2', 'mp3_gauss', 'mp3_emp', 'mp4',
                        'ls1', 'ls2', 'dsl', 'SNN1', 'SNN2')

    if hub == 'mp1':
        # Hubness reduction is used for the triplet selection only.
        graph = kneighbors_graph(X,
                                 n_neighbors=n_extra,
                                 mode='distance',
                                 hubness='mutual_proximity',
                                 hubness_params={'method': 'normal'})
        nbrs = graph.indices.astype(int).reshape((X.shape[0], n_extra))
        distances = _euclidean_to_neighbors(X, nbrs)
    elif hub == 'mp2':
        # The similarity P is set to 1 - D_mp.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')
        # make knn graph
        distances, nbrs = KNN_Info(D_mp, n_extra)
    elif hub == 'mp3_gauss':
        # The similarity is computed from the secondary distance.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(
            D=D, metric='distance')
        # Free the primary distance matrix before the next allocation.
        del D
        gc.collect()
        # make knn graph
        distances, nbrs = KNN_Info(D_mp, n_extra)
    elif hub == 'mp3_emp':
        # The similarity is computed from the secondary distance.
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling._mutual_proximity_empiric_full(
            D=D, metric='distance')
        # make knn graph.  The original statement here was truncated
        # (``neigbour_graph = k``) and left distances/nbrs undefined; build
        # them the same way as the 'mp3_gauss' branch.
        distances, nbrs = KNN_Info(D_mp, n_extra)
    elif hub == 'mp4':
        # Original author's note: purpose of this variant is unclear.
        graph = kneighbors_graph(X,
                                 n_neighbors=n_extra,
                                 mode='distance',
                                 hubness='mutual_proximity',
                                 hubness_params={'method': 'normal'})
        nbrs = graph.indices.astype(int).reshape((X.shape[0], n_extra))
        distances = graph.data.reshape((X.shape[0], n_extra))
    elif hub == 'ls1':
        graph = kneighbors_graph(X,
                                 n_neighbors=n_extra,
                                 mode='distance',
                                 hubness='local_scaling')
        nbrs = graph.indices.astype(int).reshape((X.shape[0], n_extra))
        distances = _euclidean_to_neighbors(X, nbrs)
    elif hub == 'ls2':
        D = euclidean_distance(X)
        D_ls = hub_toolbox.local_scaling.local_scaling(D=D,
                                                       k=10,
                                                       metric='distance')
        # make knn graph
        distances, nbrs = KNN_Info(D_ls, n_extra)
    elif hub == 'dsl':
        graph = kneighbors_graph(X,
                                 n_neighbors=n_extra,
                                 mode='connectivity',
                                 hubness='dsl')
        nbrs = graph.indices.astype(int).reshape((X.shape[0], n_extra))
        distances = _euclidean_to_neighbors(X, nbrs)
    elif hub == 'mutual':
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
        nbrs = make_mutual(nbrs)
    elif hub == 'SNN1' or hub == 'SNN2':
        D = euclidean_distance(X)
        D_snn = hub_toolbox.shared_neighbors.shared_nearest_neighbors(
            D=D, metric='distance')
        # make knn graph
        distances, nbrs = KNN_Info(D_snn, n_extra)
    elif exact:
        # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
    elif fast_trimap:
        # use annoy
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(10)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij)
            nbrs[i, :] = nbrs[i, sort_indices]
            distances[i, :] = dij[sort_indices]
    else:
        # Combine a small exact search with a larger approximate one.
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        nbrs[:, :n_bf] = nbrs_bf
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(100)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, n_bf:] = tree.get_nns_by_item(i, n_extra - n_bf)
            unique_nn = np.unique(nbrs[i, :])
            n_unique = len(unique_nn)
            nbrs[i, :n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i, :n_unique] = nbrs[i, sort_indices]
            distances[i, :n_unique] = dij[sort_indices]

    if verbose:
        if announced:
            print("hubness reduction with {}".format(hub))
        print("found nearest neighbors")

    if hub == 'mp2':
        P = 1 - distances  # (n, k)
    else:
        # scale parameter
        sig = np.maximum(np.mean(distances[:, 10:20], axis=1), 1e-20)
        P = find_p(distances, sig, nbrs)

    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    if verbose:
        print("tri_shape", triplets[0], triplets[0][2])
    n_triplets = triplets.shape[0]

    # Distance between each anchor (column 0) and its outlier (column 2).
    # The matrix-weighted modes never use it, so it stays None for them.
    if matrix_weighted:
        outlier_dist = None
    else:
        outlier_dist = np.empty(n_triplets, dtype=np.float64)
        if hub == 'mp3_gauss' or hub == 'mp3_emp':
            outlier_dist[:] = D_mp[triplets[:, 0], triplets[:, 2]]
        elif hub == 'SNN2':
            outlier_dist[:] = D_snn[triplets[:, 0], triplets[:, 2]]
        elif exact or not fast_trimap:
            for t in range(n_triplets):
                outlier_dist[t] = np.sqrt(
                    np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :])**2))
        else:
            for t in range(n_triplets):
                outlier_dist[t] = euclid_dist(X[triplets[t, 0], :],
                                              X[triplets[t, 2], :])

    if matrix_weighted:
        if hub == 'SNN1':
            D_mp = D_snn
        elif hub == 'ls2':
            D_mp = D_ls
        if verbose:
            print("P and triplets' shape", triplets)
        P = 1 - D_mp  # (n, n)
        anchors = triplets[:, 0]
        p_sim = P[anchors, triplets[:, 1]]
        # Floor the outlier similarity to avoid division by zero.
        p_out = np.maximum(P[anchors, triplets[:, 2]], 1e-20)
        weights = np.asarray(p_sim / p_out, dtype=np.float64)
    else:
        weights = find_weights(triplets, P, nbrs, outlier_dist, sig)

    if hub == 'weight':
        deg, _mean_deg, _var_deg = calculate_deg(nbrs)
        hs_med = max(np.median(deg), 1e-20)  # guard against a zero median
        hub_weights = np.exp(-deg / hs_med)
        # Scale each anchor's block of n_inlier * n_outlier triplets once.
        # The previous nested loop rescaled overlapping prefixes of the
        # block repeatedly and never touched its last element.
        # NOTE(review): assumes triplets are grouped per anchor in blocks of
        # n_inlier * n_outlier - confirm against sample_knn_triplets.
        per_anchor = n_inlier * n_outlier
        scale = np.repeat(hub_weights, per_anchor)[:weights.shape[0]]
        weights[:scale.shape[0]] *= scale

    if verbose:
        print('out_dist: ', outlier_dist)

    if n_random > 0:
        if matrix_weighted:
            rand_triplets = sample_random_triplets(X, n_random,
                                                   P=P)  # P: (n, n)
        else:
            rand_triplets = sample_random_triplets(X, n_random, sig=sig)
        # The last column carries the weight, the others the indices.
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))

    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        if not isinstance(weight_adj, (int, float)):
            weight_adj = 400.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
def mantel_test(X, L, embed, describe=True, n_splits=50, test_size=1000,
                perms=10000):
    """Run a Mantel test between class centroids in ``X`` and in ``embed``.

    For each stratified shuffle split, the test indices select a subsample.
    Per-class centroids are computed in both spaces, their Euclidean distance
    matrices are built, and the two matrices are passed to ``Mantel.test``.

    Parameters
    ----------
    X : array with one row per sample (original data).
    L : array of class labels, one per sample.
    embed : array with one row per sample (embedding of ``X``).
    describe : when truthy, print the p-values and summary statistics of
        the collected first return values.
    n_splits : number of stratified shuffle splits.
    test_size : size of each subsample, as accepted by
        ``StratifiedShuffleSplit``.
    perms : number of permutations passed to ``Mantel.test``.

    Returns
    -------
    r_lst, p_lst : 1-D float arrays holding the first and second value
        returned by ``Mantel.test`` for each split.
    """
    sss = StratifiedShuffleSplit(n_splits=n_splits,
                                 test_size=test_size,
                                 random_state=0)
    # Sorted unique labels give a reproducible class order.
    label_type = np.unique(L)

    r_values = []
    p_values = []
    for _, idx in sss.split(X, L):
        X_high, L_hl = X[idx], L[idx]
        X_low = embed[idx]

        X_high_lst = []
        X_low_lst = []
        for label in label_type:
            members = np.where(L_hl == label)
            X_high_lst.append(np.mean(X_high[members], axis=0))
            X_low_lst.append(np.mean(X_low[members], axis=0))

        D_high = euclidean_distance(X_high_lst)
        D_low = euclidean_distance(X_low_lst)
        r, p, _ = Mantel.test(D_high,
                              D_low,
                              perms=perms,
                              method='pearson',
                              tail='upper')
        r_values.append(r)
        p_values.append(p)

    r_lst = np.array(r_values, dtype=float)
    p_lst = np.array(p_values, dtype=float)
    if describe:
        print(p_lst)
        print(pd.DataFrame(pd.Series(r_lst.ravel()).describe()).transpose())
    return r_lst, p_lst
# Embed the data with UMAP for several random seeds and plot each result.
iter_n = 5
seed_lst = random.sample(range(100), k=iter_n)
print(seed_lst)

# The pairwise distance matrix does not depend on the seed: compute it once.
D = euclidean_distance(data)

for seed in seed_lst:
    # Embedding computed directly from the raw data.
    fit = umap.UMAP(init='random',
                    metric='euclidean',
                    n_neighbors=k,
                    n_epochs=2000,
                    random_state=seed,
                    min_dist=0.5)
    u_org = fit.fit_transform(data)

    # Embedding computed from the precomputed distance matrix.
    # NOTE(review): u_hub is never plotted or stored - confirm whether it
    # should be shown next to u_org.
    fit = umap.UMAP(init='random',
                    n_neighbors=k,
                    metric='precomputed',
                    n_epochs=2000,
                    random_state=seed,
                    min_dist=0.5)
    u_hub = fit.fit_transform(D)

    plt.scatter(u_org[:, 0], u_org[:, 1], c=labels, cmap="Spectral", s=10)
    plt.show()
def test_euclidean_dist_equal_to_scipy_cdist_eucl(self):
    """Check ``euclidean_distance`` against SciPy's ``cdist`` to 7 decimals."""
    vectors = self.vectors
    actual = euclidean_distance(vectors)
    expected = cdist(vectors, vectors, 'euclidean')
    return np.testing.assert_array_almost_equal(actual, expected, decimal=7)
def setUp(self):
    """Hubness truth: S_k=5, skewness calculated with bias.

    Stores seeded random data ``X`` (100 x 50), its Euclidean distance
    matrix ``D`` and the verbosity level used by the tests.
    """
    self.verbose = 1
    np.random.seed(123)
    n_samples, n_features = 100, 50
    self.X = np.random.rand(n_samples, n_features)
    self.D = euclidean_distance(self.X)