def test_neighbors_dexter(hubness_param, metric):
    hubness, param = hubness_param
    X, y = load_dexter()

    # Hubness in the standard (primary) space
    hub = Hubness(k=10, metric=metric)
    hub.fit(X)
    k_skew_orig = hub.score()

    # Hubness in the secondary distance space (after hubness reduction)
    graph = kneighbors_graph(X, n_neighbors=10, metric=metric,
                             hubness=hubness, hubness_params=param)
    hub = Hubness(k=10, metric='precomputed')
    hub.fit(graph)
    k_skew_hr = hub.score(has_self_distances=True)

    assert k_skew_hr < k_skew_orig * 8 / 10, \
        f'k-occurrence skewness was not reduced by at least 20% for dexter with {hubness}'
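# A minimal sketch of how this test might be parametrized with pytest; the
# parameter grid below is an assumption (any hubness/metric combination the
# library supports would do), not the original suite's configuration.
import pytest

HUBNESS_PARAMS = [('mp', {'method': 'normal'}),
                  ('ls', None),
                  ('ls', {'method': 'nicdm'}),
                  ('dsl', None)]

@pytest.mark.parametrize('hubness_param', HUBNESS_PARAMS)
@pytest.mark.parametrize('metric', ['euclidean', 'cosine'])
def test_neighbors_dexter_parametrized(hubness_param, metric):
    # delegate to the test above for each combination
    test_neighbors_dexter(hubness_param, metric)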
def hub_reduction(input_points, methods=None, k=k):  # k is assumed to be defined at module level
    """Build a k-NN graph per hubness-reduction method and return them by name."""
    if methods is None:  # avoid a mutable default argument
        methods = {'nothing': (None, None),
                   'mp_normal': ('mp', {'method': 'normal'}),
                   'ls': ('ls', None),
                   'ls_nicdm': ('ls', {'method': 'nicdm'}),
                   'dsl': ('dsl', None)}
    samples_reduced = dict()
    for method_name, (hubness, hubness_params) in tqdm(methods.items()):
        samples_reduced[method_name] = kneighbors_graph(input_points,
                                                        n_neighbors=k,
                                                        hubness=hubness,
                                                        hubness_params=hubness_params)
    return samples_reduced
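# Usage sketch for hub_reduction (illustrative random data; assumes
# skhubness's kneighbors_graph and the Hubness estimator are importable,
# as elsewhere in this file):
import numpy as np
from skhubness import Hubness

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(500, 50))
graphs = hub_reduction(X_demo, k=10)
for name, graph in graphs.items():
    # score k-occurrence skewness on each precomputed graph, as in the test above
    hub = Hubness(k=10, metric='precomputed')
    hub.fit(graph)
    print(name, hub.score(has_self_distances=True))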
def generate_triplets(X, n_inlier, n_outlier, n_random,
                      fast_trimap=True, weight_adj=True, verbose=True, hub='mp'):
    n, dim = X.shape
    if dim > 100:
        X = TruncatedSVD(n_components=100, random_state=0).fit_transform(X)
        dim = 100
    exact = n <= 10000
    n_extra = min(max(n_inlier, 150), n)
    # if hub == 'mp_app':  # approximate MP variant, kept for reference
    #     D_mp = SuQHR(n_samples=n - 1).fit_transform(X)
    #     distances, nbrs = KNN_Info(D_mp, n_extra)
    if hub == 'mp1':
        # use hubness reduction only for triplet selection
        neigbour_graph = kneighbors_graph(X, n_neighbors=n_extra, mode='distance',
                                          hubness='mutual_proximity',
                                          hubness_params={'method': 'normal'})
        nbrs = neigbour_graph.indices.astype(int).reshape((X.shape[0], n_extra))
        # recompute primary (Euclidean) distances for the selected neighbors
        flag = nbrs.tolist()
        D = euclidean_distance(X)
        distances = np.array([D[i][flag[i]] for i in range(D.shape[0])])
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'mp2':
        # use 1 - D_mp as the similarity P
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(D=D, metric='distance')
        distances, nbrs = KNN_Info(D_mp, n_extra)  # make knn graph
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'mp3_gauss':
        # compute similarities from the secondary distance
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling.mutual_proximity_gaussi(D=D, metric='distance')
        del D
        gc.collect()
        distances, nbrs = KNN_Info(D_mp, n_extra)  # make knn graph
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'mp3_emp':
        # compute similarities from the secondary distance
        D = euclidean_distance(X)
        D_mp = hub_toolbox.global_scaling._mutual_proximity_empiric_full(D=D, metric='distance')
        # the original line here was truncated ("neigbour_graph = k"); restoring
        # the commented-out call, which mirrors the 'mp3_gauss' branch
        distances, nbrs = KNN_Info(D_mp, n_extra)
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'mp4':
        # purpose unclear in the original (comment read "mystery")
        neigbour_graph = kneighbors_graph(X, n_neighbors=n_extra, mode='distance',
                                          hubness='mutual_proximity',
                                          hubness_params={'method': 'normal'})
        nbrs = neigbour_graph.indices.astype(int).reshape((X.shape[0], n_extra))
        distances = neigbour_graph.data.reshape((X.shape[0], n_extra))
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'ls1':
        neigbour_graph = kneighbors_graph(X, n_neighbors=n_extra, mode='distance',
                                          hubness='local_scaling')
        nbrs = neigbour_graph.indices.astype(int).reshape((X.shape[0], n_extra))
        flag = nbrs.tolist()
        D = euclidean_distance(X)
        distances = np.array([D[i][flag[i]] for i in range(D.shape[0])])
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'ls2':
        D = euclidean_distance(X)
        D_ls = hub_toolbox.local_scaling.local_scaling(D=D, k=10, metric='distance')
        distances, nbrs = KNN_Info(D_ls, n_extra)  # make knn graph
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'dsl':
        neigbour_graph = kneighbors_graph(X, n_neighbors=n_extra, mode='connectivity',
                                          hubness='dsl')
        nbrs = neigbour_graph.indices.astype(int).reshape((X.shape[0], n_extra))
        flag = nbrs.tolist()
        D = euclidean_distance(X)
        distances = np.array([D[i][flag[i]] for i in range(D.shape[0])])
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif hub == 'mutual':
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
        nbrs = make_mutual(nbrs)
    elif hub == 'SNN1' or hub == 'SNN2':
        D = euclidean_distance(X)
        D_snn = hub_toolbox.shared_neighbors.shared_nearest_neighbors(D=D, metric='distance')
        distances, nbrs = KNN_Info(D_snn, n_extra)  # make knn graph
        if verbose:
            print("hubness reduction with {}".format(hub))
    elif exact:
        # do exact knn search
        knn_tree = knn(n_neighbors=n_extra, algorithm='auto').fit(X)
        distances, nbrs = knn_tree.kneighbors(X)
    elif fast_trimap:
        # use annoy
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(10)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, :] = tree.get_nns_by_item(i, n_extra)
            for j in range(n_extra):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij)
            nbrs[i, :] = nbrs[i, sort_indices]
            distances[i, :] = dij[sort_indices]
    else:
        n_bf = 10
        n_extra += n_bf
        knn_tree = knn(n_neighbors=n_bf, algorithm='auto').fit(X)
        _, nbrs_bf = knn_tree.kneighbors(X)
        nbrs = np.empty((n, n_extra), dtype=np.int64)
        nbrs[:, :n_bf] = nbrs_bf
        tree = AnnoyIndex(dim, metric='euclidean')
        for i in range(n):
            tree.add_item(i, X[i, :])
        tree.build(100)
        distances = np.empty((n, n_extra), dtype=np.float64)
        dij = np.empty(n_extra, dtype=np.float64)
        for i in range(n):
            nbrs[i, n_bf:] = tree.get_nns_by_item(i, n_extra - n_bf)
            unique_nn = np.unique(nbrs[i, :])
            n_unique = len(unique_nn)
            nbrs[i, :n_unique] = unique_nn
            for j in range(n_unique):
                dij[j] = euclid_dist(X[i, :], X[nbrs[i, j], :])
            sort_indices = np.argsort(dij[:n_unique])
            nbrs[i, :n_unique] = nbrs[i, sort_indices]
            distances[i, :n_unique] = dij[sort_indices]
    if verbose:
        print("found nearest neighbors")

    if hub == 'mp2':
        P = 1 - distances  # similarities, shape (n, k)
    else:
        sig = np.maximum(np.mean(distances[:, 10:20], axis=1), 1e-20)  # scale parameter
        P = find_p(distances, sig, nbrs)

    triplets = sample_knn_triplets(P, nbrs, n_inlier, n_outlier)
    n_triplets = triplets.shape[0]
    # outlier_dist was never allocated in the original and raised a NameError
    # in the branches below; allocate it up front
    outlier_dist = np.empty(n_triplets, dtype=np.float64)
    if hub == 'mp2' or hub == 'SNN1' or hub == 'ls2':
        pass  # weights are computed directly from the secondary distance matrix below
    elif hub == 'mp3_gauss' or hub == 'mp3_emp':
        for t in range(n_triplets):
            outlier_dist[t] = D_mp[triplets[t][0], triplets[t][2]]
    elif hub == 'SNN2':
        for t in range(n_triplets):
            outlier_dist[t] = D_snn[triplets[t][0], triplets[t][2]]
    elif exact or not fast_trimap:
        for t in range(n_triplets):
            outlier_dist[t] = np.sqrt(np.sum((X[triplets[t, 0], :] - X[triplets[t, 2], :]) ** 2))
    else:
        for t in range(n_triplets):
            outlier_dist[t] = euclid_dist(X[triplets[t, 0], :], X[triplets[t, 2], :])

    if hub == 'mp2' or hub == 'SNN1' or hub == 'ls2':
        if hub == 'SNN1':
            D_mp = D_snn
        elif hub == 'ls2':
            D_mp = D_ls
        weights = np.empty(n_triplets, dtype=np.float64)
        P = 1 - D_mp  # similarities, shape (n, n)
        for t in range(n_triplets):
            i = triplets[t, 0]
            p_sim = P[i, triplets[t, 1]]
            p_out = max(P[i, triplets[t, 2]], 1e-20)
            weights[t] = p_sim / p_out
    else:
        weights = find_weights(triplets, P, nbrs, outlier_dist, sig)

    if hub == 'weight':
        deg, mean_deg, var_deg = calculate_deg(nbrs)
        var_deg = max(var_deg, 1e-20)
        hs_med = np.median(deg)
        hub_weights = np.exp(-deg / hs_med)
        m = hub_weights.shape[0]
        l = n_inlier * n_outlier
        for i in range(m):
            # down-weight all triplets anchored at hub point i; the original
            # inner loop over slices never scaled the last triplet of each block
            weights[i * l:(i + 1) * l] *= hub_weights[i]

    if n_random > 0:
        if hub == 'mp2' or hub == 'SNN1' or hub == 'ls2':
            rand_triplets = sample_random_triplets(X, n_random, P=P)  # P: (n, n)
        else:
            rand_triplets = sample_random_triplets(X, n_random, sig=sig)
        rand_weights = rand_triplets[:, -1]
        rand_triplets = rand_triplets[:, :-1].astype(np.int64)
        triplets = np.vstack((triplets, rand_triplets))
        weights = np.hstack((weights, rand_weights))
    weights /= np.max(weights)
    weights += 0.0001
    if weight_adj:
        # bool is a subclass of int, so the default True previously slipped
        # through the isinstance check; treat it as the intended 400.0
        if isinstance(weight_adj, bool) or not isinstance(weight_adj, (int, float)):
            weight_adj = 400.0
        weights = np.log(1 + weight_adj * weights)
        weights /= np.max(weights)
    return (triplets, weights)
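# Usage sketch for generate_triplets (hypothetical data; assumes the helper
# functions this module relies on -- euclidean_distance, KNN_Info, find_p,
# sample_knn_triplets, find_weights, sample_random_triplets -- are in scope):
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.normal(size=(1000, 50)).astype(np.float64)
triplets, weights = generate_triplets(X_demo, n_inlier=10, n_outlier=5,
                                      n_random=3, hub='mp1')
# one row per triplet: (anchor, similar neighbor, outlier), with one weight each
print(triplets.shape, weights.shape)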
def viz_analysis(adata,
                 do_norm,
                 norm_scale,
                 do_log,
                 do_pca,
                 n_clusters,
                 metric,
                 weighted,  # weighted adjacency matrix for Louvain/Leiden clustering?
                 seed,
                 n_comps,
                 clustering_algo):
    hubness_methods = {'nothing': (None, None),
                       'mp_normal': ('mp', {'method': 'normal'}),
                       'ls': ('ls', None),
                       'ls_nicdm': ('ls', {'method': 'nicdm'}),
                       'dsl': ('dsl', None)}
    start0 = time.time()

    ### preprocess, prepare clustering input ###
    if type(do_norm) is str:
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == 'seurat':
            recipe_seurat(adata, do_log, norm_scale)
        elif do_norm == 'duo':
            recipe_duo(adata, do_log, renorm=norm_scale)
        else:
            raise ValueError("do_norm not in 'duo', 'seurat'")
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
    if do_log and not (type(do_norm) is str):
        sc.pp.log1p(adata)
    if do_pca:
        use_rep = 'X_pca'
        # 500-component reference embedding used by the quality metrics below
        sc.tl.pca(adata, n_comps=min(adata.X.shape[1] - 1, min(len(adata.X) - 1, 500)))
        original1 = adata.obsm['X_pca']
        sc.tl.pca(adata, n_comps=min(adata.X.shape[1] - 1, min(len(adata.X) - 1, n_comps)))
        X = adata.obsm['X_pca']
    else:
        # note: the quality metrics below rely on original1, which is only set when do_pca is True
        use_rep = 'X'
        X = adata.X
    n_neighbors = int(np.sqrt(X.shape[0]))
    print('\t\t\tPreprocessing done:', round((time.time() - start0) / 60, 2), 'mn')

    ### hubness reduction and clustering ###
    start = time.time()
    all_adata = dict()
    for kernel in ['umap', 'gauss']:
        all_adata[kernel] = adata.copy()
        try:
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors + 1,
                            metric=metric, use_rep=use_rep, method=kernel)
        except Exception:
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors + 1,
                            metric=metric, use_rep=use_rep, method=kernel, knn=False)
        G, weights = generate_clustering_inputs(X=X, metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted, seed=seed,
                                                hubness=None, hubness_params=None)
        resol, weighted = getNclusters(all_adata[kernel], G, n_clusters=n_clusters,
                                       seed=seed, clustering_algo=clustering_algo,
                                       flavor='scanpy', weights=weights)
        if clustering_algo == "leiden":
            sc.tl.leiden(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
        elif clustering_algo == "louvain":
            sc.tl.louvain(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
        sc.pl.paga(all_adata[kernel], show=False, random_state=seed, plot=False)
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_adata[method_name] = adata.copy()
        all_adata[method_name].obsp['connectivities'] = kneighbors_graph(X, n_neighbors=n_neighbors,
                                                                         hubness=hubness,
                                                                         hubness_params=hubness_params,
                                                                         metric=metric, mode="connectivity")
        all_adata[method_name].obsp['distances'] = kneighbors_graph(X, n_neighbors=n_neighbors,
                                                                    hubness=hubness,
                                                                    hubness_params=hubness_params,
                                                                    metric=metric, mode="distance")
        all_adata[method_name].uns['neighbors'] = {'connectivities_key': 'connectivities',
                                                   'distances_key': 'distances',
                                                   'params': {'n_neighbors': n_neighbors,
                                                              'method': 'umap',
                                                              'metric': metric}}
        G, weights = generate_clustering_inputs(X=X, metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted, seed=seed,
                                                hubness=hubness, hubness_params=hubness_params)
        resol, weighted = getNclusters(all_adata[method_name], G, n_clusters=n_clusters,
                                       seed=seed, clustering_algo=clustering_algo,
                                       flavor='base', weights=weights)
        if clustering_algo == "louvain":
            clus = np.array(louvain.find_partition(graph=G,
                                                   partition_type=louvain.RBConfigurationVertexPartition,
                                                   weights=weights,
                                                   resolution_parameter=resol,
                                                   seed=seed).membership)
            all_adata[method_name].obs['louvain'] = pd.Categorical(values=clus.astype('U'),
                                                                   categories=natsorted(map(str, np.unique(clus))))
        elif clustering_algo == "leiden":
            clus = np.array(leidenalg.find_partition(graph=G,
                                                     partition_type=leidenalg.RBConfigurationVertexPartition,
                                                     weights=weights,
                                                     resolution_parameter=resol,
                                                     seed=seed).membership)
            all_adata[method_name].obs['leiden'] = pd.Categorical(values=clus.astype('U'),
                                                                  categories=natsorted(map(str, np.unique(clus))))
    print('\t\t\tHubness and PAGA full pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### tSNE embedding ###
    start = time.time()
    tsne = sklearn.manifold.TSNE(n_components=2, metric='precomputed',
                                 random_state=seed, perplexity=50.0)
    q_tsne = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        all_adata[method_name].obsm['X_tsne'] = tsne.fit_transform(
            all_adata[method_name].obsp['distances'].toarray())
        q_tsne[0, idx] = QDM(original1, all_adata[method_name].obsm['X_tsne'], metric)
        q_tsne[1, idx] = QNP(original1, all_adata[method_name].obsm['X_tsne'], metric, n_neighbors)
    print('\t\t\ttSNE embedding pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### UMAP embedding ###
    start = time.time()
    umap = UMAP(n_components=2, metric='precomputed', random_state=seed)
    q_umap = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        all_adata[method_name].obsm['X_umap_'] = umap.fit_transform(
            all_adata[method_name].obsp['distances'].toarray())
        q_umap[0, idx] = QDM(original1, all_adata[method_name].obsm['X_umap_'], metric)
        q_umap[1, idx] = QNP(original1, all_adata[method_name].obsm['X_umap_'], metric, n_neighbors)
    print('\t\t\tUMAP embedding pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### PAGA embedding ###
    start = time.time()
    q_paga_umap = np.empty((2, len(all_adata.keys())))
    for idx, method_name in enumerate(all_adata.keys()):
        sc.tl.paga(all_adata[method_name], groups=clustering_algo)
        sc.pl.paga(all_adata[method_name], show=False, random_state=seed, plot=False)
        sc.tl.umap(all_adata[method_name], init_pos="paga", random_state=seed)
        q_paga_umap[0, idx] = QDM(original1, all_adata[method_name].obsm['X_umap'], metric)
        q_paga_umap[1, idx] = QNP(original1, all_adata[method_name].obsm['X_umap'], metric, n_neighbors)
    print('\t\t\tPAGA+UMAP embedding pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### save results ###
    np.savetxt(get_res_path(fname) + "_tsne_q.csv", q_tsne, delimiter=',')
    np.savetxt(get_res_path(fname) + "_umap_q.csv", q_umap, delimiter=',')
    np.savetxt(get_res_path(fname) + "_paga_q.csv", q_paga_umap, delimiter=',')
    print('\t\t\tFull pipeline:', round((time.time() - start0) / 60, 2), 'mn')
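# Usage sketch for viz_analysis (hypothetical AnnData; `fname` and
# `get_res_path` are assumed to be defined at module level, as in the
# original script):
import anndata
import numpy as np

rng = np.random.RandomState(0)
adata_demo = anndata.AnnData(rng.poisson(1.0, size=(300, 2000)).astype(np.float32))
viz_analysis(adata_demo,
             do_norm='seurat', norm_scale=True, do_log=True, do_pca=True,
             n_clusters=5, metric='cosine', weighted=True, seed=0,
             n_comps=50, clustering_algo='leiden')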
def generator_from_index(X, Y, index_path, k, batch_size, search_k=-1,
                         precompute=True, verbose=1, type='tri', knn='MP'):
    if k >= X.shape[0] - 1:
        raise Exception('''k value greater than or equal to (num_rows - 1)
                        (k={}, rows={}). Lower k to a smaller
                        value.'''.format(k, X.shape[0]))
    if batch_size > X.shape[0]:
        raise Exception('''batch_size value larger than num_rows in dataset
                        (batch_size={}, rows={}). Lower batch_size to a
                        smaller value.'''.format(batch_size, X.shape[0]))

    if Y is None:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')
            if knn == 'MP':
                if verbose > 0:
                    print('Making MP-based KNN')
                neigbour_graph = kneighbors_graph(X, n_neighbors=k,
                                                  hubness='mutual_proximity',
                                                  hubness_params={'method': 'normal'})
                neighbour_matrix = neigbour_graph.indices.reshape((X.shape[0], k))
            else:
                neighbour_matrix = extract_knn(X, index_path, k=k,
                                               search_k=search_k, verbose=verbose)
            print('neighbour_matrix: ', neighbour_matrix.shape)
            if knn == 'Mutual':
                if verbose > 0:
                    print('Making KNN mutual')
                neighbour_matrix = make_mutual(neighbour_matrix)
            if type == 'quad':
                return KnnQuadrupletGenerator(X, neighbour_matrix, batch_size=batch_size)
            if type == 'tri':
                return KnnTripletGenerator(X, neighbour_matrix, batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1])
            index.load(index_path)
            return AnnoyTripletGenerator(X, index, k=k,
                                         batch_size=batch_size, search_k=search_k)
    else:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')
            neighbour_matrix = extract_knn(X, index_path, k=k,
                                           search_k=search_k, verbose=verbose)
            return LabeledKnnTripletGenerator(X, Y, neighbour_matrix, batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1])
            index.load(index_path)
            return LabeledAnnoyTripletGenerator(X, Y, index, k=k,
                                                batch_size=batch_size, search_k=search_k)
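# Usage sketch for generator_from_index (hypothetical shapes and index path;
# note that with knn='MP' and precompute=True the Annoy index file is never
# read, so the path below is a placeholder):
import numpy as np

rng = np.random.RandomState(1)
X_demo = rng.normal(size=(2000, 64)).astype(np.float32)
gen = generator_from_index(X_demo, Y=None, index_path='annoy.index',
                           k=15, batch_size=128, type='tri', knn='MP')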
# Optional hub_toolbox-based check, kept for reference: estimate intrinsic
# dimension, compute primary Euclidean distances and their mutual-proximity
# rescaling, and compare the hubness (k-occurrence skewness) of both spaces.
# d_mle = hub_toolbox.intrinsic_dimension.intrinsic_dimension(vectors)
# D = euclidean_distance(vectors)
# S_k, _, _ = hub_toolbox.hubness.hubness(D=D, k=5, metric='distance')
# D_mp = hub_toolbox.global_scaling.mutual_proximity_empiric(D=D, metric='distance')
# S_k_mp, _, _ = hub_toolbox.hubness.hubness(D=D_mp, k=5, metric='distance')
# print(S_k, S_k_mp)

from skhubness.data import load_dexter
X, y = load_dexter()

from skhubness import Hubness
hub = Hubness(k=10, metric='cosine')
hub.fit(X)
k_skew = hub.score()
print(f'Skewness = {k_skew:.3f}')

from skhubness.neighbors import kneighbors_graph
k = 5
# neigbor_graph = kneighbors_graph(X, n_neighbors=k, hubness='mutual_proximity')
neigbor_graph = kneighbors_graph(X, n_neighbors=k, hubness=None)
neighbor_matrix = neigbor_graph.indices.reshape((X.shape[0], k))
print(neighbor_matrix)
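# Follow-up sketch: score hubness on a hubness-reduced graph with the
# 'precomputed' metric and compare against the primary-space skewness printed
# above (mirrors the comparison in test_neighbors_dexter):
graph_mp = kneighbors_graph(X, n_neighbors=10, hubness='mutual_proximity')
hub_mp = Hubness(k=10, metric='precomputed')
hub_mp.fit(graph_mp)
print(f'Skewness after MP: {hub_mp.score(has_self_distances=True):.3f}')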
def ti_analysis(adata, true_labels,
                do_norm,
                norm_scale,
                do_log,
                do_pca,
                n_clusters,
                metric,
                weighted,  # weighted adjacency matrix for Louvain/Leiden clustering?
                seed,
                n_comps,
                clustering_algo,
                n_iter,
                bootstrap_size):
    hubness_methods = {'nothing': (None, None),
                       'mp_normal': ('mp', {'method': 'normal'}),
                       'ls': ('ls', None),
                       'ls_nicdm': ('ls', {'method': 'nicdm'}),
                       'dsl': ('dsl', None)}
    start = time.time()

    ### preprocess, prepare clustering input ###
    if type(do_norm) is str:
        adata.X = scipy.sparse.csr_matrix(adata.X)
        if do_norm == 'seurat':
            recipe_seurat(adata, do_log, norm_scale)
        elif do_norm == 'duo':
            recipe_duo(adata, do_log, renorm=norm_scale)
        else:
            raise ValueError("do_norm not in 'duo', 'seurat'")
    if scipy.sparse.issparse(adata.X):
        adata.X = adata.X.toarray()
    if do_log and not (type(do_norm) is str):
        sc.pp.log1p(adata)
    if do_pca:
        use_rep = 'X_pca'
        sc.tl.pca(adata, n_comps=min(adata.X.shape[1] - 1, min(len(adata.X) - 1, n_comps)))
        X = adata.obsm['X_pca']
    else:
        use_rep = 'X'
        X = adata.X
    n_neighbors = int(np.sqrt(X.shape[0]))
    print('\t\t\tPreprocessing done:', round((time.time() - start) / 60, 2), 'mn')

    ### clustering and PAGA step ###
    start = time.time()
    all_adata = dict()
    for kernel in ['umap', 'gauss']:
        all_adata[kernel] = adata.copy()
        try:
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors + 1,
                            metric=metric, use_rep=use_rep, method=kernel)
        except Exception:
            sc.pp.neighbors(all_adata[kernel], n_neighbors=n_neighbors + 1,
                            metric=metric, use_rep=use_rep, method=kernel, knn=False)
        G, weights = generate_clustering_inputs(X=X, metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted, seed=seed,
                                                hubness=None, hubness_params=None)
        resol, weighted = getNclusters(all_adata[kernel], G, n_clusters=n_clusters,
                                       seed=seed, clustering_algo=clustering_algo,
                                       flavor='scanpy', weights=weights)
        if clustering_algo == "leiden":
            sc.tl.leiden(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
            sc.tl.paga(all_adata[kernel], groups="leiden")
        elif clustering_algo == "louvain":
            sc.tl.louvain(all_adata[kernel], resolution=resol, use_weights=weighted, random_state=seed)
            sc.tl.paga(all_adata[kernel], groups="louvain")
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_adata[method_name] = adata.copy()
        all_adata[method_name].obsp['connectivities'] = kneighbors_graph(X, n_neighbors=n_neighbors,
                                                                         hubness=hubness,
                                                                         hubness_params=hubness_params,
                                                                         metric=metric, mode="connectivity")
        all_adata[method_name].obsp['distances'] = kneighbors_graph(X, n_neighbors=n_neighbors,
                                                                    hubness=hubness,
                                                                    hubness_params=hubness_params,
                                                                    metric=metric, mode="distance")
        all_adata[method_name].uns['neighbors'] = {'connectivities_key': 'connectivities',
                                                   'distances_key': 'distances',
                                                   'params': {'n_neighbors': n_neighbors,
                                                              'method': method_name,
                                                              'metric': metric}}
        G, weights = generate_clustering_inputs(X=X, metric=metric,
                                                n_neighbors=n_neighbors,
                                                weighted=weighted, seed=seed,
                                                hubness=hubness, hubness_params=hubness_params)
        resol, weighted = getNclusters(all_adata[method_name], G, n_clusters=n_clusters,
                                       seed=seed, clustering_algo=clustering_algo,
                                       flavor='base', weights=weights)
        if clustering_algo == "louvain":
            clus = np.array(louvain.find_partition(graph=G,
                                                   partition_type=louvain.RBConfigurationVertexPartition,
                                                   weights=weights,
                                                   resolution_parameter=resol,
                                                   seed=seed).membership)
            all_adata[method_name].obs['louvain'] = pd.Categorical(values=clus.astype('U'),
                                                                   categories=natsorted(map(str, np.unique(clus))))
            sc.tl.paga(all_adata[method_name], groups="louvain", neighbors_key='neighbors')
        elif clustering_algo == "leiden":
            clus = np.array(leidenalg.find_partition(graph=G,
                                                     partition_type=leidenalg.RBConfigurationVertexPartition,
                                                     weights=weights,
                                                     resolution_parameter=resol,
                                                     seed=seed).membership)
            all_adata[method_name].obs['leiden'] = pd.Categorical(values=clus.astype('U'),
                                                                  categories=natsorted(map(str, np.unique(clus))))
            sc.tl.paga(all_adata[method_name], groups="leiden")
    print('\t\t\tHubness and PAGA full pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### PAGA stability ###
    start = time.time()
    all_iter = dict()
    cell_iter = dict()
    feat_iter = dict()
    for method_name, (hubness, hubness_params) in hubness_methods.items():
        all_iter[method_name] = dict()
        cell_iter[method_name] = np.zeros((n_iter, adata.n_obs))
        feat_iter[method_name] = np.zeros((n_iter, adata.n_vars))
        for it in tqdm(range(n_iter)):
            # bootstrap cells and features: keep entries whose uniform draw
            # falls below bootstrap_size (equivalent to the original 0/1 masking)
            feat_bootstrap = np.random.uniform(0, 1, size=adata.n_vars) <= bootstrap_size
            cell_bootstrap = np.random.uniform(0, 1, size=adata.n_obs) <= bootstrap_size
            cell_iter[method_name][it, :] = cell_bootstrap
            feat_iter[method_name][it, :] = feat_bootstrap
            uns = {'Order': true_labels[cell_bootstrap]}
            adata_sampled = anndata.AnnData(adata.X[cell_bootstrap][:, feat_bootstrap], uns=uns)
            n_clusters2 = len(np.unique(adata_sampled.uns['Order']))
            if do_pca:
                sc.tl.pca(adata_sampled, n_comps=min(adata_sampled.X.shape[1] - 1,
                                                     min(len(adata_sampled.X) - 1, n_comps)))
                X2 = adata_sampled.obsm['X_pca']
            else:
                X2 = adata_sampled.X
            adata_sampled.obsp["connectivities"] = kneighbors_graph(X2, n_neighbors=n_neighbors,
                                                                    hubness=hubness,
                                                                    hubness_params=hubness_params,
                                                                    metric=metric, mode="connectivity")
            adata_sampled.obsp["distances"] = kneighbors_graph(X2, n_neighbors=n_neighbors,
                                                               hubness=hubness,
                                                               hubness_params=hubness_params,
                                                               metric=metric, mode="distance")
            adata_sampled.uns['neighbors'] = {'connectivities_key': 'connectivities',
                                              'distances_key': 'distances',
                                              'params': {'n_neighbors': n_neighbors,
                                                         'method': method_name,
                                                         'metric': metric}}
            G2, weights2 = generate_clustering_inputs(X=X2, metric=metric,
                                                      n_neighbors=n_neighbors,
                                                      weighted=weighted, seed=seed,
                                                      hubness=hubness, hubness_params=hubness_params)
            resol2, weighted2 = getNclusters(adata_sampled, G2, n_clusters=n_clusters2,
                                             seed=seed, clustering_algo=clustering_algo,
                                             flavor='base', weights=weights2)
            if clustering_algo == "leiden":
                clus = np.array(leidenalg.find_partition(graph=G2,
                                                         partition_type=leidenalg.RBConfigurationVertexPartition,
                                                         weights=weights2,
                                                         resolution_parameter=resol2,
                                                         seed=seed).membership)
                adata_sampled.obs['leiden'] = pd.Categorical(values=clus.astype('U'),
                                                             categories=natsorted(map(str, np.unique(clus))))
                sc.tl.paga(adata_sampled, groups="leiden")
            elif clustering_algo == "louvain":
                clus = np.array(louvain.find_partition(graph=G2,
                                                       partition_type=louvain.RBConfigurationVertexPartition,
                                                       weights=weights2,
                                                       resolution_parameter=resol2,
                                                       seed=seed).membership)
                adata_sampled.obs['louvain'] = pd.Categorical(values=clus.astype('U'),
                                                              categories=natsorted(map(str, np.unique(clus))))
                sc.tl.paga(adata_sampled, groups="louvain")
            all_iter[method_name]['iter' + str(it)] = adata_sampled.uns["paga"]["connectivities_tree"]
    print('\t\t\tPAGA stability pipeline:', round((time.time() - start) / 60, 2), 'mn')

    ### save results ###
    for method_name in all_adata.keys():
        if method_name == "nothing":
            # keep the full matrix only for the baseline; other methods keep
            # two columns to reduce file size
            all_adata[method_name] = anndata.AnnData(X=all_adata[method_name].X,
                                                     uns={'Order': all_adata[method_name].uns['Order'],
                                                          'paga': all_adata[method_name].uns['paga']},
                                                     obs=all_adata[method_name].obs)
        else:
            all_adata[method_name] = anndata.AnnData(X=all_adata[method_name].X[:, :2],
                                                     uns={'Order': all_adata[method_name].uns['Order'],
                                                          'paga': all_adata[method_name].uns['paga']},
                                                     obs=all_adata[method_name].obs)
        all_adata[method_name].write_h5ad(filename=get_res_path(fname) + '_' + method_name + ".h5ad")
        if method_name not in ["umap", "gauss"]:
            with open(get_res_path(fname) + '_' + method_name + "_stab.csv", "w") as f:
                w = csv.writer(f)
                for key, val in all_iter[method_name].items():
                    w.writerow([key, val])
            np.savetxt(get_res_path(fname) + '_' + method_name + "_stab_cell.csv",
                       cell_iter[method_name], delimiter=',', fmt='%d')
            np.savetxt(get_res_path(fname) + '_' + method_name + "_stab_feat.csv",
                       feat_iter[method_name], delimiter=',', fmt='%d')
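# Usage sketch for ti_analysis (hypothetical data and labels; `fname` and
# `get_res_path` are assumed to be defined at module level, as in the
# original script):
import anndata
import numpy as np

rng = np.random.RandomState(0)
labels = np.repeat(['A', 'B', 'C', 'D'], 75)
adata_demo = anndata.AnnData(rng.poisson(1.0, size=(300, 2000)).astype(np.float32),
                             uns={'Order': labels})
ti_analysis(adata_demo, true_labels=labels,
            do_norm='duo', norm_scale=True, do_log=True, do_pca=True,
            n_clusters=4, metric='euclidean', weighted=True, seed=0,
            n_comps=50, clustering_algo='louvain',
            n_iter=10, bootstrap_size=0.95)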