def test_spectral_eigen_tol_auto(monkeypatch, solver):
    """Test that `eigen_tol="auto"` is resolved correctly"""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")
    X, _ = make_blobs(n_samples=200, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix

    solver_func = eigsh if solver == "arpack" else lobpcg
    default_value = 0 if solver == "arpack" else None
    if solver == "amg":
        S = sparse.csr_matrix(S)

    mocked_solver = Mock(side_effect=solver_func)
    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__,
                        mocked_solver)

    spectral_embedding(S, random_state=42, eigen_solver=solver,
                       eigen_tol="auto")
    mocked_solver.assert_called()

    _, kwargs = mocked_solver.call_args
    assert kwargs["tol"] == default_value

def test_spectral_embedding_deterministic():
    # Test that spectral embedding is deterministic
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    embedding_1 = spectral_embedding(sims)
    embedding_2 = spectral_embedding(sims)
    assert_array_almost_equal(embedding_1, embedding_2)

def __sklearn_spectral_clustering(adj_matrix, n_clusters):
    """
    :param adj_matrix: adjacency matrix representation of a graph, where
        entry [m][n] > 0 gives the weight of the edge between m and n
    :param n_clusters: number of clusters to partition into
    :return: labels, number of k-means iterations needed, and the size of
        the smallest cluster found
    """
    from sklearn.cluster import k_means
    from sklearn.neighbors import kneighbors_graph
    from sklearn.manifold import spectral_embedding

    connectivity = kneighbors_graph(adj_matrix, n_neighbors=10,
                                    include_self=True)
    affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
    eigen_vectors = spectral_embedding(
        affinity_matrix_,
        n_components=n_clusters,
        eigen_solver="arpack",
        eigen_tol=0.0,
        norm_laplacian=True,
        drop_first=False,
    )
    _, labels, _, num_iterations = k_means(
        eigen_vectors, n_clusters=n_clusters, return_n_iter=True)
    # For binary labels: the smaller of (#ones, #zeros)
    smallest_cluster_size = min(np.sum(labels),
                                abs(np.sum(labels) - labels.size))
    return labels, num_iterations, smallest_cluster_size

def spectral_clustering_scores(train_test_split, random_state=0):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack the input split

    start_time = time.time()
    sc_scores = {}

    # Perform spectral clustering link prediction
    spectral_emb = spectral_embedding(adj_train, n_components=16,
                                      random_state=random_state)
    sc_score_matrix = np.dot(spectral_emb, spectral_emb.T)
    runtime = time.time() - start_time

    sc_test_roc, sc_test_ap = get_roc_score(test_edges, test_edges_false,
                                            sc_score_matrix,
                                            apply_sigmoid=True)
    sc_val_roc, sc_val_ap = get_roc_score(val_edges, val_edges_false,
                                          sc_score_matrix,
                                          apply_sigmoid=True)

    # Record scores
    sc_scores['test_roc'] = sc_test_roc
    # sc_scores['test_roc_curve'] = sc_test_roc_curve
    sc_scores['test_ap'] = sc_test_ap
    sc_scores['val_roc'] = sc_val_roc
    # sc_scores['val_roc_curve'] = sc_val_roc_curve
    sc_scores['val_ap'] = sc_val_ap
    sc_scores['runtime'] = runtime
    return sc_scores

def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans'):
    if assign_labels not in ('kmeans', 'discretize', 'AgglomerativeClustering'):
        raise ValueError("The 'assign_labels' parameter should be 'kmeans', "
                         "'discretize' or 'AgglomerativeClustering', "
                         "but '%s' was given" % assign_labels)
    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)
    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters)
    else:
        labels = discretize(maps, random_state=random_state)
    return labels

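# Illustrative usage sketch for the spectral_clustering variant above.
# Assumptions: the helpers it uses (spectral_embedding, k_means, discretize,
# check_random_state) are imported from scikit-learn as in the original
# module; the two-moons data and the gamma value are arbitrary demo choices.
import numpy as np
from sklearn.datasets import make_moons
from sklearn.metrics.pairwise import rbf_kernel

X_demo, _ = make_moons(n_samples=100, noise=0.05, random_state=0)
affinity_demo = rbf_kernel(X_demo, gamma=20.0)  # symmetric affinity matrix
labels_demo = spectral_clustering(affinity_demo, n_clusters=2, random_state=0)
print(np.bincount(labels_demo))  # sizes of the two clusters
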
def fit(self, X, y=None):
    """Fit the model from data in X.

    Parameters
    ----------
    X : ndarray, shape (n_trials, n_channels, n_channels)
        ndarray of SPD matrices.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    affinity_matrix = self._get_affinity_matrix(X, self.eps)
    embd = spectral_embedding(adjacency=affinity_matrix,
                              n_components=self.n_components,
                              norm_laplacian=True)
    # Normalize the embedding between -1 and +1
    embdn = 2 * (embd - embd.min(0)) / embd.ptp(0) - 1
    self.embedding_ = embdn
    return self

def fit(self, X, y=None):
    """Fit the model from data in X.

    Parameters
    ----------
    X : ndarray, shape (n_matrices, n_channels, n_channels)
        Set of SPD matrices.
    y : None
        Not used, here for compatibility with sklearn API.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    _check_dimensions(X, n_components=self.n_components)
    affinity_matrix = self._get_affinity_matrix(X, self.eps)
    embd = spectral_embedding(adjacency=affinity_matrix,
                              n_components=self.n_components,
                              norm_laplacian=True)
    # Normalize the embedding between -1 and +1
    embdn = 2 * (embd - embd.min(0)) / embd.ptp(0) - 1
    self.embedding_ = embdn
    return self

def spectral_clustering_scores(train_test_split, random_state=0):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split  # Unpack input

    start_time = time.time()
    sc_scores = {}

    # Perform spectral clustering link prediction
    spectral_emb = spectral_embedding(adj_train, n_components=16,
                                      random_state=random_state)
    sc_score_matrix = np.dot(spectral_emb, spectral_emb.T)
    runtime = time.time() - start_time

    sc_test_roc, sc_test_ap = get_roc_score(test_edges, test_edges_false,
                                            sc_score_matrix,
                                            apply_sigmoid=True)
    sc_val_roc, sc_val_ap = get_roc_score(val_edges, val_edges_false,
                                          sc_score_matrix,
                                          apply_sigmoid=True)

    # Record scores
    sc_scores['test_roc'] = sc_test_roc
    # sc_scores['test_roc_curve'] = sc_test_roc_curve
    sc_scores['test_ap'] = sc_test_ap
    sc_scores['val_roc'] = sc_val_roc
    # sc_scores['val_roc_curve'] = sc_val_roc_curve
    sc_scores['val_ap'] = sc_val_ap
    sc_scores['runtime'] = runtime
    return sc_scores

def order_func(times, data):
    this_data = data[:, (times > 0.0) & (times < 0.350)]
    this_data /= np.sqrt(np.sum(this_data ** 2, axis=1))[:, np.newaxis]
    return np.argsort(spectral_embedding(rbf_kernel(this_data, gamma=1.),
                                         n_components=1,
                                         random_state=0).ravel())

def matrix_factorization(model_name):
    """Generate an embedding by Laplacian eigenmaps."""
    embedding = spectral_embedding(nx.adjacency_matrix(ut.graph),
                                   n_components=64)
    # Save the embedding
    np.save(ut.embedding_path + model_name + "_embedding.npy", embedding)
    return embedding

def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on github)
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    X = X.astype(dtype)
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg",
            random_state=i + 1
        )
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)

def get_spectral_coords(ad, n_dim=2, epsilon=1e-14):
    # Turn nonzero distances into similarities:
    # larger distance -> smaller affinity
    max_val = np.amax(ad)
    # ad = (max_val + epsilon) - ad
    # ad = 1 / (ad + epsilon)
    nonzero = np.nonzero(ad)
    ad[nonzero] = (max_val + epsilon) - ad[nonzero]
    return manifold.spectral_embedding(ad, n_components=n_dim)

def spectral_embedding(graph, num_sets):
    embedding = manifold.spectral_embedding(graph,
                                            n_components=2 * num_sets,
                                            eigen_solver=None,
                                            random_state=None,
                                            eigen_tol=0.0,
                                            norm_laplacian=False,
                                            drop_first=True)
    embedding = normalize(embedding, axis=0, norm='l2')
    return embedding

def test_spectral_embedding_copy_variable(seed=36):
    # Test that spectral_embedding returns the same result
    # whether the "copy" input parameter is set to False or True
    random_state = np.random.RandomState(seed)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(sims, n_components=n_components,
                                     drop_first=False, copy=False)
    embedding_2 = spectral_embedding(sims, n_components=n_components,
                                     drop_first=False, copy=True)
    assert_array_almost_equal(embedding_1, embedding_2)

def fit_lumap(X, n_neighbors, metric, n_components=2):
    """
    Build the fuzzy simplices UMAP-style (via fuzzy unions of local metric
    spaces) and then embed the resulting matrix Laplacian-Eigenmaps-style
    (via spectral embedding of the graph Laplacian).
    """
    sparse_graph, sigmas, rhos = fuzzy_simplicial_set(
        X=X,
        random_state=check_random_state(0),
        n_neighbors=n_neighbors,
        metric=metric)
    return spectral_embedding(sparse_graph, n_components=n_components)

def node_features(graph, k):
    """
    Purpose: compute k-dimensional node features for the given graph.
    Input:   the graph whose node features are required.
    Output:  the feature vector of every node in the graph.
    """
    adj_matrix = graph_tool.spectral.adjacency(graph, weight=None, index=None)
    adj_matrix = adj_matrix.todense()
    print(adj_matrix.shape)
    node_feature = spectral_embedding(adj_matrix, n_components=k)
    return node_feature

def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans',
                        fuzzy_m=2, fuzzy_error=0.0005, fuzzy_maxiter=10000,
                        fuzzy_label_threshold=None):
    if assign_labels not in ('kmeans', 'fuzzy_cmeans', 'discretize'):
        raise ValueError(
            "The 'assign_labels' parameter should be "
            "'kmeans', 'fuzzy_cmeans' or 'discretize', but '%s' was given"
            % assign_labels)

    random_state_ = sp.check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = sp.k_means(maps, n_clusters,
                                  random_state=random_state_, n_init=n_init)
    elif assign_labels == 'fuzzy_cmeans':
        if fuzzy_label_threshold is None:
            fuzzy_label_threshold = 1. / n_clusters
        _, u, _, _, _, _, _ = fuzz.cluster.cmeans(
            np.exp(maps.T), n_clusters, seed=random_state,
            m=fuzzy_m, error=fuzzy_error, maxiter=fuzzy_maxiter)
        # from sklearn.mixture import GMM
        # gmm = GMM(n_components=n_clusters, covariance_type='full',
        #           random_state=random_state, n_init=n_init).fit(maps)
        # u = gmm.predict_proba(maps)
        # u = u.T
        assignments = np.argwhere(u.T >= fuzzy_label_threshold)
        labels = [[] for _ in range(u.shape[1])]
        for row in assignments:
            labels[row[0]].append(row[1])
    else:
        labels = sp.discretize(maps, random_state=random_state_)
    return labels

def spectral_embedding(self, n):
    """
    Embed the points using a spectral decomposition of the Laplacian of
    the affinity matrix.

    Parameters
    ----------
    n : int
        The number of dimensions.
    """
    coords = spectral_embedding(self._affinity, n)
    return CoordinateMatrix(normalise_rows(coords))

def embed(self, save=False):
    try:
        embedding = np.load("./results2/embedding.npy")
    except (IOError, OSError):  # embedding not cached yet
        G = self.indexedGraph()
        print("starting manifold learning...")
        A = nx.adjacency_matrix(G, nodelist=G.nodes())
        embedding = manifold.spectral_embedding(A,
                                                n_components=self.embedding_n)
        if save:
            np.save("./results2/embedding.npy", embedding)
    self.embedding = embedding
    return embedding

def my_spectral(graph):
    ad = nx.to_numpy_array(graph)
    a = manifold.spectral_embedding(ad, n_components=2)
    xs = a[:, 0]
    ys = a[:, 1]
    plt.scatter(xs, ys)
    for i in range(len(xs)):
        plt.annotate(i, (xs[i], ys[i]))
    plt.show()

def test_normalized_embedding(self):
    x = np.array([[1, 0], [0, 1], [3, 0], [4, 1]])
    sc = SpectralClustering(2)
    sc.affinity_matrix_ = sc._get_affinity_matrix(x)
    embedding_features_standard = spectral_embedding(
        sc.affinity_matrix_, n_components=2,
        norm_laplacian=True, drop_first=False)
    embedding_features = sc._get_embedding(norm_laplacian=True)

    all_one_vector = embedding_features[:, 0] / embedding_features[0, 0]
    assert_array_almost_equal(all_one_vector, np.ones(4))

    second_vector = embedding_features[:, 1] / embedding_features[0, 1]
    second_vector_standard = (embedding_features_standard[:, 1]
                              / embedding_features_standard[0, 1])
    assert_array_almost_equal(second_vector, second_vector_standard)

def test_spectral_embedding_amg_solver_failure():
    # Non-regression test for amg solver failure (issue #13393 on github)
    pytest.importorskip('pyamg')
    seed = 36
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(sym_matrix, n_components=10,
                                   eigen_solver='amg', random_state=0)

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(sym_matrix, n_components=10,
                                           eigen_solver='amg',
                                           random_state=i + 1)
        assert _check_with_col_sign_flipping(embedding, new_embedding,
                                             tol=0.05)

def fit_laplacian_eigenmaps(X, n_neighbors=20, metric=EUCLIDEAN,
                            n_components=2):
    """
    spectral_embedding expects an affinity matrix that already has the
    similarity kernel applied. We apply the exponent here to be consistent
    with the mapping from distances to fuzzy simplices (affinities)
    via -log.
    """
    print("Computing adjacency_matrix with {} neighbors and {} metric".format(
        n_neighbors, metric))
    graph = get_adjacency_matrix(X=X, n_neighbors=n_neighbors, metric=metric)
    print("Computing spectral_embedding")
    # Use the requested dimensionality rather than a hard-coded 2
    return spectral_embedding(graph, n_components=n_components)

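# Illustrative sketch of the docstring's point above: spectral_embedding
# expects similarities, not raw distances, so a kernel must be applied
# first. The Gaussian kernel and the median-based bandwidth here are
# arbitrary choices, not part of the original code.
import numpy as np
from sklearn.manifold import spectral_embedding
from sklearn.metrics import pairwise_distances

X_demo = np.random.RandomState(0).randn(50, 4)
D_demo = pairwise_distances(X_demo)
affinity_demo = np.exp(-D_demo ** 2 / (2 * np.median(D_demo) ** 2))
emb_demo = spectral_embedding(affinity_demo, n_components=2, random_state=0)
print(emb_demo.shape)  # (50, 2)
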
def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding is also processing the unnormalized
    # laplacian correctly
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(sims,
                                     norm_laplacian=False,
                                     n_components=n_components,
                                     drop_first=False)

    # Verify using manual computation with dense eigh
    laplacian, dd = graph_laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    embedding_2 = diffusion_map.T[:n_components] * dd
    embedding_2 = _deterministic_vector_sign_flip(embedding_2).T

    assert_array_almost_equal(embedding_1, embedding_2)

def test_spectral_embedding(self):
    N = 10
    # np.random.random_integers is deprecated; randint's upper bound
    # is exclusive, hence 201
    m = np.random.randint(50, 201, size=(N, N))
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    self.assert_numpy_array_almost_equal(df.data.values, m)

    result = df.manifold.spectral_embedding(random_state=self.random_state)
    expected = manifold.spectral_embedding(m, random_state=self.random_state)

    self.assertTrue(isinstance(result, pdml.ModelFrame))
    self.assert_index_equal(result.index, df.index)

    # signs can be inverted
    self.assert_numpy_array_almost_equal(np.abs(result.data.values),
                                         np.abs(expected))

def test_spectral_embedding(self):
    N = 10
    # np.random.random_integers is deprecated; randint's upper bound
    # is exclusive, hence 201
    m = np.random.randint(50, 201, size=(N, N))
    m = (m + m.T) / 2

    df = pdml.ModelFrame(m)
    self.assert_numpy_array_almost_equal(df.data.values, m)

    result = df.manifold.spectral_embedding(random_state=self.random_state)
    expected = manifold.spectral_embedding(m, random_state=self.random_state)

    self.assertIsInstance(result, pdml.ModelFrame)
    self.assert_index_equal(result.index, df.index)

    # signs can be inverted
    self.assert_numpy_array_almost_equal(np.abs(result.data.values),
                                         np.abs(expected))

def spectral_clustering_scores(train_test_split, random_state=0):
    adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
        test_edges, test_edges_false = train_test_split

    start_time = time.time()
    sc_score = {}

    # Spectral clustering
    spectral_emb = spectral_embedding(adj_train, n_components=16,
                                      random_state=random_state)
    sc_score_matrix = np.dot(spectral_emb, spectral_emb.T)

    train_edges_corr, train_edges_label = get_correlation(
        train_edges, train_edges_false, sc_score_matrix)
    if len(val_edges) > 0 and len(val_edges_false) > 0:
        val_edges_corr, val_edges_label = get_correlation(
            val_edges, val_edges_false, sc_score_matrix)
    test_edges_corr, test_edges_label = get_correlation(
        test_edges, test_edges_false, sc_score_matrix)

    classifier = get_prediction_model(train_edges_corr, train_edges_label)
    if len(val_edges) > 0 and len(val_edges_false) > 0:
        val_preds = classifier.predict(val_edges_corr)
    test_preds = classifier.predict(test_edges_corr)
    run_time = time.time() - start_time

    if len(val_edges) > 0 and len(val_edges_false) > 0:
        sc_val_roc = roc_auc_score(val_edges_label, val_preds)
        sc_val_avg = average_precision_score(val_edges_label, val_preds)
    else:
        sc_val_roc = None
        sc_val_avg = None
    sc_test_roc = roc_auc_score(test_edges_label, test_preds)
    sc_test_avg = average_precision_score(test_edges_label, test_preds)

    # sc_test_roc, sc_test_ap = get_roc_score(test_edges, test_edges_false,
    #                                         sc_score_matrix, apply_sigmoid=True)
    # sc_val_roc, sc_val_ap = get_roc_score(val_edges, val_edges_false,
    #                                       sc_score_matrix, apply_sigmoid=True)

    sc_score['test_roc'] = sc_test_roc
    sc_score['test_ap'] = sc_test_avg
    sc_score['val_roc'] = sc_val_roc
    sc_score['val_ap'] = sc_val_avg
    sc_score['run_time'] = run_time
    return sc_score

def test_spectral_embedding_first_eigen_vector():
    # Test that the first eigenvector of spectral_embedding
    # is constant and that the second is not (for a connected graph)
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 2

    for seed in range(10):
        embedding = spectral_embedding(sims,
                                       norm_laplacian=False,
                                       n_components=n_components,
                                       drop_first=False,
                                       random_state=seed)
        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3

def le(data, k=10, target_dim=2):
    graph = KNN.knn(data, k)
    A = construct_mesh(data, graph)
    from sklearn import manifold
    # Delegate to sklearn; the manual implementation below is kept for
    # reference but is unreachable because of this early return.
    return manifold.spectral_embedding(A, n_components=target_dim)

    D = construct_degree(A)
    L = D - A
    eigvals, eigvecs = scipy.linalg.eigh(A, L)
    index = np.argsort(eigvals)[::-1]
    eigvals = eigvals[index]
    eigvecs = eigvecs[:, index]
    return eigvecs[:, 1:target_dim + 1]

# print(le(npdata))

def initial_dictionary(n_clusters, X):
    """Create an initial dictionary."""
    from sklearn.cluster import MiniBatchKMeans

    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=0,
                             batch_size=200, n_init=10)
    kmeans = kmeans.fit(X.T)
    dictionary_ = kmeans.cluster_centers_
    # Normalize each atom to unit l2 norm
    dictionary = (dictionary_.T / np.sqrt((dictionary_ ** 2).sum(1))).T
    # Order the atoms along a 1-D spectral embedding of their correlations
    similarity = np.exp(np.corrcoef(dictionary))
    embedding = spectral_embedding(similarity, n_components=1)
    order = np.argsort(embedding.T).ravel()
    dictionary = dictionary[order]
    return dictionary

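# Standalone illustration of the ordering trick used in initial_dictionary:
# rows are sorted along a one-component spectral embedding of their
# correlation-based similarity. The random data is purely illustrative.
import numpy as np
from sklearn.manifold import spectral_embedding

rows = np.random.RandomState(0).randn(8, 100)
similarity_demo = np.exp(np.corrcoef(rows))  # positive and symmetric
embedding_demo = spectral_embedding(similarity_demo, n_components=1)
order_demo = np.argsort(embedding_demo.ravel())
rows_ordered = rows[order_demo]
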
def compute_manifold_eigenvector(graph: nx.Graph, num: int,
                                 normalised: bool = False):
    """
    Computes the eigenvectors through the amg solver.

    :param graph: graph on which the eigenvectors are computed
    :param num: number of eigenvectors to be computed
    :param normalised: whether the Laplacian matrix should be normalised
    :return: embedding whose columns are the eigenvectors
    """
    embedding = spectral_embedding(
        nx.adjacency_matrix(graph),
        n_components=num,
        eigen_solver='amg',
        random_state=0,  # int(os.environ["random_state_embedding"]),
        eigen_tol=0.0,
        drop_first=False,
        norm_laplacian=normalised)
    return embedding

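# Illustrative call sketch for compute_manifold_eigenvector. Assumptions:
# pyamg is installed (the 'amg' solver fails without it) and the graph is
# connected; the Watts-Strogatz graph is arbitrary demo input.
import networkx as nx

G_demo = nx.connected_watts_strogatz_graph(n=100, k=6, p=0.1, seed=0)
emb_demo = compute_manifold_eigenvector(G_demo, num=10, normalised=True)
print(emb_demo.shape)  # (100, 10)
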
assign_undirected_weight(W, 1, 3, 0.22)
assign_undirected_weight(W, 1, 4, 0.24)
assign_undirected_weight(W, 2, 3, 0.2)
assign_undirected_weight(W, 2, 4, 0.19)
assign_undirected_weight(W, 3, 4, 1)

# Degree matrix
D = np.zeros((n, n))
for i in V:
    D[i, i] = np.sum(W[i, :])
D[D == 0] = 1e-8  # don't laugh, there is a core package in R that actually does this

print(W)
print(D)

# Symmetrically normalized affinity D^(-1/2) W D^(-1/2); note this needs
# matrix products on the diagonal degree matrix, not elementwise powers
D_hat = np.diag(np.diag(D) ** -0.5)
L = D_hat.dot(W).dot(D_hat)
print(L)
print("==================")

# labels = spectral_clustering(A, n_clusters=2)
random_state = check_random_state(None)
maps = spectral_embedding(L, n_components=2, eigen_solver=None,
                          random_state=random_state, eigen_tol=0.0,
                          drop_first=False)
print(maps)

_, labels, _ = k_means(maps, n_clusters=2, random_state=random_state,
                       n_init=10)
print(labels)

def reproducibility_selection(
        X, grp_mask, niter=2, method='ward', k_range=KRANGE,
        write_dir='/tmp', verbose=True):
    """Returns a reproducibility metric on bootstrapped models

    Parameters
    ----------
    X: array of shape (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array of shape (image_shape),
              the non-zero elements yield the spatial model
    niter: int, number of bootstrap samples estimated
    method: string, one of 'ward', 'kmeans', 'spectral'
    k_range: list of ints, the possible numbers of parcels to be tested
    """
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    # Concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))

    # Pre-computed quantities
    ic, jc = connectivity.nonzero()
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()

    maps = []
    for i in range(niter):
        bootstrap = (np.random.rand(Xv.shape[1]) * Xv.shape[1]).astype(int)
        X_ = Xv[:, bootstrap]
        if method == 'spectral':
            connectivity.data = np.exp(
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(connectivity,
                                           n_components=n_components,
                                           eigen_solver='arpack',
                                           random_state=None,
                                           eigen_tol=0.0, drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    ars_score = {}
    ami_score = {}
    vm_score = {}
    for (ik, k_) in enumerate(k_range):
        label_ = []
        for i in range(niter):
            bootstrap = (np.random.rand(Xv.shape[1])
                         * Xv.shape[1]).astype(int)
            if method == 'spectral':
                if k_ <= n_components:
                    for _ in range(10):
                        labels = discretize(maps[i][:, :k_])
                        if len(np.unique(labels)) == k_:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[i], n_clusters=k_, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'ward':
                ward = Ward(n_clusters=k_,
                            connectivity=connectivity).fit(maps[i])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[i], n_clusters=k_, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k_, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            label_.append(labels)
        ars_score[k_] = reproducibility_rating(label_, 'ars')
        ami_score[k_] = reproducibility_rating(label_, 'ami')
        vm_score[k_] = reproducibility_rating(label_, 'vm')
        if verbose:
            print('k: ', k_, ' ari: ', ars_score[k_],
                  ' ami: ', ami_score[k_], ' vm: ', vm_score[k_])

    with open(path.join(write_dir, 'ari_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ars_score, f)
    with open(path.join(write_dir, 'ami_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ami_score, f)
    with open(path.join(write_dir, 'vm_score_%s.pck' % method), 'wb') as f:
        pickle.dump(vm_score, f)
    return ars_score, ami_score, vm_score

def parcel_cv(X, grp_mask, write_dir='/tmp/', method='ward', n_folds=10,
              k_range=KRANGE, verbose=True):
    """Function dedicated to parcel selection using 10-fold cross-validation"""
    from sklearn.cross_validation import KFold, ShuffleSplit

    # Define the structure A of the data: pixels connected to their neighbors
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    ic, jc = connectivity.nonzero()

    # Concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()

    # Pre-compute the embeddings for the cross-validation loops
    if n_folds == int(n_folds):
        cv = KFold(X.shape[2], n_folds)
    else:
        cv = ShuffleSplit(X.shape[2], 10, .2)
    maps = []
    for (train, test) in cv:
        X_ = np.reshape(X[:, :, train], (n_voxels, n_contrasts * len(train)))
        if method == 'spectral':
            connectivity.data = np.exp(
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(connectivity,
                                           n_components=n_components,
                                           eigen_solver='arpack',
                                           random_state=None,
                                           eigen_tol=0.0, drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    # Parcel selection
    all_crit = {}
    for k in k_range:
        ll, ll_cv = 0, 0
        for (it, (train, test)) in enumerate(cv):
            if method == 'ward':
                ward = Ward(n_clusters=k,
                            connectivity=connectivity).fit(maps[it])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[it], n_clusters=k, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'spectral':
                if k <= n_components:
                    for i in range(10):
                        labels = discretize(maps[it][:, :k])
                        if len(np.unique(labels)) == k:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[it], n_clusters=k, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            for contrast in range(n_contrasts):
                ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                    X[:, contrast, train], labels, null=False)
                ll += ll1.sum()
                ll2 = log_likelihood_map(
                    X[:, contrast, test], labels, mu_, sigma1_, sigma2_)
                ll_cv += ll2.sum()
        all_crit[k] = ll_cv
        if verbose:
            print('k: ', k, ' ll: ', ll, ' ll_cv: ', ll_cv)

    with open(path.join(write_dir, 'll_cv_%s.pck' % method), 'wb') as f:
        pickle.dump(all_crit, f)
    return all_crit

def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        k=None, eigen_tol=0.0, assign_labels='kmeans',
                        mode=None):
    """Apply clustering to a projection of the normalized Laplacian.

    In practice spectral clustering is very useful when the structure of
    the individual clusters is highly non-convex, or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster, for instance when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    ----------
    affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
            - adjacency matrix of a graph,
            - heat kernel of the pairwise distance matrix of the samples,
            - symmetric k-nearest neighbours connectivity matrix of the
              samples.

    n_clusters : integer, optional
        Number of clusters to extract.

    n_components : integer, optional, default is n_clusters
        Number of eigenvectors to use for the spectral embedding.

    eigen_solver : {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities.

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization of
        the lobpcg eigenvectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for the eigendecomposition of the Laplacian
        matrix when using the arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding space.
        There are two ways to assign labels after the Laplacian
        embedding: k-means can be applied and is a popular choice, but it
        can be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.
    maps : array, shape: (n_samples, n_components)
        The spectral embedding from which the labels were computed.

    References
    ----------
    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    -----
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
""" if not assign_labels in ('kmeans', 'discretize'): raise ValueError("The 'assign_labels' parameter should be " "'kmeans' or 'discretize', but '%s' was given" % assign_labels) if not k is None: warnings.warn("'k' was renamed to n_clusters and will " "be removed in 0.15.", DeprecationWarning) n_clusters = k if not mode is None: warnings.warn("'mode' was renamed to eigen_solver " "and will be removed in 0.15.", DeprecationWarning) eigen_solver = mode random_state = check_random_state(random_state) n_components = n_clusters if n_components is None else n_components maps = spectral_embedding(affinity, n_components=n_components, eigen_solver=eigen_solver, random_state=random_state, eigen_tol=eigen_tol, drop_first=False) if assign_labels == 'kmeans': _, labels, _ = k_means(maps, n_clusters, random_state=random_state, n_init=n_init) else: labels = discretize(maps, random_state=random_state) return labels, maps
def drc(X, k, Gamma=0.5, Const=1.0):
    # X = data (n, d), k = number of clusters, Gamma = 1 / sigma^2
    n = X.shape[0]
    d = X.shape[1]
    A = np.eye(d)
    H = np.eye(n) - (1.0 / n) * np.ones((n, n))
    U_converged = False
    delta = 0.001
    escape_count = 20
    output = {}

    # Calculate the initial embedding and allocation
    # [output['init_allocation'], U] = spectral_clustering(X, k, 3)
    C = sklearn.metrics.pairwise.rbf_kernel(X, gamma=Gamma)
    U = spectral_embedding(C, n_components=k)
    clf = KMeans(n_clusters=k)
    allocation = clf.fit_predict(U)
    output['init_allocation'] = allocation

    while not U_converged:
        for rep in range(12):
            part_1 = np.linalg.inv(A + delta * np.eye(d))
            part_2 = X.T.dot(H).dot(U).dot(U.T).dot(H).dot(X)
            n_1 = np.linalg.norm(part_1, 'fro')
            n_2 = np.linalg.norm(part_2, 'fro')
            lmbda = n_1 / n_2
            # Increase the penalty until FI has at least one negative
            # eigenvalue
            for count in range(escape_count):
                FI = part_1 - lmbda * Const * np.power(1.1, count + 1) * part_2
                V, D = eig_sorted(FI)
                reduced_dim = np.sum(D < 0)
                if reduced_dim >= 1:
                    break
            else:
                print('Error: your Const is too small, try a larger value.')
                exit()
            L = V[:, -reduced_dim:]
            new_A = L.dot(L.T)
            if np.linalg.norm(new_A - A) < 0.001 * np.linalg.norm(A):
                break
            A = new_A

        embed_dim = min(k, reduced_dim)
        C = sklearn.metrics.pairwise.rbf_kernel(X.dot(L), gamma=Gamma)
        U_new = spectral_embedding(C, n_components=embed_dim)
        U_diff = np.linalg.norm(U_new[:, :embed_dim] - U[:, :embed_dim])
        print(U_diff)
        if U_diff < 0.001 * np.linalg.norm(U):
            U_converged = True
            output['allocation'] = allocation
            output['L'] = L
            print(L.shape)
        U = U_new[:, :k]
        clf = KMeans(n_clusters=k)
        allocation = clf.fit_predict(U)

    return output

    mdata = np.vstack((mdata, data[i]))

for i in lnames:
    print(i)

cmatrix = correlationMatrix(mdata, 0, 400000, 10)
corrmin = 0.1
# Clip negative correlations to zero
for i in range(cmatrix.shape[0]):
    for j in range(cmatrix.shape[1]):
        if cmatrix[i, j] < 0:
            # cmatrix[i, j] = -cmatrix[i, j]
            cmatrix[i, j] = 0

Pr = spectral_embedding(cmatrix, n_components=3)
labels = spectral_clustering(cmatrix, n_clusters=6, eigen_solver='arpack',
                             assign_labels='discretize')
# clcen, labels = affinity_propagation(cmatrix, damping=0.5)

cm_bright = ListedColormap(['#000000', '#FF0000', '#00FF00', '#0000FF',
                            '#FF00FF', '#FFFF00', '#00FFFF', '#9999FF',
                            '#FF9999', '#99FF99'])
# print(Pr)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
plt.scatter(Pr[:, 0], Pr[:, 1], zs=Pr[:, 2], c=labels, cmap=cm_bright,
            s=25, marker='o')

def parcel_selection(X, grp_mask, write_dir='/tmp/', method='ward',
                     k_range=KRANGE, criterion='ll', verbose=True):
    """Function dedicated to parcel selection"""
    # Define the structure A of the data: pixels connected to their neighbors
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    # Concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    X_ = PCA(n_components=n_components).fit_transform(Xv)
    if method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((Xv[i] - Xv[j]) ** 2, 1).mean()
        connectivity.data = np.exp(
            - np.sum((Xv[i] - Xv[j]) ** 2, 1) / (2 * sigma))
        maps = spectral_embedding(connectivity, n_components=n_components,
                                  eigen_solver='arpack', random_state=None,
                                  eigen_tol=0.0, drop_first=False)
    del Xv

    # Parcel selection
    all_bic = {}
    all_crit = {}
    for k in k_range:
        if method == 'ward':
            ward = Ward(n_clusters=k, connectivity=connectivity).fit(X_)
            labels = ward.labels_
        elif method == 'spectral':
            if k <= n_components:
                for i in range(10):
                    labels = discretize(maps[:, :k])
                    if len(np.unique(labels)) == k:
                        break
            else:
                _, labels, _ = k_means(maps[:, :100], n_clusters=k,
                                       n_init=1, precompute_distances=False,
                                       max_iter=10)
        elif method == 'geometric':
            xyz = np.array(np.where(grp_mask)).T
            _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method in ['k-means', 'kmeans']:
            _, labels, _ = k_means(X_, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method == 'gmm':
            from sklearn.mixture import GMM
            labels = GMM(n_components=k, covariance_type='spherical',
                         n_iter=10, n_init=1).fit(X_).predict(X_)

        ll, bic = 0, 0
        for contrast in range(n_contrasts):
            ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                X[:, contrast], labels, null=False)
            bic += bic_.sum()
            if criterion == 'log-LR':
                ll2, _, _, _, bic_ = parameter_map(
                    X[:, contrast], labels, null=True)
                ll += np.sum(ll1 - ll2)
            elif criterion == 'll':
                ll += np.sum(ll1)
            elif criterion == 'sigma':
                ll = (sigma1_.mean(), sigma2_.mean())
            elif criterion == 'kfold':
                ll += score_spatial_model(X[:, contrast], labels, cv='kfold')
        all_crit[k] = ll
        all_bic[k] = bic
        if verbose:
            print('k: ', k, ' bic: ', bic, ' crit: ', ll)

    criterion_files = {'log-LR': 'all_llr_%s.pck', 'll': 'all_ll_%s.pck',
                       'sigma': 'all_sigma_%s.pck',
                       'kfold': 'all_kfold_%s.pck'}
    if criterion in criterion_files:
        with open(path.join(write_dir,
                            criterion_files[criterion] % method), 'wb') as f:
            pickle.dump(all_crit, f)
    with open(path.join(write_dir, 'all_bic_%s.pck' % method), 'wb') as f:
        pickle.dump(all_bic, f)
    return all_crit, all_bic

import gzip
import pickle

import numpy

# Load the MNIST dataset
f = gzip.open('mnist.pkl.gz', 'rb')
# encoding='latin1' is needed to read the Python 2 MNIST pickle in Python 3
train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
f.close()

# Perform SC on the test set
data_x, data_y = test_set
k = 12
nClass = 500
A = kneighbors_graph(data_x, k)
V = spectral_embedding(A, n_components=10, drop_first=False)
V = V + numpy.absolute(numpy.min(V))
# V = V / numpy.amax(V)

km_model = KMeans(n_clusters=nClass)
ypred = km_model.fit_predict(V)
nmi = metrics.normalized_mutual_info_score(data_y, ypred)
print('The NMI is: %.4f' % nmi)

# V = numpy.float32(V)
f = gzip.open('EVD-test500.pkl.gz', 'wb')
pickle.dump([(V, data_y), 0, 0], f, protocol=2)
f.close()
# sio.savemat('V_train_10.mat', {'train_x': V, 'train_y': data_y})

def cloudstering(dendrogram, catalog, criteria, user_k, user_ams,
                 user_scalpars, user_iter, save_isol_leaves,
                 save_clust_leaves, save_branches, blind, rms, s2nlim,
                 locscal):
    """
    SCIMES main function. It collects the parents/children of all
    structures within the dendrogram, and their properties. It calls the
    affinity-matrix-related functions (for creation, rescaling, and
    cluster counting), and it runs the actual spectral clustering routine
    several times, each time calculating the silhouette of the current
    configuration. Input parameters are passed by the SpectralCloudstering
    class.

    Parameters
    ----------
    dendrogram : 'astrodendro.dendrogram.Dendrogram' instance
        The dendrogram to clusterize.

    catalog : 'astropy.table.table.Table' instance
        A catalog containing all properties of the dendrogram structures.
        Generally generated with the ppv_catalog module.

    header : 'astropy.io.fits.header.Header' instance
        The header of the FITS data the dendrogram was generated from.
        Necessary to obtain the assignment cubes.

    criteria : list of strings
        Clustering criteria referring to the structure properties in the
        catalog (default ['volume', 'luminosity']).

    user_k : int
        The expected number of clusters; if not provided it will be
        guessed automatically through the eigenvalues of the unsmoothed
        affinity matrix.

    user_ams : numpy array
        User-provided affinity matrix. If not furnished, it is
        automatically generated through the volume and/or luminosity
        criteria.

    user_scalpars : list of floats
        User-provided scaling parameters to smooth the affinity matrices.

    user_iter : int
        User-provided number of k-means iterations.

    save_isol_leaves : bool
        Consider the isolated leaves (without parent) as individual
        'clusters'. Useful for low-resolution data where the beam size
        corresponds to the size of a Giant Molecular Cloud.

    save_clust_leaves : bool
        Consider unclustered leaves as individual 'clusters'. This
        keyword will not include the isolated leaves without parents.

    save_all_leaves : bool
        Trigger both save_isol_leaves and save_clust_leaves.

    save_branches : bool
        Retain all isolated branches usually discarded by the cluster
        analysis.

    save_all : bool
        Trigger save_isol_leaves, save_clust_leaves, and save_branches.

    rms : int or float
        Noise level of the observation. Necessary to calculate the
        scaling parameter above a certain signal-to-noise ratio.

    s2nlim : int or float
        Signal-to-noise limit above which the scaling parameter is
        calculated. Needed only if rms is not np.nan.

    blind : bool
        If True, do not show the affinity matrices. Matplotlib is
        required when False.

    locscal : bool
        Smooth the affinity matrices using a local scaling technique.

    Returns
    -------
    clusts : list
        The dendrogram branch indexes corresponding to the identified
        clusters.

    catalog : 'astropy.table.table.Table' instance
        The input catalog updated with each dendrogram structure's
        parent, ancestor, number of leaves, and type ('T', trunks or
        branches without parent; 'B', branches with parent; 'L', leaves).
    AMs : numpy array
        The affinity matrices calculated by the algorithm.

    escalpars : list
        Estimated scaling parameters for the different affinity matrices.

    silhouette : float
        Silhouette of the best cluster configuration.
    """
    # Collect the connectivity and other information into more handy lists
    all_structures_idx = np.arange(len(catalog[criteria[0]].data),
                                   dtype='int')

    all_levels = []
    brc_levels = []

    all_leav_names = []
    all_leav_idx = []

    all_brc_names = []
    all_brc_idx = []

    all_parents = []
    all_children = []

    all_struct_names = []
    all_ancestors = []

    all_struct_ancestors = []
    all_struct_parents = []
    all_struct_types = []
    nleaves = []

    trunk_brs_idx = []
    two_clust_idx = []
    mul_leav_idx = []

    s2ns = []

    for structure_idx in all_structures_idx:
        s = dendrogram[structure_idx]
        all_levels.append(s.level)
        s2ns.append(dendrogram[structure_idx].height / rms)

        all_struct_names.append(str(s.idx))
        all_struct_ancestors.append(s.ancestor.idx)
        if s.parent:
            all_struct_parents.append(s.parent.idx)
        else:
            all_struct_parents.append(-1)

        nleaves.append(len(s.sorted_leaves()))

        ancestors = []
        anc = s.parent
        while anc is not None:
            ancestors.append(anc.idx)
            anc = anc.parent
        ancestors.append(s.idx)
        all_ancestors.append(ancestors)

        # If the structure is a leaf, find all its parents
        if s.is_leaf and s.parent is not None:
            par = s.parent
            all_leav_names.append(str(s.idx))

            parents = []
            while par is not None:
                parents.append(par.idx)
                par = par.parent
            parents.append(len(catalog[criteria[0]].data))  # This is the trunk!
            all_parents.append(parents)

        # If the structure is a branch, find all its leaves
        if s.is_branch:
            brc_levels.append(s.level)
            all_brc_idx.append(s.idx)
            all_brc_names.append(str(s.idx))

            children = []
            for leaf in s.sorted_leaves():
                children.append(leaf.idx)
            all_children.append(children)

            # Trunk branches
            if s.parent is None:
                trunk_brs_idx.append(s.idx)
                all_leav_idx = all_leav_idx + children
                if s.children[0].is_branch or s.children[1].is_branch:
                    mul_leav_idx = mul_leav_idx + children
                else:
                    two_clust_idx.append(s.idx)
                all_struct_types.append('T')
            else:
                all_struct_types.append('B')
        else:
            all_struct_types.append('L')

    two_clust_idx = np.unique(two_clust_idx).tolist()

    dict_parents = dict(zip(all_leav_names, all_parents))
    dict_children = dict(zip(all_brc_names, all_children))
    dict_ancestors = dict(zip(all_struct_names, all_ancestors))

    all_levels.append(-1)
    all_levels = np.asarray(all_levels)

    # Retrieve the needed properties from the catalog
    # and add fake "trunk" properties
    props = []
    for crit in criteria:
        prop = catalog[crit].data.tolist()
        tprop = sum(catalog[crit].data[trunk_brs_idx])
        prop.append(tprop)
        props.append(prop)

    s2ns.append(1)
    props.append(s2ns)

    # Generate the affinity matrices if not provided
    if user_ams is None:
        AMs = aff_matrix(len(all_leav_idx), len(catalog[criteria[0]].data),
                         all_leav_idx, all_brc_idx, brc_levels,
                         dict_children, props)
        if blind == False:
            # Show all affinity matrices
            for i, crit in enumerate(criteria):
                plt.matshow(AMs[i, :, :])
                plt.title('"' + crit + '" affinity matrix',
                          fontsize='medium')
                plt.xlabel('leaf index')
                plt.ylabel('leaf index')
                plt.colorbar()
    else:
        AMs = user_ams

    S2Nmat = AMs[-1, :, :]
    AMs = AMs[:-1, :, :]

    # Check whether the affinity matrix has more than 2 elements,
    # otherwise return everything as clusters ("save_all")
    if AMs.shape[1] <= 2:
        print("--- Not necessary to cluster. 'save_all' keyword triggered")
        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)

        clusts = all_leaves
        return clusts, AMs

    # Check whether the affinity matrix scaling parameters are provided
    # by the user; if so use them, otherwise they are estimated below
    scpars = np.zeros(len(criteria))
    if user_scalpars is not None:
        print("- Using user-provided scaling parameters")
        user_scalpars = np.asarray(user_scalpars)
        scpars[0:len(user_scalpars)] = user_scalpars

    print("- Start spectral clustering")

    # Select the criteria and merge the matrices
    escalpars = []
    AM = np.ones(AMs[0, :, :].shape)
    for i, crit in enumerate(criteria):
        print("-- Rescaling %s matrix" % crit)
        AMc, sigma = mat_smooth(AMs[i, :, :], S2Nmat, s2nlim=s2nlim,
                                scalpar=scpars[i], lscal=locscal)
        AM = AM * AMc
        escalpars.append(sigma)

    # Make the reduced affinity matrices
    mul_leav_mat = []
    for mli in mul_leav_idx:
        mul_leav_mat.append(all_leav_idx.index(mli))

    mul_leav_mat = np.asarray(mul_leav_mat)
    rAM = AM[mul_leav_mat, :]
    rAM = rAM[:, mul_leav_mat]

    if blind == False:
        # Show the final affinity matrix
        plt.matshow(AM)
        plt.colorbar()
        plt.title('Final Affinity Matrix')
        plt.xlabel('leaf index')
        plt.ylabel('leaf index')

    # Guess the number of clusters if not provided
    if user_k == 0:
        kg = guessk(rAM)
    else:
        kg = user_k - len(two_clust_idx)

    print("-- Guessed number of clusters = %i" % (kg + len(two_clust_idx)))

    if kg > 1:
        print("-- Number of k-means iterations: %i" % user_iter)

        # Find the best cluster number
        sils = []
        min_ks = max(2, kg - 15)
        max_ks = min(kg + 15, rAM.shape[0] - 1)
        clust_configs = []

        for ks in range(min_ks, max_ks):
            try:
                evecs = spectral_embedding(rAM, n_components=ks,
                                           eigen_solver='arpack',
                                           random_state=222,
                                           eigen_tol=0.0, drop_first=False)
                _, all_clusters, _ = k_means(evecs, ks, random_state=222,
                                             n_init=user_iter)
                sil = silhouette_score(evecs, np.asarray(all_clusters),
                                       metric='euclidean')
                clust_configs.append(all_clusters)
            except np.linalg.LinAlgError:
                sil = 0
            sils.append(sil)

        # Use the best cluster number to generate the clusters
        best_ks = sils.index(max(sils)) + min_ks
        print("-- Best cluster number found through SILHOUETTE (%f) = %i"
              % (max(sils), best_ks + len(two_clust_idx)))
        silhouette = max(sils)
        all_clusters = clust_configs[np.argmax(sils)]
    else:
        print("-- Not necessary to cluster")
        all_clusters = np.zeros(len(mul_leav_idx), dtype=np.int32)

    clust_branches = clust_cleaning(mul_leav_idx, all_clusters,
                                    dict_parents, dict_children,
                                    dict_ancestors,
                                    savebranches=save_branches)
    clusts = clust_branches + two_clust_idx

    print("-- Final cluster number (after cleaning) %i" % len(clusts))

    # Calculate the silhouette after cluster cleaning
    fclusts_idx = -1 * all_clusters

    i = 1
    for clust in clusts:
        i += 1
        fleavs = dendrogram[clust].sorted_leaves()

        fleavs_idx = []
        for fleav in fleavs:
            fleavs_idx.append(fleav.idx)
        fleavs_idx = np.asarray(fleavs_idx)

        # Find the position of the cluster leaves
        pos = np.where(np.in1d(mul_leav_idx, fleavs_idx))[0]
        fclusts_idx[pos] = i

    oldclusts = np.unique(fclusts_idx[fclusts_idx < 0])
    for oldclust in oldclusts:
        fclusts_idx[fclusts_idx == oldclust] = np.max(fclusts_idx) + 1

    # Recompute the embedding for the best cluster number found above
    evecs = spectral_embedding(rAM, n_components=best_ks,
                               eigen_solver='arpack', random_state=222,
                               eigen_tol=0.0, drop_first=False)
    sil = silhouette_score(evecs, fclusts_idx, metric='euclidean')

    print("-- Final clustering configuration silhouette %f" % sil)

    all_struct_types = np.asarray(all_struct_types)
    all_struct_parents = np.asarray(all_struct_parents)

    # Add the
    # isolated leaves to the cluster list, if required
    if save_isol_leaves:
        isol_leaves = all_structures_idx[(all_struct_parents == -1)
                                         & (all_struct_types == 'L')]
        clusts = clusts + list(isol_leaves)

        print("SAVE_ISOL_LEAVES triggered. Isolated leaves added.")
        print("-- Total cluster number %i" % len(clusts))

    # Add the unclustered leaves within clusters to the cluster list,
    # if required
    if save_clust_leaves:
        isol_leaves = all_structures_idx[(all_struct_parents == -1)
                                         & (all_struct_types == 'L')]

        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)

        clust_leaves = []
        for clust in clusts:
            for leaf in dendrogram[clust].sorted_leaves():
                clust_leaves.append(leaf.idx)

        unclust_leaves = list(set(all_leaves)
                              - set(clust_leaves + list(isol_leaves)))
        clusts = clusts + unclust_leaves

        print("SAVE_CLUST_LEAVES triggered. Unclustered leaves added.")
        print("-- Total cluster number %i" % len(clusts))

    # Update the catalog with the new information
    catalog['parent'] = all_struct_parents
    catalog['ancestor'] = all_struct_ancestors
    catalog['n_leaves'] = nleaves
    catalog['structure_type'] = all_struct_types

    return clusts, catalog, AMs, escalpars, silhouette

images = [Image.open(os.path.join("images", name)) for name in image_names]

# Per-channel color histograms for every image
hist = []
for name in image_names:
    hists = []
    for i in range(3):
        hists.append(cv2.calcHist(
            [cv2.imread(os.path.join("images", name)).astype('float32')],
            [i], None, [20], [0, 256]))
    hist.append(hists)

blocks = {}
similarity = np.empty([len(image_names), len(image_names)])
print("Calculating similarities...")
for i, image1 in enumerate(images):
    for j, image2 in enumerate(images):
        similarity[i, j] = sum(
            abs(cv2.compareHist(hist[i][k], hist[j][k], cv2.HISTCMP_CORREL))
            for k in range(3))

# Order the images along a 1-D spectral embedding of the similarity matrix;
# ravel() so the positions sort as scalars
positions = manifold.spectral_embedding(similarity, n_components=1).ravel()
print(positions)

THUMB_WIDTH = 100
THUMB_HEIGHT = 100
sorted_images = sorted(zip(positions, images))
thumbnails = [ImageOps.fit(im, (THUMB_WIDTH, THUMB_HEIGHT))
              for pos, im in sorted_images]
collage = Image.new('RGB', (THUMB_WIDTH * len(image_names), THUMB_HEIGHT))
for i, im in enumerate(thumbnails):
    collage.paste(im, (i * THUMB_WIDTH, 0))
collage.show()