def regularized_spectral_clustering(adj_matrix, tau, n_clusters, algo='scan'):
    """Cluster a graph from the eigenvectors of its regularized Laplacian.

    :param adj_matrix: adjacency matrix representation of graph where
        [m][n] > 0 if there is an edge and [m][n] = weight
    :param tau: regularization value forwarded to
        ``regularized_laplacian_matrix``
    :param n_clusters: cluster partitioning constant
    :param algo: the clustering separation algorithm, possible value
        kmeans++ or scan (any value other than 'kmeans++' is treated as scan)
    :return: labels, number of clustering iterations needed, size of the
        smallest cluster found, execution time
    """
    start = timer()
    regularized_laplacian = regularized_laplacian_matrix(adj_matrix, tau)
    eigen_values, eigen_vectors = eigen_solver(regularized_laplacian,
                                               n_clusters=n_clusters)
    if algo == 'kmeans++':
        _, labels, _, num_iterations = k_means(eigen_vectors,
                                               n_clusters=n_clusters,
                                               return_n_iter=True)
    elif n_clusters == 2:
        # Two-way cut: split on the sign of the second eigenvector only.
        second_eigen_vector_index = np.argsort(eigen_values)[1]
        second_eigen_vector = eigen_vectors.T[second_eigen_vector_index]
        labels = [0 if val <= 0 else 1 for val in second_eigen_vector]
        num_iterations = 1
    else:
        # k-way partition: use all eigenvectors.
        labels = discretize(eigen_vectors)
        # Assume the worst case scenario that it took 20 restarts.
        num_iterations = 20
    end = timer()
    execution_time = end - start

    # Count the members of every label. Summing the labels only yields a
    # cluster size when the labels are 0/1, so it was wrong for k > 2.
    # minlength keeps empty clusters in the count (size 0).
    cluster_sizes = np.bincount(np.asarray(labels, dtype=int),
                                minlength=n_clusters)
    smallest_cluster_size = cluster_sizes.min()
    return labels, num_iterations, smallest_cluster_size, execution_time
def spectral_clustering_sg(self, affinity, max_clusters=8, eigen_solver=None,
                           random_state=None, n_init=10, eigen_tol=0.0,
                           assign_labels='kmeans'):
    """Spectral clustering where the number of clusters is estimated.

    The embedding is computed with ``max_clusters`` components; the values
    returned alongside it are handed to ``self.estimate_num_of_clusters``
    (spectral gap) to choose ``n_clusters``.

    Note that k-means is run with a fixed ``random_state=0``, and that the
    'discretize' strategy receives the full embedding rather than the
    estimated number of clusters.

    Raises ValueError when ``assign_labels`` is neither 'kmeans' nor
    'discretize'.
    """
    use_kmeans = assign_labels == 'kmeans'
    if not use_kmeans and assign_labels != 'discretize':
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)

    rng = check_random_state(random_state)
    embedding, eigenvalues = self.spectral_embedding(
        affinity,
        n_components=max_clusters,
        eigen_solver=eigen_solver,
        random_state=rng,
        eigen_tol=eigen_tol,
        drop_first=False)

    # Determine n_clusters from the spectral gap.
    n_clusters = self.estimate_num_of_clusters(eigenvalues)

    if use_kmeans:
        _, labels, _ = k_means(embedding, n_clusters, random_state=0,
                               n_init=n_init)
        return labels
    return discretize(embedding, random_state=rng)
def signed_spectral_clustering(affinity, random_state=None, n_clusters=2,
                               eigen_tol=0.0):
    """Embed ``affinity`` with ``signed_spectral_embedding`` and discretize.

    ``random_state`` is used both for the embedding and for ``discretize``;
    the return value is whatever ``discretize`` produces for the embedding.
    """
    return discretize(
        signed_spectral_embedding(affinity, random_state, n_clusters,
                                  eigen_tol),
        random_state=random_state)
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans', fuzzy_m=2,
                        fuzzy_error=0.0005, fuzzy_maxiter=10000,
                        fuzzy_label_threshold=None):
    """Spectral clustering with an optional fuzzy c-means labelling step.

    ``assign_labels`` selects how the spectral embedding is turned into
    labels: 'kmeans', 'discretize', or 'fuzzy_cmeans'. The fuzzy variant
    returns, for every sample, the list of clusters whose membership is at
    least ``fuzzy_label_threshold`` (default ``1. / n_clusters``), so a
    sample may carry several labels or none.

    Raises ValueError for any other ``assign_labels`` value.
    """
    if assign_labels not in ('kmeans', 'fuzzy_cmeans', 'discretize'):
        raise ValueError(
            "The 'assign_labels' parameter should be "
            "'kmeans', 'fuzzy_cmeans' or 'discretize', but '%s' was given"
            % assign_labels)

    random_state_ = sp.check_random_state(random_state)
    if n_components is None:
        n_components = n_clusters

    # The embedding receives the caller's raw ``random_state`` value.
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = sp.k_means(maps, n_clusters,
                                  random_state=random_state_, n_init=n_init)
        return labels

    if assign_labels != 'fuzzy_cmeans':
        return sp.discretize(maps, random_state=random_state_)

    if fuzzy_label_threshold is None:
        fuzzy_label_threshold = 1. / n_clusters
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        np.exp(maps.T), n_clusters, seed=random_state, m=fuzzy_m,
        error=fuzzy_error, maxiter=fuzzy_maxiter)

    # One (possibly empty) list of cluster indices per sample.
    labels = [[] for _ in range(u.shape[1])]
    for sample_idx, cluster_idx in np.argwhere(u.T >= fuzzy_label_threshold):
        labels[sample_idx].append(cluster_idx)
    return labels
def fit(self, X, y=None):
    """Creates a sketched affinity matrix for X using the selected
    affinity, then applies spectral clustering to this affinity matrix.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input validated with ``check_array`` (csr, csc and coo sparse
        formats accepted, converted to float64).

    y : ignored

    Returns
    -------
    self
        The fitted estimator; ``affinity_matrix_`` and ``labels_`` are set.

    Raises
    ------
    ValueError
        If ``self.affinity`` is neither 'rbf', 'cosine' nor a callable.
    """
    # this class is not tested with sparse matrix.
    # any contribution (report, coding) is welcome!
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                    dtype=np.float64)

    # Sketch size. Original author's note: +1 for drop_first, x2 for zero
    # suppression in frequent_direction.
    ell = self.n_clusters + 1
    k = self.n_buffer_rows

    if self.affinity == 'rbf':
        self.affinity_matrix_, dd = laplacian_sketch_rbf_kernel(
            X, ell, k, normed=self.normed, gamma=self.gamma)
    elif self.affinity == 'cosine':
        self.affinity_matrix_, dd = laplacian_sketch_cosine_similarity(
            X, ell, k, normed=self.normed)
    elif callable(self.affinity):
        params = self.kernel_params
        if params is None:
            params = {}
        self.affinity_matrix_, dd = laplacian_sketch(
            X, ell, k, False, self.normed, self.affinity, params)
    else:
        # Previously this only warned and then failed later on the unbound
        # ``dd``; fail here with an explicit error instead.
        raise ValueError("%s is unknown kernel" % self.affinity)

    random_state = check_random_state(self.random_state)

    # spectral embedding post process.
    maps = spectral_embedding_imitation(self.affinity_matrix_, dd,
                                        n_components=self.n_clusters,
                                        random_state=random_state,
                                        drop_first=False)

    if self.assign_labels == 'kmeans':
        # ``n_clusters`` and ``n_init`` were referenced as bare (undefined)
        # names here, which raised NameError.
        # NOTE(review): assumes the estimator stores ``n_init`` - confirm
        # against __init__.
        _, self.labels_, _ = k_means(maps, self.n_clusters,
                                     random_state=random_state,
                                     n_init=self.n_init)
    else:
        self.labels_ = discretize(maps, random_state=random_state)
    return self
def test_discretize(seed=36):
    # Test the discretize using a noise assignment matrix
    LB = LabelBinarizer()
    for n_sample in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels; the generator is re-seeded for every
            # configuration
            random_state = np.random.RandomState(seed)
            # randint excludes its upper bound, so n_class + 1 reproduces
            # the inclusive range of the deprecated random_integers.
            y_true = random_state.randint(0, n_class + 1, n_sample)
            # ``np.float`` was removed in NumPy 1.24; the builtin is the
            # same float64 dtype.
            y_true = np.array(y_true, float)
            # noise class assignment matrix
            y_true_noisy = (LB.fit_transform(y_true)
                            + 0.1 * random_state.randn(n_sample,
                                                       n_class + 1))
            y_pred = discretize(y_true_noisy)
            assert adjusted_rand_score(y_true, y_pred) > 0.9
def test_discretize(seed=36):
    # Test the discretize using a noise assignment matrix
    LB = LabelBinarizer()
    for n_sample in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels; the generator is re-seeded for every
            # configuration
            random_state = np.random.RandomState(seed)
            # randint excludes its upper bound, so n_class + 1 reproduces
            # the inclusive range of the deprecated random_integers.
            y_true = random_state.randint(0, n_class + 1, n_sample)
            # ``np.float`` was removed in NumPy 1.24; the builtin is the
            # same float64 dtype.
            y_true = np.array(y_true, float)
            # noise class assignment matrix
            y_true_noisy = (LB.fit_transform(y_true)
                            + 0.1 * random_state.randn(n_sample,
                                                       n_class + 1))
            # The same generator also drives discretize's random restarts.
            y_pred = discretize(y_true_noisy, random_state)
            assert adjusted_rand_score(y_true, y_pred) > 0.9
def test_discretize(seed=8):
    # Test the discretize using a noise assignment matrix
    random_state = np.random.RandomState(seed)
    for n_samples in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels; randint excludes its upper bound, so
            # n_class + 1 reproduces the inclusive range of the deprecated
            # random_integers.
            y_true = random_state.randint(0, n_class + 1, n_samples)
            # ``np.float`` was removed in NumPy 1.24; the builtin is the
            # same float64 dtype.
            y_true = np.array(y_true, float)
            # noise class assignment matrix
            y_indicator = sparse.coo_matrix(
                (np.ones(n_samples), (np.arange(n_samples), y_true)),
                shape=(n_samples, n_class + 1))
            # toarray() yields a plain ndarray instead of the np.matrix
            # returned by todense().
            y_true_noisy = (y_indicator.toarray()
                            + 0.1 * random_state.randn(n_samples,
                                                       n_class + 1))
            y_pred = discretize(y_true_noisy, random_state)
            assert adjusted_rand_score(y_true, y_pred) > 0.8
def test_discretize(n_samples):
    # Test the discretize using a noise assignment matrix
    random_state = np.random.RandomState(seed=8)
    for n_class in range(2, 10):
        # random class labels drawn from 0..n_class inclusive
        y_true = random_state.randint(0, n_class + 1, n_samples)
        # ``np.float`` was removed in NumPy 1.24; the builtin is the same
        # float64 dtype.
        y_true = np.array(y_true, float)
        # noise class assignment matrix
        y_indicator = sparse.coo_matrix(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_class + 1))
        y_true_noisy = (y_indicator.toarray()
                        + 0.1 * random_state.randn(n_samples, n_class + 1))
        y_pred = discretize(y_true_noisy, random_state)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
def Discretize(V, **kwargs):
    """Assign labels to the rows of ``V`` with scikit-learn's ``discretize``.

    Recognised keyword arguments and their defaults: ``copy`` (True),
    ``max_svd_restarts`` (30), ``n_iter_max`` (20) and ``random_state``
    (None). Any other keyword argument is ignored.

    Returns the labels produced by ``discretize``.

    Raises ImportError when scikit-learn is not installed.
    """
    try:
        try:
            # Location since scikit-learn 0.22.
            from sklearn.cluster._spectral import discretize
        except ImportError:
            # Legacy public path, removed in scikit-learn 0.24.
            from sklearn.cluster.spectral import discretize
    except ImportError:
        raise ImportError('Use of this function (Discretize) requires the '
                          'installation of sklearn.')

    copy = kwargs.pop('copy', True)
    max_svd_restarts = kwargs.pop('max_svd_restarts', 30)
    n_iter_max = kwargs.pop('n_iter_max', 20)
    random_state = kwargs.pop('random_state', None)

    return discretize(V, copy=copy, max_svd_restarts=max_svd_restarts,
                      n_iter_max=n_iter_max, random_state=random_state)
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans',
                        size_min=None, size_max=None):
    """Spectral clustering whose k-means step is size-constrained.

    With ``assign_labels='kmeans'`` the embedding is clustered by
    ``k_means_constrained`` using ``size_min`` / ``size_max``; with
    'discretize' the embedding is passed to ``discretize``.

    Raises ValueError for an unrecognised ``assign_labels``, and also for
    'neo-kmeans', which is accepted by the initial check but rejected after
    the embedding has been computed.
    """
    accepted = ('kmeans', 'neo-kmeans', 'discretize')
    if assign_labels not in accepted:
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)

    rng = check_random_state(random_state)
    if n_components is None:
        n_components = n_clusters

    # The first eigen vector is constant only for fully connected graphs
    # and should be kept for spectral clustering (drop_first = False)
    # See spectral_embedding documentation.
    embedding = spectral_embedding(affinity, n_components=n_components,
                                   eigen_solver=eigen_solver,
                                   random_state=rng, eigen_tol=eigen_tol,
                                   drop_first=False)

    if assign_labels == 'neo-kmeans':
        raise ValueError(
            f"assign_labels: {assign_labels} is not currently supported.")
    if assign_labels == 'kmeans':
        _, labels, _ = k_means_constrained(embedding, n_clusters,
                                           random_state=rng, n_init=n_init,
                                           size_min=size_min,
                                           size_max=size_max)
        return labels
    return discretize(embedding, random_state=rng)
def spectral_hg_partitioning(hg, n_clusters, assign_labels='kmeans',
                             n_components=None, random_state=None,
                             n_init=10):
    """
    Partition the nodes of a hypergraph from the eigenvectors of its
    Laplacian.

    :param hg: instance of HyperG
    :param n_clusters: int, must not exceed ``hg.num_nodes()``
    :param assign_labels: str, {'kmeans', 'discretize'}, default: 'kmeans';
        any value other than 'kmeans' selects discretize
    :param n_components: int, number of eigen vectors to use for the
        spectral embedding (defaults to ``n_clusters``)
    :param random_state: int or None (default)
    :param n_init: int, number of time the k-means algorithm will be run
        with different centroid seeds.
    :return: numpy array, shape = (n_samples,), labels of each point
    """
    assert isinstance(hg, HyperG)
    assert n_clusters <= hg.num_nodes()

    rng = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components

    # Dense Laplacian, validated for symmetry before the decomposition.
    laplacian = check_symmetric(hg.laplacian().toarray())
    _, eigenvec = eigh(laplacian)
    embeddings = eigenvec[:, :n_components]

    if assign_labels != 'kmeans':
        return discretize(embeddings, random_state=rng)
    _, labels, _ = k_means(embeddings, n_clusters, random_state=rng,
                           n_init=n_init)
    return labels
def parcel_selection(X, grp_mask, write_dir='/tmp/', method='ward',
                     k_range=KRANGE, criterion='ll', verbose=True):
    """Function dedicated to parcel selection.

    For every k in ``k_range`` the voxels are clustered into k parcels with
    ``method`` and the parcellation is scored with ``parameter_map``. The
    score dictionaries are pickled into ``write_dir`` and returned.

    Parameters
    ----------
    X: array unpacked as (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array whose first three dimensions are passed to
              ``grid_to_graph`` to build the voxel connectivity
    write_dir: string, directory receiving the pickled dictionaries
    method: string, one of 'ward', 'spectral', 'geometric', 'k-means',
            'kmeans', 'gmm'
    k_range: iterable of ints, the numbers of parcels to be tested
    criterion: string, one of 'log-LR', 'll', 'sigma', 'kfold'; any other
               value leaves the criterion at 0 and writes no criterion file
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    all_crit: dict mapping k to the criterion value
    all_bic: dict mapping k to the sum over contrasts of the ``bic_`` values
             returned by ``parameter_map``
    """
    # Define the structure A of the data. Pixels connected to their
    # neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    X_ = PCA(n_components=n_components).fit_transform(Xv)

    if method == 'spectral':
        # Weight every edge with a Gaussian kernel of the squared
        # difference between its two end points; sigma is the mean squared
        # difference over all edges.
        i, j = connectivity.nonzero()
        sq_dist = np.sum((Xv[i] - Xv[j]) ** 2, 1)
        sigma = sq_dist.mean()
        connectivity.data = np.exp(- sq_dist / (2 * sigma))
        maps = spectral_embedding(connectivity, n_components=n_components,
                                  eigen_solver='arpack', random_state=None,
                                  eigen_tol=0.0, drop_first=False)
    del Xv

    # parcel selection
    all_bic = {}
    all_crit = {}
    for k in k_range:
        if method == 'ward':
            ward = Ward(n_clusters=k, connectivity=connectivity).fit(X_)
            labels = ward.labels_
        elif method == 'spectral':
            if k <= n_components:
                # Retry (up to 10 times) until exactly k distinct labels
                # are obtained; the last attempt is kept otherwise.
                for _ in range(10):
                    labels = discretize(maps[:, :k])
                    if len(np.unique(labels)) == k:
                        break
            else:
                _, labels, _ = k_means(maps[:, :n_components], n_clusters=k,
                                       n_init=1, precompute_distances=False,
                                       max_iter=10)
        elif method == 'geometric':
            xyz = np.array(np.where(grp_mask)).T
            _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method in ['k-means', 'kmeans']:
            _, labels, _ = k_means(X_, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method == 'gmm':
            from sklearn.mixture import GMM
            labels = GMM(n_components=k, covariance_type='spherical',
                         n_iter=10, n_init=1).fit(X_).predict(X_)

        ll, bic = 0, 0
        for contrast in range(n_contrasts):
            ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                X[:, contrast], labels, null=False)
            bic += bic_.sum()
            if criterion == 'log-LR':
                ll2, _, _, _, bic_ = parameter_map(
                    X[:, contrast], labels, null=True)
                ll += np.sum((ll1 - ll2))
            elif criterion == 'll':
                ll += np.sum(ll1)
            elif criterion == 'sigma':
                # Not accumulated: only the last contrast is kept.
                ll = (sigma1_.mean(), sigma2_.mean())
            elif criterion == 'kfold':
                ll += score_spatial_model(X[:, contrast], labels,
                                          cv='kfold')

        all_crit[k] = ll
        all_bic[k] = bic
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s  bic:  %s  crit:  %s' % (k, bic, ll))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    crit_prefix = {'log-LR': 'all_llr', 'll': 'all_ll',
                   'sigma': 'all_sigma', 'kfold': 'all_kfold'}
    if criterion in crit_prefix:
        crit_file = path.join(
            write_dir, '%s_%s.pck' % (crit_prefix[criterion], method))
        with open(crit_file, 'wb') as handle:
            pickle.dump(all_crit, handle)
    with open(path.join(write_dir, 'all_bic_%s.pck' % method),
              'wb') as handle:
        pickle.dump(all_bic, handle)
    return all_crit, all_bic
def parcel_cv(X, grp_mask, write_dir='/tmp/', method='ward', n_folds=10,
              k_range=KRANGE, verbose=True):
    """Function dedicated to parcel selection using cross-validation.

    For every k in ``k_range`` and every fold, a parcellation is computed
    from the training subjects, fitted with ``parameter_map`` and scored on
    the held-out subjects with ``log_likelihood_map``. The resulting
    dictionary is pickled to ``write_dir`` and returned.

    Parameters
    ----------
    X: array unpacked as (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array whose first three dimensions are passed to
              ``grid_to_graph`` to build the voxel connectivity
    write_dir: string, directory receiving the pickled dictionary
    method: string, one of 'ward', 'k-means', 'kmeans', 'spectral',
            'geometric'
    n_folds: number; an integral value selects KFold with that many folds,
             anything else selects ShuffleSplit(n_subjects, 10, .2)
    k_range: iterable of ints, the numbers of parcels to be tested
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    all_crit: dict mapping k to the summed held-out log-likelihood
    """
    # Legacy scikit-learn cross-validation API (iterable splitters).
    from sklearn.cross_validation import KFold, ShuffleSplit

    # Define the structure A of the data. Pixels connected to their
    # neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    ic, jc = connectivity.nonzero()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()

    if n_folds == int(n_folds):
        cv = KFold(X.shape[2], n_folds)
    else:
        cv = ShuffleSplit(X.shape[2], 10, .2)
    # Materialise the splits once. An unseeded ShuffleSplit draws new splits
    # on every iteration, so re-iterating ``cv`` would pair each
    # pre-computed embedding with a different train/test partition.
    folds = list(cv)

    # pre-compute the embeddings for the cross-validation loops
    maps = []
    for train, _ in folds:
        X_ = np.reshape(X[:, :, train],
                        (n_voxels, n_contrasts * len(train)))
        if method == 'spectral':
            connectivity.data = np.exp(
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(
                connectivity, n_components=n_components,
                eigen_solver='arpack', random_state=None, eigen_tol=0.0,
                drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    # parcel selection
    all_crit = {}
    for k in k_range:
        ll, ll_cv = 0, 0
        for it, (train, test) in enumerate(folds):
            if method == 'ward':
                ward = Ward(n_clusters=k,
                            connectivity=connectivity).fit(maps[it])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[it], n_clusters=k, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'spectral':
                if k <= n_components:
                    # Retry (up to 10 times) until exactly k distinct
                    # labels are obtained.
                    for _ in range(10):
                        labels = discretize(maps[it][:, :k])
                        if len(np.unique(labels)) == k:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[it], n_clusters=k, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)

            for contrast in range(n_contrasts):
                # Fit on the training subjects, score on the held-out ones.
                ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                    X[:, contrast, train], labels, null=False)
                ll += ll1.sum()
                ll2 = log_likelihood_map(
                    X[:, contrast, test], labels, mu_, sigma1_, sigma2_)
                ll_cv += ll2.sum()

        all_crit[k] = ll_cv
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s ll:  %s  ll_cv:  %s' % (k, ll, ll_cv))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    with open(path.join(write_dir, 'll_cv_%s.pck' % method),
              'wb') as handle:
        pickle.dump(all_crit, handle)
    return all_crit
def parcel_selection(X, grp_mask, write_dir='/tmp/', method='ward',
                     k_range=KRANGE, criterion='ll', verbose=True):
    """Function dedicated to parcel selection.

    For every k in ``k_range`` the voxels are clustered into k parcels with
    ``method`` and the parcellation is scored with ``parameter_map``. The
    score dictionaries are pickled into ``write_dir`` and returned.

    Parameters
    ----------
    X: array unpacked as (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array whose first three dimensions are passed to
              ``grid_to_graph`` to build the voxel connectivity
    write_dir: string, directory receiving the pickled dictionaries
    method: string, one of 'ward', 'spectral', 'geometric', 'k-means',
            'kmeans', 'gmm'
    k_range: iterable of ints, the numbers of parcels to be tested
    criterion: string, one of 'log-LR', 'll', 'sigma', 'kfold'; any other
               value leaves the criterion at 0 and writes no criterion file
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    all_crit: dict mapping k to the criterion value
    all_bic: dict mapping k to the sum over contrasts of the ``bic_`` values
             returned by ``parameter_map``
    """
    # Define the structure A of the data. Pixels connected to their
    # neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    X_ = PCA(n_components=n_components).fit_transform(Xv)

    if method == 'spectral':
        # Weight every edge with a Gaussian kernel of the squared
        # difference between its two end points; sigma is the mean squared
        # difference over all edges.
        i, j = connectivity.nonzero()
        sq_dist = np.sum((Xv[i] - Xv[j])**2, 1)
        sigma = sq_dist.mean()
        connectivity.data = np.exp(-sq_dist / (2 * sigma))
        maps = spectral_embedding(connectivity,
                                  n_components=n_components,
                                  eigen_solver='arpack',
                                  random_state=None,
                                  eigen_tol=0.0,
                                  drop_first=False)
    del Xv

    # parcel selection
    all_bic = {}
    all_crit = {}
    for k in k_range:
        if method == 'ward':
            ward = Ward(n_clusters=k, connectivity=connectivity).fit(X_)
            labels = ward.labels_
        elif method == 'spectral':
            if k <= n_components:
                # Retry (up to 10 times) until exactly k distinct labels
                # are obtained; the last attempt is kept otherwise.
                for _ in range(10):
                    labels = discretize(maps[:, :k])
                    if len(np.unique(labels)) == k:
                        break
            else:
                _, labels, _ = k_means(maps[:, :n_components],
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
        elif method == 'geometric':
            xyz = np.array(np.where(grp_mask)).T
            _, labels, _ = k_means(xyz,
                                   n_clusters=k,
                                   n_init=1,
                                   precompute_distances=False,
                                   max_iter=10)
        elif method in ['k-means', 'kmeans']:
            _, labels, _ = k_means(X_,
                                   n_clusters=k,
                                   n_init=1,
                                   precompute_distances=False,
                                   max_iter=10)
        elif method == 'gmm':
            from sklearn.mixture import GMM
            labels = GMM(n_components=k,
                         covariance_type='spherical',
                         n_iter=10,
                         n_init=1).fit(X_).predict(X_)

        ll, bic = 0, 0
        for contrast in range(n_contrasts):
            ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(X[:, contrast],
                                                             labels,
                                                             null=False)
            bic += bic_.sum()
            if criterion == 'log-LR':
                ll2, _, _, _, bic_ = parameter_map(X[:, contrast],
                                                   labels,
                                                   null=True)
                ll += np.sum((ll1 - ll2))
            elif criterion == 'll':
                ll += np.sum(ll1)
            elif criterion == 'sigma':
                # Not accumulated: only the last contrast is kept.
                ll = (sigma1_.mean(), sigma2_.mean())
            elif criterion == 'kfold':
                ll += score_spatial_model(X[:, contrast], labels,
                                          cv='kfold')

        all_crit[k] = ll
        all_bic[k] = bic
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s  bic:  %s  crit:  %s' % (k, bic, ll))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    crit_prefix = {'log-LR': 'all_llr', 'll': 'all_ll',
                   'sigma': 'all_sigma', 'kfold': 'all_kfold'}
    if criterion in crit_prefix:
        crit_file = path.join(
            write_dir, '%s_%s.pck' % (crit_prefix[criterion], method))
        with open(crit_file, 'wb') as handle:
            pickle.dump(all_crit, handle)
    with open(path.join(write_dir, 'all_bic_%s.pck' % method),
              'wb') as handle:
        pickle.dump(all_bic, handle)
    return all_crit, all_bic
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans',
                        norm_laplacian=True):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles on the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    -----------
    affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the
            samples.

    n_clusters : integer, optional
        Number of clusters to extract.

    n_components : integer, optional, default is n_clusters
        Number of eigen vectors to use for the spectral embedding

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding
        space. There are two ways to assign labels after the laplacian
        embedding. k-means can be applied and is a popular choice. But it
        can also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    norm_laplacian : bool, optional, default: True
        Forwarded unchanged to ``spectral_embedding``.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    Raises
    ------
    ValueError
        If ``assign_labels`` is neither 'kmeans' nor 'discretize'.

    References
    ----------

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component, otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    use_kmeans = assign_labels == 'kmeans'
    if not use_kmeans and assign_labels != 'discretize':
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)

    rng = check_random_state(random_state)
    if n_components is None:
        n_components = n_clusters

    embedding = spectral_embedding(affinity, n_components=n_components,
                                   eigen_solver=eigen_solver,
                                   random_state=rng, eigen_tol=eigen_tol,
                                   drop_first=False,
                                   norm_laplacian=norm_laplacian)

    if not use_kmeans:
        return discretize(embedding, random_state=rng)
    _, labels, _ = k_means(embedding, n_clusters, random_state=rng,
                           n_init=n_init)
    return labels
def reproducibility_selection(X, grp_mask, niter=2, method='ward',
                              k_range=KRANGE, write_dir='/tmp',
                              verbose=True):
    """ Returns a reproducibility metric on bootstrapped models

    Parameters
    ----------
    X: array of shape (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array of shape (image_shape),
              the non-zeros elements yield the spatial model
    niter: int, number of bootstrap samples estimated
    method: string, one of 'ward', 'kmeans', 'k-means', 'spectral',
            'geometric'
    k_range: list of ints,
             the possible number of parcels to be tested
    write_dir: string, directory receiving the pickled score dictionaries
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    ars_score, ami_score, vm_score: dicts mapping each k to the value of
        ``reproducibility_rating`` for the 'ars', 'ami' and 'vm' ratings
    """
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))

    # pre-computed stuff
    ic, jc = connectivity.nonzero()
    sigma = np.sum((Xv[ic] - Xv[jc])**2, 1).mean()

    # One embedding per bootstrap resample of the columns of Xv.
    maps = []
    for _ in range(niter):
        bootstrap = (np.random.rand(Xv.shape[1]) *
                     Xv.shape[1]).astype(int)
        X_ = Xv[:, bootstrap]
        if method == 'spectral':
            connectivity.data = np.exp(-np.sum(
                (X_[ic] - X_[jc])**2, 1) / (2 * sigma))
            maps.append(
                spectral_embedding(connectivity,
                                   n_components=n_components,
                                   eigen_solver='arpack',
                                   random_state=None,
                                   eigen_tol=0.0,
                                   drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    ars_score = {}
    ami_score = {}
    vm_score = {}
    for k_ in k_range:
        label_ = []
        for i in range(niter):
            if method == 'spectral':
                if k_ <= n_components:
                    # Retry (up to 10 times) until exactly k_ distinct
                    # labels are obtained.
                    for _ in range(10):
                        labels = discretize(maps[i][:, :k_])
                        if len(np.unique(labels)) == k_:
                            break
                else:
                    _, labels, _ = k_means(maps[i],
                                           n_clusters=k_,
                                           n_init=1,
                                           precompute_distances=False,
                                           max_iter=10)
            elif method == 'ward':
                ward = Ward(n_clusters=k_,
                            connectivity=connectivity).fit(maps[i])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[i],
                                       n_clusters=k_,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz,
                                       n_clusters=k_,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            label_.append(labels)

        ars_score[k_] = reproducibility_rating(label_, 'ars')
        ami_score[k_] = reproducibility_rating(label_, 'ami')
        vm_score[k_] = reproducibility_rating(label_, 'vm')
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s  ari:  %s ami:  %s  vm:  %s'
                  % (k_, ars_score[k_], ami_score[k_], vm_score[k_]))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    with open(path.join(write_dir, 'ari_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(ars_score, handle)
    with open(path.join(write_dir, 'ami_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(ami_score, handle)
    with open(path.join(write_dir, 'vm_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(vm_score, handle)
    return ars_score, ami_score, vm_score
def parcel_cv(X, grp_mask, write_dir='/tmp/', method='ward', n_folds=10,
              k_range=KRANGE, verbose=True):
    """Function dedicated to parcel selection using cross-validation.

    For every k in ``k_range`` and every fold, a parcellation is computed
    from the training subjects, fitted with ``parameter_map`` and scored on
    the held-out subjects with ``log_likelihood_map``. The resulting
    dictionary is pickled to ``write_dir`` and returned.

    Parameters
    ----------
    X: array unpacked as (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array whose first three dimensions are passed to
              ``grid_to_graph`` to build the voxel connectivity
    write_dir: string, directory receiving the pickled dictionary
    method: string, one of 'ward', 'k-means', 'kmeans', 'spectral',
            'geometric'
    n_folds: number; an integral value selects KFold with that many folds,
             anything else selects ShuffleSplit(n_subjects, 10, .2)
    k_range: iterable of ints, the numbers of parcels to be tested
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    all_crit: dict mapping k to the summed held-out log-likelihood
    """
    # Legacy scikit-learn cross-validation API (iterable splitters).
    from sklearn.cross_validation import KFold, ShuffleSplit

    # Define the structure A of the data. Pixels connected to their
    # neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    ic, jc = connectivity.nonzero()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    sigma = np.sum((Xv[ic] - Xv[jc])**2, 1).mean()

    if n_folds == int(n_folds):
        cv = KFold(X.shape[2], n_folds)
    else:
        cv = ShuffleSplit(X.shape[2], 10, .2)
    # Materialise the splits once. An unseeded ShuffleSplit draws new splits
    # on every iteration, so re-iterating ``cv`` would pair each
    # pre-computed embedding with a different train/test partition.
    folds = list(cv)

    # pre-compute the embeddings for the cross-validation loops
    maps = []
    for train, _ in folds:
        X_ = np.reshape(X[:, :, train],
                        (n_voxels, n_contrasts * len(train)))
        if method == 'spectral':
            connectivity.data = np.exp(-np.sum(
                (X_[ic] - X_[jc])**2, 1) / (2 * sigma))
            maps.append(
                spectral_embedding(connectivity,
                                   n_components=n_components,
                                   eigen_solver='arpack',
                                   random_state=None,
                                   eigen_tol=0.0,
                                   drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    # parcel selection
    all_crit = {}
    for k in k_range:
        ll, ll_cv = 0, 0
        for it, (train, test) in enumerate(folds):
            if method == 'ward':
                ward = Ward(n_clusters=k,
                            connectivity=connectivity).fit(maps[it])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[it],
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'spectral':
                if k <= n_components:
                    # Retry (up to 10 times) until exactly k distinct
                    # labels are obtained.
                    for _ in range(10):
                        labels = discretize(maps[it][:, :k])
                        if len(np.unique(labels)) == k:
                            break
                else:
                    _, labels, _ = k_means(maps[it],
                                           n_clusters=k,
                                           n_init=1,
                                           precompute_distances=False,
                                           max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz,
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)

            for contrast in range(n_contrasts):
                # Fit on the training subjects, score on the held-out ones.
                ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                    X[:, contrast, train], labels, null=False)
                ll += ll1.sum()
                ll2 = log_likelihood_map(X[:, contrast, test], labels, mu_,
                                         sigma1_, sigma2_)
                ll_cv += ll2.sum()

        all_crit[k] = ll_cv
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s ll:  %s  ll_cv:  %s' % (k, ll, ll_cv))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    with open(path.join(write_dir, 'll_cv_%s.pck' % method),
              'wb') as handle:
        pickle.dump(all_crit, handle)
    return all_crit
def reproducibility_selection(
        X, grp_mask, niter=2, method='ward', k_range=KRANGE,
        write_dir='/tmp', verbose=True):
    """ Returns a reproducibility metric on bootstrapped models

    Parameters
    ----------
    X: array of shape (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array of shape (image_shape),
              the non-zeros elements yield the spatial model
    niter: int, number of bootstrap samples estimated
    method: string, one of 'ward', 'kmeans', 'k-means', 'spectral',
            'geometric'
    k_range: list of ints,
             the possible number of parcels to be tested
    write_dir: string, directory receiving the pickled score dictionaries
    verbose: bool, print the scores obtained for each k

    Returns
    -------
    ars_score, ami_score, vm_score: dicts mapping each k to the value of
        ``reproducibility_rating`` for the 'ars', 'ami' and 'vm' ratings
    """
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))

    # pre-computed stuff
    ic, jc = connectivity.nonzero()
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()

    # One embedding per bootstrap resample of the columns of Xv.
    maps = []
    for _ in range(niter):
        bootstrap = (np.random.rand(Xv.shape[1]) *
                     Xv.shape[1]).astype(int)
        X_ = Xv[:, bootstrap]
        if method == 'spectral':
            connectivity.data = np.exp(
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(
                connectivity, n_components=n_components,
                eigen_solver='arpack', random_state=None, eigen_tol=0.0,
                drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    ars_score = {}
    ami_score = {}
    vm_score = {}
    for k_ in k_range:
        label_ = []
        for i in range(niter):
            if method == 'spectral':
                if k_ <= n_components:
                    # Retry (up to 10 times) until exactly k_ distinct
                    # labels are obtained.
                    for _ in range(10):
                        labels = discretize(maps[i][:, :k_])
                        if len(np.unique(labels)) == k_:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[i], n_clusters=k_, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'ward':
                ward = Ward(n_clusters=k_,
                            connectivity=connectivity).fit(maps[i])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[i], n_clusters=k_, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k_, n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            label_.append(labels)

        ars_score[k_] = reproducibility_rating(label_, 'ars')
        ami_score[k_] = reproducibility_rating(label_, 'ami')
        vm_score[k_] = reproducibility_rating(label_, 'vm')
        if verbose:
            # Single formatted string: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('k:  %s  ari:  %s ami:  %s  vm:  %s'
                  % (k_, ars_score[k_], ami_score[k_], vm_score[k_]))

    # Pickle needs a binary file; ``with`` guarantees the handle is closed.
    with open(path.join(write_dir, 'ari_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(ars_score, handle)
    with open(path.join(write_dir, 'ami_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(ami_score, handle)
    with open(path.join(write_dir, 'vm_score_%s.pck' % method),
              'wb') as handle:
        pickle.dump(vm_score, handle)
    return ars_score, ami_score, vm_score