def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    # Builtin ``bool`` instead of the ``np.bool`` alias: the alias was
    # deprecated and later removed from NumPy, and it always meant ``bool``.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
    ward.fit(X)
    # Exactly the requested number of distinct cluster labels.
    assert_true(np.size(np.unique(ward.labels_)) == 5)

    # transform() yields one column per cluster.
    Xred = ward.transform(X)
    assert_true(Xred.shape[1] == 5)
    # After inverse_transform() a sample holds one distinct value per cluster.
    Xfull = ward.inverse_transform(Xred)
    assert_true(np.unique(Xfull[0]).size == 5)
    # Reducing the reconstructed data again must give back the reduced data.
    assert_array_almost_equal(ward.transform(Xfull), Xred)
def prepare_data(imgs, connectivity, mask, n_clusters=5000, n_components=100):
    """Mask the images, reduce them with a Ward parcellation and stack them
    per subject.

    Relies on names defined outside this function: ``nifti_masker``,
    ``subject_label``, ``n_subjects`` and ``do_parcel_connectivity``.

    Parameters
    ----------
    imgs :
        Images handed to ``nifti_masker.fit_transform``.
    connectivity :
        Connectivity structure given to ``WardAgglomeration``.
    mask :
        Forwarded unchanged to ``do_parcel_connectivity``.
    n_clusters : int
        Number of clusters requested from ``WardAgglomeration``.
    n_components : int
        Number of components requested from ``RandomizedPCA``.

    Returns
    -------
    cube : numpy.ndarray
        ``np.array`` of the transformed rows selected by
        ``subject_label == subject`` for each subject index.
    ward :
        The fitted ``WardAgglomeration`` object.
    parcel_connectivity :
        Result of ``do_parcel_connectivity(mask, n_clusters, ward)``.
    """
    # data preparation
    masked = nifti_masker.fit_transform(imgs)

    # The clustering is fitted on a PCA-compressed version of the masked
    # data, but applied (transform) to the uncompressed masked data.
    reducer = RandomizedPCA(n_components=n_components)
    compressed = reducer.fit_transform(masked.T).T
    clusterer = WardAgglomeration(n_clusters=n_clusters,
                                  connectivity=connectivity,
                                  memory='nilearn_cache')
    ward = clusterer.fit(compressed)
    parcel_signals = ward.transform(masked)
    del masked  # drop the reference to the full masked data

    # data cube is a more convenient representation
    per_subject = []
    for subject in np.arange(n_subjects):
        per_subject.append(parcel_signals[subject_label == subject])
    cube = np.array(per_subject)

    # parcel connectivity
    parcel_connectivity = do_parcel_connectivity(mask, n_clusters, ward)
    return cube, ward, parcel_connectivity
def prepare_data(imgs, connectivity, mask, n_clusters=5000, n_components=100):
    """Build the per-subject data cube, the fitted Ward object and the
    parcel connectivity.

    Uses the externally defined ``nifti_masker``, ``subject_label``,
    ``n_subjects`` and ``do_parcel_connectivity``.

    Parameters
    ----------
    imgs :
        Input passed to ``nifti_masker.fit_transform``.
    connectivity :
        Passed as ``connectivity`` to ``WardAgglomeration``.
    mask :
        Passed on to ``do_parcel_connectivity``.
    n_clusters : int
        ``n_clusters`` of the Ward clustering.
    n_components : int
        ``n_components`` of the ``RandomizedPCA`` step.

    Returns
    -------
    tuple
        ``(cube, ward, parcel_connectivity)``.
    """
    # data preparation
    Z = nifti_masker.fit_transform(imgs)

    # PCA is applied to the transposed data, then transposed back, and
    # only this reduced array is used to fit the clustering.
    pca = RandomizedPCA(n_components=n_components)
    Z_reduced_t = pca.fit_transform(Z.T)
    Z_reduced = Z_reduced_t.T

    ward_model = WardAgglomeration(
        n_clusters=n_clusters,
        connectivity=connectivity,
        memory='nilearn_cache',
    )
    ward = ward_model.fit(Z_reduced)

    # The fitted clustering is applied to the full (non-PCA) masked data.
    W = ward.transform(Z)
    del Z

    # data cube is a more convenient representation
    subject_ids = np.arange(n_subjects)
    cube = np.array([W[subject_label == subject] for subject in subject_ids])

    # parcel connectivity
    parcel_connectivity = do_parcel_connectivity(mask, n_clusters, ward)
    return cube, ward, parcel_connectivity
# Display the original data: bring the first masked sample back to image
# space and mask out entries that are exactly zero.
first_epi = nifti_masker.inverse_transform(fmri_masked[0]).get_data()
first_epi = np.ma.masked_array(first_epi, first_epi == 0)
# Outside the mask: a uniform value, 0.9 times the minimum found inside
# the mask
first_epi[np.logical_not(mask)] = 0.9 * first_epi[mask].min()
# Color limits are taken from slice 20 of the original data and reused for
# the compressed figure below, so both plots share the same color scale.
vmax = first_epi[..., 20].max()
vmin = first_epi[..., 20].min()
# NOTE(review): the ``spectral`` colormap name is not available in recent
# matplotlib releases (``nipy_spectral`` is the replacement) -- confirm the
# targeted matplotlib version.
pl.imshow(np.rot90(first_epi[..., 20]),
          interpolation='nearest', cmap=pl.cm.spectral, vmin=vmin, vmax=vmax)
pl.axis('off')
pl.title('Original (%i voxels)' % fmri_masked.shape[1])

# Reduced data can be created by taking the parcel-level average:
# Note that, like many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed = nifti_masker.inverse_transform(
    fmri_compressed[0]).get_data()
# Entries equal to zero are masked out, as for the original data.
compressed = np.ma.masked_equal(compressed, 0)

pl.figure()
pl.imshow(np.rot90(compressed[:, :, 20]),
          interpolation='nearest', cmap=pl.cm.spectral, vmin=vmin, vmax=vmax)
# NOTE(review): the parcel count in this title is hard-coded -- confirm it
# matches the n_clusters used to build ``ward``.
pl.title('Compressed representation (2000 parcels)')
pl.axis('off')
pl.show()
first_plot = plot_roi(labels_img, mean_func_img, title="Ward parcellation", display_mode='xz') # labels_img is a Nifti1Image object, it can be saved to file with the # following code: labels_img.to_filename('parcellation.nii') # Display the original data plot_epi(nifti_masker.inverse_transform(fmri_masked[0]), cut_coords=first_plot.cut_coords, title='Original (%i voxels)' % fmri_masked.shape[1], display_mode='xz') # A reduced data can be create by taking the parcel-level average: # Note that, as many objects in the scikit-learn, the ward object exposes # a transform method that modifies input features. Here it reduces their # dimension fmri_reduced = ward.transform(fmri_masked) # Display the corresponding data compressed using the parcellation fmri_compressed = ward.inverse_transform(fmri_reduced) compressed_img = nifti_masker.inverse_transform(fmri_compressed[0]) plot_epi(compressed_img, cut_coords=first_plot.cut_coords, title='Compressed representation (2000 parcels)', display_mode='xz') plt.show()
def feature_extractor(imgfile, maskfile, featurefile, maskerfile, wardfile,
                      nclusters=(1000,), selectfile=None, targetfile=None,
                      metafile=None, cachefile=None):
    """Mask the images, then compute Ward and percentile-selected features
    for every cross-validation split, saving all results to disk.

    Parameters
    ----------
    imgfile :
        Images passed to ``NiftiMasker.fit_transform``.
    maskfile :
        Passed as ``mask`` to ``NiftiMasker``.
    featurefile : str
        Prefix of the output archives: ``featurefile + "_main.npz"`` and one
        ``featurefile + "_cv%d.npz"`` per cross-validation split.
    maskerfile : str
        Path where the fitted masker is dumped with ``joblib``.
    wardfile : str
        Prefix of the per-split Ward pickles (``"_D%d_cv%d.pkl"`` suffix).
    nclusters : iterable of int
        Cluster counts; one ``WardAgglomeration`` is fitted per value and
        per split. The default is a tuple (not a list) so that no mutable
        object is shared between calls; it is only iterated over.
    selectfile : str
        Prefix of the per-split selector pickles. Despite the ``None``
        default it must be given: it is concatenated with a string.
    targetfile :
        File loaded with ``np.load``; its ``"ymap"`` entry provides the
        targets. Must be given despite the ``None`` default.
    metafile :
        File loaded with ``np.load``; its ``"train"``, ``"test"`` and
        ``"ycv"`` entries drive the cross-validation loop. Must be given
        despite the ``None`` default.
    cachefile :
        Passed as ``memory`` to ``NiftiMasker`` and ``WardAgglomeration``.

    Returns
    -------
    None
        All results are written to disk.
    """
    resultdict = {"imgfile": imgfile, "maskfile": maskfile}

    # load data
    # Single-argument print calls behave identically on Python 2 and 3.
    print("--loading data")
    nifti_masker = input_data.NiftiMasker(mask=maskfile, memory=cachefile,
                                          memory_level=1, standardize=False)
    fmri_masked = nifti_masker.fit_transform(imgfile)
    print("--getting mask")
    # Builtin ``bool`` rather than the removed ``np.bool`` alias.
    mask = nifti_masker.mask_img_.get_data().astype(bool)

    # saveit
    joblib.dump(nifti_masker, maskerfile)
    resultdict["mask"] = mask
    resultdict["Xmask"] = fmri_masked
    resultdict["maskerfile"] = maskerfile

    # get connectivity
    print("--getting connectivity")
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask)
    # saveit
    resultdict["connectivity"] = connectivity
    print("--save main file")
    np.savez(featurefile + "_main.npz", **resultdict)

    # run ward
    y = np.load(targetfile)["ymap"]
    meta = np.load(metafile)
    train = meta["train"]
    test = meta["test"]
    ncv = meta['ycv']

    # for each cv set
    for cvx in range(ncv):
        trainidx = train[cvx]
        testidx = test[cvx]
        resultdict = {}
        wardfiles = []
        selectfiles = []

        print("--Running ward %d" % (cvx, ))
        for nc in nclusters:
            # The clustering is fitted on the training samples only and
            # then applied to both the training and the test samples.
            ward = WardAgglomeration(n_clusters=nc, connectivity=connectivity,
                                     memory=cachefile)
            ward.fit(fmri_masked[trainidx])
            fmri_reduced_train = ward.transform(fmri_masked[trainidx])
            fmri_reduced_test = ward.transform(fmri_masked[testidx])

            # saveit
            subwardfile = wardfile + "_D%d_cv%d.pkl" % (nc, cvx, )
            joblib.dump(ward, subwardfile)
            resultdict["Xward_%d_train" % (nc, )] = fmri_reduced_train
            resultdict["Xward_%d_test" % (nc, )] = fmri_reduced_test
            wardfiles.append(subwardfile)

            # additional feature selection, again fitted on training data only
            selector = SelectPercentile(f_classif, percentile=30)
            selector.fit(fmri_reduced_train, y[trainidx])
            fmri_select_train = selector.transform(fmri_reduced_train)
            fmri_select_test = selector.transform(fmri_reduced_test)

            # saveit
            subselectfile = selectfile + "_D%d_cv%d.pkl" % (nc, cvx, )
            joblib.dump(selector, subselectfile)
            resultdict["Xselect_%d_train" % (nc, )] = fmri_select_train
            resultdict["Xselect_%d_test" % (nc, )] = fmri_select_test
            selectfiles.append(subselectfile)

        resultdict["wardfiles"] = wardfiles
        resultdict["selectfiles"] = selectfiles

        # save results
        print("--save cv result")
        np.savez(featurefile + "_cv%d.npz" % (cvx, ), **resultdict)
# Write the cluster label of every in-mask voxel into the label volume.
labels[mask] = ward.labels_

# Builtin ``int`` instead of the ``np.int`` alias: the alias was removed
# from NumPy and always meant the builtin, so the resulting dtype is the same.
cut = labels[:, :, 20].astype(int)
# One random color per cluster plus one extra row; the extra (last) row is
# set to 0.
# NOTE(review): presumably ``labels`` holds -1 outside the mask so that those
# voxels pick ``colors[-1]`` -- confirm where ``labels`` is initialised.
colors = np.random.random(size=(ward.n_clusters + 1, 3))
colors[-1] = 0
pl.axis('off')
pl.imshow(colors[cut], interpolation='nearest')
pl.title('Ward parcellation')

# Display the original data
pl.figure()
# Work on a copy so that zeroing outside the mask leaves ``epi_img`` untouched.
first_epi_img = epi_img[..., 0].copy()
first_epi_img[np.logical_not(mask)] = 0
pl.imshow(first_epi_img[..., 20], interpolation='nearest',
          cmap=pl.cm.spectral)
pl.axis('off')
pl.title('Original')

# Display the corresponding data compressed using the parcellation
X_r = ward.transform(epi_masked.T)
X_c = ward.inverse_transform(X_r)
# Put the first reconstructed sample back into an array of the mask's shape;
# entries outside the mask stay at 0.
compressed_img = np.zeros(mask.shape)
compressed_img[mask] = X_c[0]

pl.figure()
pl.imshow(compressed_img[:, :, 20], interpolation='nearest',
          cmap=pl.cm.spectral)
pl.title('Compressed representation')
pl.axis('off')
pl.show()
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Fit a Ward parcellation on a subsample and reduce the whole dataset.

    A brain parcellation is computed with Ward's clustering algorithm on
    the selected samples only; the signal of every sample of the dataset
    is then averaged within the resulting parcels, which reduces the
    dimension of the images.

    This is used by Randomized Parcellation Based Inference, which needs
    the labels later on to perform the inverse transformation. Labels are
    therefore shifted by an offset so that they remain unique across the
    parcellations built by repeated calls to this function.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
        Masked subject images as an array.

    fit_samples_indices : array-like,
        Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
        Graph representing the spatial structure of the images
        (i.e. connections between voxels).

    n_parcels : int,
        Number of parcels for the parcellations.

    offset_labels : int,
        Offset added to the label numbering, so that labels differ
        between the parcellations built by multiple calls.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
        Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
        Labels giving the correspondence between voxels and parcels.

    """
    # XXX: The import is deliberately delayed (a hack, but a required one).
    # With scipy versions < 0.11 this import ends up importing
    # matplotlib.pyplot, which sets the matplotlib backend and makes the
    # backend selection code in nilearn/plotting/__init__.py ineffective.
    # In an environment without X (e.g. travis-ci) the tests then fail with
    # "TclError: no display name and no $DISPLAY environment variable".
    # This depends on import order: whichever import comes first is the
    # only one that gets to set the matplotlib backend.
    from sklearn.cluster import WardAgglomeration

    # fit part: only the selected samples drive the parcellation
    clusterer = WardAgglomeration(n_clusters=n_parcels,
                                  connectivity=connectivity)
    clusterer.fit(all_subjects_data[fit_samples_indices])

    # transform part: shift the labels so they are unique across
    # parcellations, then reduce every sample of the dataset
    labels = clusterer.labels_ + offset_labels
    parcelled_data = clusterer.transform(all_subjects_data)
    return parcelled_data, labels
def feature_extractor(imgfile, maskfile, featurefile, maskerfile, wardfile,
                      nclusters=(1000,), selectfile=None, targetfile=None,
                      metafile=None, cachefile=None):
    """Extract masked, Ward-reduced and percentile-selected features for
    each cross-validation split and store them on disk.

    Parameters
    ----------
    imgfile :
        Images given to ``NiftiMasker.fit_transform``.
    maskfile :
        Given as ``mask`` to ``NiftiMasker``.
    featurefile : str
        Output prefix; ``"_main.npz"`` and ``"_cv%d.npz"`` are appended.
    maskerfile : str
        Destination of the ``joblib`` dump of the fitted masker.
    wardfile : str
        Prefix for the Ward pickles; ``"_D%d_cv%d.pkl"`` is appended.
    nclusters : iterable of int
        Numbers of clusters to try. The default is an immutable tuple
        instead of a list; the value is only iterated over.
    selectfile : str
        Prefix for the selector pickles; ``"_D%d_cv%d.pkl"`` is appended.
        Required in practice although it defaults to ``None``.
    targetfile :
        Loaded with ``np.load``; the ``"ymap"`` entry is used as target.
        Required in practice although it defaults to ``None``.
    metafile :
        Loaded with ``np.load``; the ``"train"``, ``"test"`` and ``"ycv"``
        entries define the splits. Required in practice although it
        defaults to ``None``.
    cachefile :
        Given as ``memory`` to ``NiftiMasker`` and ``WardAgglomeration``.

    Returns
    -------
    None
        Everything is saved to disk.
    """
    resultdict = {"imgfile": imgfile, "maskfile": maskfile}

    # load data
    # Parenthesised single-string prints give the same output on Python 2
    # and Python 3.
    print("--loading data")
    nifti_masker = input_data.NiftiMasker(mask=maskfile, memory=cachefile,
                                          memory_level=1, standardize=False)
    fmri_masked = nifti_masker.fit_transform(imgfile)
    print("--getting mask")
    # ``np.bool`` was only an alias of the builtin and has been removed.
    mask = nifti_masker.mask_img_.get_data().astype(bool)

    # saveit
    joblib.dump(nifti_masker, maskerfile)
    resultdict["mask"] = mask
    resultdict["Xmask"] = fmri_masked
    resultdict["maskerfile"] = maskerfile

    # get connectivity
    print("--getting connectivity")
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask)
    # saveit
    resultdict["connectivity"] = connectivity
    print("--save main file")
    np.savez(featurefile+"_main.npz", **resultdict)

    # run ward
    y = np.load(targetfile)["ymap"]
    meta = np.load(metafile)
    train = meta["train"]
    test = meta["test"]
    ncv = meta['ycv']

    # for each cv set
    for cvx in range(ncv):
        trainidx = train[cvx]
        testidx = test[cvx]
        resultdict = {}
        wardfiles = []
        selectfiles = []

        print("--Running ward %d"%(cvx, ))
        for nc in nclusters:
            # Fit on the training samples only; transform train and test.
            ward = WardAgglomeration(n_clusters=nc, connectivity=connectivity,
                                     memory=cachefile)
            ward.fit(fmri_masked[trainidx])
            fmri_reduced_train = ward.transform(fmri_masked[trainidx])
            fmri_reduced_test = ward.transform(fmri_masked[testidx])

            # saveit
            subwardfile = wardfile+"_D%d_cv%d.pkl"%(nc, cvx,)
            joblib.dump(ward, subwardfile)
            resultdict["Xward_%d_train"%(nc,)] = fmri_reduced_train
            resultdict["Xward_%d_test"%(nc,)] = fmri_reduced_test
            wardfiles.append(subwardfile)

            # additional feature selection (selector fitted on training data)
            selector = SelectPercentile(f_classif, percentile=30)
            selector.fit(fmri_reduced_train, y[trainidx])
            fmri_select_train = selector.transform(fmri_reduced_train)
            fmri_select_test = selector.transform(fmri_reduced_test)

            # saveit
            subselectfile = selectfile+"_D%d_cv%d.pkl"%(nc, cvx,)
            joblib.dump(selector, subselectfile)
            resultdict["Xselect_%d_train"%(nc,)] = fmri_select_train
            resultdict["Xselect_%d_test"%(nc,)] = fmri_select_test
            selectfiles.append(subselectfile)

        resultdict["wardfiles"] = wardfiles
        resultdict["selectfiles"] = selectfiles

        # save results
        print("--save cv result")
        np.savez(featurefile+"_cv%d.npz"%(cvx, ), **resultdict)
def classify(x, y, classifier='naive_bayes', clustering=True, n_folds=10):
    """
    Cross-validate a classifier on the given predictors and labels and
    collect one confusion matrix per fold.

    Parameters
    -----------
    x : `numpy.ndarray`
        (n_samples x n_features) array of features

    y : `numpy.ndarray`
        array of labels; it is re-encoded with a ``LabelEncoder``. The
        confusion matrices are computed for the encoded labels 0 and 1.

    classifier : str, optional
        which classifier model to use. Must be one of 'naive_bayes' |
        'svm' | 'logistic_regression' | 'ensemble'. With 'ensemble' the
        three models are fitted separately and a sample is predicted as 1
        when the sum of the three predictions is at least 2.
        Defaults to naive_bayes.

    clustering : bool, optional
        whether to do Ward clustering or not. The number of clusters is
        the global N_CLUSTERS and the connectivity is built from the mask
        stored in 'data/2mm_brain_mask.npy'. Defaults to True.

    n_folds : int
        the number of folds of the cross validation

    Returns
    -------
    accuracy : list
        One confusion matrix (``labels=[0, 1]``) per fold, in fold order.
    """
    clf = None
    ward = None

    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    y_new = encoder.transform(y)

    # choose and assign appropriate classifier
    classifier_dict = {
        'naive_bayes': MultinomialNB(),
        'logistic_regression': LogisticRegression(penalty='l2'),
        'svm': GridSearchCV(LinearSVC(), [{'C': [1, 10, 100, 1000]}]),
    }
    use_ensemble = (classifier == 'ensemble')
    if not use_ensemble:
        clf = classifier_dict[classifier]
    else:
        clf_nb = classifier_dict['naive_bayes']
        clf_svm = classifier_dict['svm']
        clf_lr = classifier_dict['logistic_regression']

    # perform ward clustering if specified
    if clustering:
        mask = np.load('data/2mm_brain_mask.npy')
        n_x, n_y, n_z = mask.shape[0], mask.shape[1], mask.shape[2]
        connectivity = image.grid_to_graph(n_x=n_x, n_y=n_y, n_z=n_z,
                                           mask=mask)
        ward = WardAgglomeration(n_clusters=N_CLUSTERS,
                                 connectivity=connectivity)

    # actual cross validation
    folds = cross_validation.KFold(len(y_new), n_folds=n_folds)
    accuracy = []
    for train_idx, test_idx in folds:
        x_train, y_train = x[train_idx], y_new[train_idx]
        x_test, y_test = x[test_idx], y_new[test_idx]

        if clustering:
            # the parcellation is learnt on the training fold only
            ward.fit(x_train)
            x_train = ward.transform(x_train)
            x_test = ward.transform(x_test)

        if use_ensemble:
            predicted_nb = clf_nb.fit(x_train, y_train).predict(x_test)
            predicted_lr = clf_lr.fit(x_train, y_train).predict(x_test)
            predicted_svm = clf_svm.fit(x_train, y_train).predict(x_test)
            # at least two of the three models must predict 1
            votes = predicted_nb + predicted_lr + predicted_svm
            predicted = np.array(votes >= 2, dtype=int)
        else:
            predicted = clf.fit(x_train, y_train).predict(x_test)

        accuracy.append(confusion_matrix(y_test, predicted, labels=[0,1]))

    return accuracy
def classify(x, y, classifier='naive_bayes', clustering=True, n_folds=10):
    """
    Given the predictors and labels, performs multi-label classification
    with the given classifier using n-fold c.v.

    Parameters
    -----------
    x : `numpy.ndarray`
        (n_samples x n_features) array of features

    y : `numpy.ndarray`
        array of labels; it is binarized with a ``LabelBinarizer``

    classifier : str, optional
        which classifier model to use. Must be one of 'naive_bayes' |
        'decision_tree' | 'logistic_regression'. The naive Bayes and
        logistic regression models are wrapped in a OneVsRestClassifier.
        Defaults to naive_bayes.

    clustering : bool, optional
        whether to do Ward clustering or not. The number of clusters is
        the global N_CLUSTERS. The brain mask
        ('data/MNI152_T1_2mm_brain.nii.gz') is only loaded, and the
        connectivity graph only built, when this is True.
        Defaults to True.

    n_folds : int
        the number of folds of the cross validation

    Returns
    -------
    score_per_class, score_per_label : tuple of two lists
        Each list holds one entry per fold: the result of
        ``utils.score_results`` (first list) and of ``utils.label_scores``
        (second list) for that fold.
    """
    lb = preprocessing.LabelBinarizer()
    y_new = lb.fit_transform(y)

    # specify connectivity for clustering; skipped entirely when clustering
    # is disabled so that no mask is loaded and no graph is built for nothing
    ward = None
    if clustering:
        mask = nb.load(
            'data/MNI152_T1_2mm_brain.nii.gz').get_data().astype('bool')
        shape = mask.shape
        connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                           n_z=shape[2], mask=mask)
        ward = WardAgglomeration(n_clusters=N_CLUSTERS,
                                 connectivity=connectivity)

    # choose and assign appropriate classifier
    classifier_dict = {
        'naive_bayes': OneVsRestClassifier(MultinomialNB()),
        'logistic_regression':
            OneVsRestClassifier(LogisticRegression(penalty='l2')),
        'decision_tree': tree.DecisionTreeClassifier(),
    }
    clf = classifier_dict[classifier]

    kf = cross_validation.KFold(len(y_new), n_folds=n_folds)
    score_per_class = []
    score_per_label = []
    for train, test in kf:
        x_train = np.ascontiguousarray(x[train])
        y_train = np.ascontiguousarray(y_new[train])
        x_test = np.ascontiguousarray(x[test])
        y_test = np.ascontiguousarray(y_new[test])

        if clustering:
            # the parcellation is learnt on the training fold only
            ward.fit(x_train)
            x_train = ward.transform(x_train)
            x_test = ward.transform(x_test)

        model = clf.fit(x_train, y_train)
        predicted = model.predict(x_test)
        predict_prob = model.predict_proba(x_test)
        # predict_proba may return a list of arrays; normalise to an ndarray
        if isinstance(predict_prob, list):
            predict_prob = np.array(predict_prob)

        cls_scores = utils.score_results(y_test, predicted, predict_prob)
        label_scores = utils.label_scores(y_test, predicted, predict_prob)
        score_per_class.append(cls_scores)
        score_per_label.append(label_scores)

    return (score_per_class, score_per_label)