def test_ward_agglomeration():
    """Check that WardAgglomeration gives the expected result in a simple case.

    A 5-cluster Ward agglomeration is fitted on random data whose 100
    features are laid out on a 10 x 10 grid.  The test checks the number of
    distinct labels and that ``transform`` / ``inverse_transform`` are
    consistent with each other.
    """
    rnd = np.random.RandomState(0)
    # ``np.bool`` was a plain alias of the builtin and has been removed from
    # recent NumPy releases; the builtin ``bool`` yields the same dtype.
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
    ward.fit(X)
    assert_true(np.size(np.unique(ward.labels_)) == 5)

    # The reduced representation has one column per cluster.
    Xred = ward.transform(X)
    assert_true(Xred.shape[1] == 5)
    # Once expanded back, a sample only holds one distinct value per cluster.
    Xfull = ward.inverse_transform(Xred)
    assert_true(np.unique(Xfull[0]).size == 5)
    # Reducing the expanded data must give back the reduced data.
    assert_array_almost_equal(ward.transform(Xfull), Xred)
Пример #2
0
def prepare_data(imgs, connectivity, mask, n_clusters=5000, n_components=100):
    """Mask the images, parcellate them with Ward and stack them per subject.

    Relies on the module-level names ``nifti_masker``, ``subject_label``,
    ``n_subjects`` and ``do_parcel_connectivity``.

    Returns the tuple ``(cube, ward, parcel_connectivity)``: the per-subject
    stack of parcel signals, the fitted Ward object and the output of
    ``do_parcel_connectivity``.
    """
    masked = nifti_masker.fit_transform(imgs)

    # Ward is fitted on a PCA-compressed copy of the data: the PCA runs on
    # the transposed array and its output is transposed back.
    reducer = RandomizedPCA(n_components=n_components)
    compressed = reducer.fit_transform(masked.T).T
    clusterer = WardAgglomeration(n_clusters=n_clusters,
                                  connectivity=connectivity,
                                  memory='nilearn_cache')
    ward = clusterer.fit(compressed)

    # The parcel signals themselves come from the full masked data.
    parcel_signals = ward.transform(masked)
    del masked

    # One entry per subject: the rows of that subject, selected by label.
    subjects = np.arange(n_subjects)
    cube = np.array(
        [parcel_signals[subject_label == subject] for subject in subjects])

    parcel_connectivity = do_parcel_connectivity(mask, n_clusters, ward)
    return cube, ward, parcel_connectivity
Пример #3
0
def prepare_data(imgs, connectivity, mask, n_clusters=5000, n_components=100):
    """Build the per-subject cube of Ward parcel signals from raw images.

    Uses the module-level ``nifti_masker``, ``subject_label``, ``n_subjects``
    and ``do_parcel_connectivity``.  Returns
    ``(cube, ward, parcel_connectivity)``.
    """
    voxel_data = nifti_masker.fit_transform(imgs)

    # Fit the parcellation on a PCA-reduced version of the data (PCA applied
    # to the transpose, result transposed back).
    pca_model = RandomizedPCA(n_components=n_components)
    voxel_data_pca = pca_model.fit_transform(voxel_data.T).T
    ward_model = WardAgglomeration(n_clusters=n_clusters,
                                   connectivity=connectivity,
                                   memory='nilearn_cache')
    ward = ward_model.fit(voxel_data_pca)

    # Parcel averages are taken on the un-reduced masked data.
    parcels = ward.transform(voxel_data)
    del voxel_data

    # Group the rows subject by subject into a single array.
    per_subject = []
    for subject in np.arange(n_subjects):
        per_subject.append(parcels[subject_label == subject])
    cube = np.array(per_subject)

    parcel_connectivity = do_parcel_connectivity(mask, n_clusters, ward)
    return cube, ward, parcel_connectivity
Пример #4
0
# Script fragment: show slice 20 of the first masked volume, then the same
# slice after compressing the data with the Ward parcellation.  Relies on
# ``nifti_masker``, ``fmri_masked``, ``mask``, ``ward`` and ``pl`` being
# defined earlier in the script.

# Bring the first masked sample back to volume space and hide the zeros.
first_epi = nifti_masker.inverse_transform(fmri_masked[0]).get_data()
first_epi = np.ma.masked_array(first_epi, first_epi == 0)
# Outside the mask: a uniform value, smaller than inside the mask
first_epi[np.logical_not(mask)] = 0.9 * first_epi[mask].min()
# Colour limits are taken from the displayed slice and reused for the
# compressed image below so that both figures share the same scale.
vmax = first_epi[..., 20].max()
vmin = first_epi[..., 20].min()
# NOTE(review): ``pl.cm.spectral`` no longer exists in recent Matplotlib
# releases (it was renamed ``nipy_spectral``) -- confirm the target version.
pl.imshow(np.rot90(first_epi[..., 20]),
          interpolation='nearest', cmap=pl.cm.spectral, vmin=vmin, vmax=vmax)
pl.axis('off')
pl.title('Original (%i voxels)' % fmri_masked.shape[1])

# Reduced data can be created by taking the parcel-level average.
# Note that, like many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension.
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation:
# expand the reduced data back to voxel space, then to a volume.
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed = nifti_masker.inverse_transform(
    fmri_compressed[0]).get_data()
compressed = np.ma.masked_equal(compressed, 0)


pl.figure()
pl.imshow(np.rot90(compressed[:, :, 20]),
          interpolation='nearest', cmap=pl.cm.spectral, vmin=vmin, vmax=vmax)
# NOTE(review): the parcel count in the title is a literal -- confirm it
# matches the number of clusters ``ward`` was built with.
pl.title('Compressed representation (2000 parcels)')
pl.axis('off')
pl.show()
Пример #5
0
# Script fragment: plot the Ward parcellation, then the first volume before
# and after compression with it.  Relies on ``labels_img``,
# ``mean_func_img``, ``nifti_masker``, ``fmri_masked``, ``ward`` and ``plt``
# being defined earlier in the script.

# Keep the returned display: its cut coordinates are reused below so that
# all three figures show the same cuts.
first_plot = plot_roi(labels_img,
                      mean_func_img,
                      title="Ward parcellation",
                      display_mode='xz')
# labels_img is a Nifti1Image object; it can be saved to file with the
# following code:
labels_img.to_filename('parcellation.nii')

# Display the original data
plot_epi(nifti_masker.inverse_transform(fmri_masked[0]),
         cut_coords=first_plot.cut_coords,
         title='Original (%i voxels)' % fmri_masked.shape[1],
         display_mode='xz')

# Reduced data can be created by taking the parcel-level average.
# Note that, like many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension.
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation:
# expand the reduced data back to voxel space, then to an image.
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed_img = nifti_masker.inverse_transform(fmri_compressed[0])

# NOTE(review): the parcel count in the title is a literal -- confirm it
# matches the number of clusters ``ward`` was built with.
plot_epi(compressed_img,
         cut_coords=first_plot.cut_coords,
         title='Compressed representation (2000 parcels)',
         display_mode='xz')

plt.show()
Пример #6
0
def feature_extractor(imgfile,
                      maskfile,
                      featurefile,
                      maskerfile,
                      wardfile,
                      nclusters=(1000, ),
                      selectfile=None,
                      targetfile=None,
                      metafile=None,
                      cachefile=None):
    """Mask an fMRI image and save Ward / univariate features per CV fold.

    The image is masked with a ``NiftiMasker`` and a voxel connectivity graph
    is built from the mask.  Then, for every cross-validation fold and every
    requested cluster count, a ``WardAgglomeration`` followed by a
    ``SelectPercentile(f_classif, percentile=30)`` selector is fitted on the
    training samples and applied to both the training and the test samples.

    Parameters
    ----------
    imgfile
        Image(s) handed to ``NiftiMasker.fit_transform``.
    maskfile
        Mask handed to ``NiftiMasker`` as ``mask``.
    featurefile : str
        Prefix of the ``.npz`` outputs: ``<featurefile>_main.npz`` (inputs,
        mask, masked data, connectivity) and one ``<featurefile>_cv<k>.npz``
        per fold (reduced / selected features and the pickle file names).
    maskerfile
        Destination of the fitted masker (written with ``joblib.dump``).
    wardfile : str
        Prefix of the fitted Ward pickles, ``<wardfile>_D<n>_cv<k>.pkl``.
    nclusters : iterable of int
        Cluster counts to run for every fold.
    selectfile : str
        Prefix of the fitted selector pickles,
        ``<selectfile>_D<n>_cv<k>.pkl``.  Despite the ``None`` default a
        string is required: it is concatenated with the suffix.
    targetfile
        File readable by ``np.load`` holding the targets under ``"ymap"``.
        Required despite the ``None`` default.
    metafile
        File readable by ``np.load`` with the entries ``"train"`` and
        ``"test"`` (sample indices, indexed by fold) and ``"ycv"`` (number of
        folds).  Required despite the ``None`` default.
    cachefile
        Passed as ``memory`` to both the masker and the Ward objects.

    Returns
    -------
    None
        All results are written to disk.
    """
    resultdict = {"imgfile": imgfile, "maskfile": maskfile}
    # load data
    # Single-argument ``print(...)`` behaves identically on Python 2 and 3.
    print("--loading data")
    nifti_masker = input_data.NiftiMasker(mask=maskfile,
                                          memory=cachefile,
                                          memory_level=1,
                                          standardize=False)
    fmri_masked = nifti_masker.fit_transform(imgfile)
    print("--getting mask")
    # ``np.bool`` was an alias of the builtin and is gone from recent NumPy.
    mask = nifti_masker.mask_img_.get_data().astype(bool)

    # save the fitted masker and remember the masked data
    joblib.dump(nifti_masker, maskerfile)
    resultdict["mask"] = mask
    resultdict["Xmask"] = fmri_masked
    resultdict["maskerfile"] = maskerfile

    # get connectivity
    print("--getting connectivity")
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask)
    # save everything that does not depend on the fold
    resultdict["connectivity"] = connectivity
    print("--save main file")
    np.savez(featurefile + "_main.npz", **resultdict)

    # run ward
    y = np.load(targetfile)["ymap"]
    meta = np.load(metafile)
    train = meta["train"]
    test = meta["test"]
    ncv = meta['ycv']

    # for each cv set
    for cvx in range(ncv):
        trainidx = train[cvx]
        testidx = test[cvx]
        # start a fresh result set for this fold
        resultdict = {}
        wardfiles = []
        selectfiles = []
        print("--Running ward %d" % (cvx, ))
        for nc in nclusters:
            # The parcellation is learnt on the training samples only.
            ward = WardAgglomeration(n_clusters=nc,
                                     connectivity=connectivity,
                                     memory=cachefile)
            ward.fit(fmri_masked[trainidx])
            fmri_reduced_train = ward.transform(fmri_masked[trainidx])
            fmri_reduced_test = ward.transform(fmri_masked[testidx])

            # save the fitted ward and the reduced features
            subwardfile = wardfile + "_D%d_cv%d.pkl" % (
                nc,
                cvx,
            )
            joblib.dump(ward, subwardfile)
            resultdict["Xward_%d_train" % (nc, )] = fmri_reduced_train
            resultdict["Xward_%d_test" % (nc, )] = fmri_reduced_test
            wardfiles.append(subwardfile)

            # additional feature selection, also fitted on training data only
            selector = SelectPercentile(f_classif, percentile=30)
            selector.fit(fmri_reduced_train, y[trainidx])
            fmri_select_train = selector.transform(fmri_reduced_train)
            fmri_select_test = selector.transform(fmri_reduced_test)

            # save the fitted selector and the selected features
            subselectfile = selectfile + "_D%d_cv%d.pkl" % (
                nc,
                cvx,
            )
            joblib.dump(selector, subselectfile)
            resultdict["Xselect_%d_train" % (nc, )] = fmri_select_train
            resultdict["Xselect_%d_test" % (nc, )] = fmri_select_test
            selectfiles.append(subselectfile)

        resultdict["wardfiles"] = wardfiles
        resultdict["selectfiles"] = selectfiles

        # save results
        print("--save cv result")
        np.savez(featurefile + "_cv%d.npz" % (cvx, ), **resultdict)
# Script fragment: draw the Ward parcellation, the original first volume and
# its parcel-compressed version (slice 20 in each case).  Relies on
# ``labels``, ``mask``, ``ward``, ``epi_img``, ``epi_masked`` and ``pl``
# being defined earlier in the script.

# Write the cluster label of every in-mask voxel into the label volume.
labels[mask] = ward.labels_

# ``np.int`` was an alias of the builtin and has been removed from recent
# NumPy releases; the builtin ``int`` yields the same dtype.
cut = labels[:, :, 20].astype(int)
# One random RGB colour per cluster plus one extra, black, last row.
# NOTE(review): presumably out-of-mask voxels carry the label -1 so that
# they index this last row -- confirm against how ``labels`` is initialised.
colors = np.random.random(size=(ward.n_clusters + 1, 3))
colors[-1] = 0
pl.axis('off')
pl.imshow(colors[cut], interpolation='nearest')
pl.title('Ward parcellation')

# Display the original data
pl.figure()
# Work on a copy so that zeroing the background leaves ``epi_img`` intact.
first_epi_img = epi_img[..., 0].copy()
first_epi_img[np.logical_not(mask)] = 0
pl.imshow(first_epi_img[..., 20], interpolation='nearest',
          cmap=pl.cm.spectral)
pl.axis('off')
pl.title('Original')

# Display the corresponding data compressed using the parcellation:
# reduce to parcel level, expand back to voxels, then refill a volume.
X_r = ward.transform(epi_masked.T)
X_c = ward.inverse_transform(X_r)
compressed_img = np.zeros(mask.shape)
compressed_img[mask] = X_c[0]

pl.figure()
pl.imshow(compressed_img[:, :, 20], interpolation='nearest',
          cmap=pl.cm.spectral)
pl.title('Compressed representation')
pl.axis('off')
pl.show()
Пример #8
0
def _ward_fit_transform(all_subjects_data, fit_samples_indices, connectivity,
                        n_parcels, offset_labels):
    """Fit a Ward parcellation on a subsample and reduce the whole dataset.

    Ward's clustering is computed from the selected samples only; the
    resulting parcellation is then applied to every sample to reduce the
    data to parcel level.
    The labels are returned as well, shifted by ``offset_labels``: Randomized
    Parcellation Based Inference calls this function several times and needs
    labels that stay unique across parcellations to perform the inverse
    transformation later on.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples the parcellation is computed from.

    connectivity : scipy.sparse.coo_matrix,
      Graph describing the spatial structure of the images (i.e. which
      voxels are connected).

    n_parcels : int,
      Number of parcels of the parcellation.

    offset_labels : int,
      Value added to every label, so that parcellations built by successive
      calls do not share labels.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Parcel-level signal for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Offset labels giving the correspondence between voxels and parcels.

    """
    # XXX: this import is deliberately delayed.  With scipy < 0.11 it pulls
    # in matplotlib.pyplot, which sets the matplotlib backend; whichever
    # import comes first is the only one that gets to choose it, so doing
    # this at module level would defeat the backend selection made in
    # nilearn/plotting/__init__.py and break the tests on machines without
    # X (e.g. travis-ci: "TclError: no display name and no $DISPLAY
    # environment variable").
    from sklearn.cluster import WardAgglomeration

    # Learn the parcellation from the selected samples only.
    fit_subset = all_subjects_data[fit_samples_indices]
    clusterer = WardAgglomeration(n_clusters=n_parcels,
                                  connectivity=connectivity)
    clusterer.fit(fit_subset)

    # Shift the labels so they are unique across parcellations, then reduce
    # every sample with the fitted parcellation.
    labels = clusterer.labels_ + offset_labels
    return clusterer.transform(all_subjects_data), labels
Пример #9
0
def feature_extractor(imgfile, maskfile, featurefile, maskerfile, wardfile, nclusters=(1000,), selectfile=None, targetfile=None, metafile=None, cachefile=None):
    """Mask an fMRI image and save Ward / univariate features per CV fold.

    The image is masked with a ``NiftiMasker`` and a voxel connectivity graph
    is built from the mask.  Then, for every cross-validation fold and every
    requested cluster count, a ``WardAgglomeration`` followed by a
    ``SelectPercentile(f_classif, percentile=30)`` selector is fitted on the
    training samples and applied to both the training and the test samples.

    Parameters
    ----------
    imgfile
        Image(s) handed to ``NiftiMasker.fit_transform``.
    maskfile
        Mask handed to ``NiftiMasker`` as ``mask``.
    featurefile : str
        Prefix of the ``.npz`` outputs: ``<featurefile>_main.npz`` (inputs,
        mask, masked data, connectivity) and one ``<featurefile>_cv<k>.npz``
        per fold (reduced / selected features and the pickle file names).
    maskerfile
        Destination of the fitted masker (written with ``joblib.dump``).
    wardfile : str
        Prefix of the fitted Ward pickles, ``<wardfile>_D<n>_cv<k>.pkl``.
    nclusters : iterable of int
        Cluster counts to run for every fold.
    selectfile : str
        Prefix of the fitted selector pickles,
        ``<selectfile>_D<n>_cv<k>.pkl``.  Despite the ``None`` default a
        string is required: it is concatenated with the suffix.
    targetfile
        File readable by ``np.load`` holding the targets under ``"ymap"``.
        Required despite the ``None`` default.
    metafile
        File readable by ``np.load`` with the entries ``"train"`` and
        ``"test"`` (sample indices, indexed by fold) and ``"ycv"`` (number of
        folds).  Required despite the ``None`` default.
    cachefile
        Passed as ``memory`` to both the masker and the Ward objects.

    Returns
    -------
    None
        All results are written to disk.
    """
    resultdict = {"imgfile": imgfile, "maskfile": maskfile}
    # load data
    # Single-argument ``print(...)`` behaves identically on Python 2 and 3.
    print("--loading data")
    nifti_masker = input_data.NiftiMasker(mask=maskfile, memory=cachefile, memory_level=1,
                                          standardize=False)
    fmri_masked = nifti_masker.fit_transform(imgfile)
    print("--getting mask")
    # ``np.bool`` was an alias of the builtin and is gone from recent NumPy.
    mask = nifti_masker.mask_img_.get_data().astype(bool)

    # save the fitted masker and remember the masked data
    joblib.dump(nifti_masker, maskerfile)
    resultdict["mask"] = mask
    resultdict["Xmask"] = fmri_masked
    resultdict["maskerfile"] = maskerfile

    # get connectivity
    print("--getting connectivity")
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask)
    # save everything that does not depend on the fold
    resultdict["connectivity"] = connectivity
    print("--save main file")
    np.savez(featurefile+"_main.npz", **resultdict)

    # run ward
    y = np.load(targetfile)["ymap"]
    meta = np.load(metafile)
    train = meta["train"]
    test = meta["test"]
    ncv = meta['ycv']

    # for each cv set
    for cvx in range(ncv):
        trainidx = train[cvx]
        testidx = test[cvx]
        # start a fresh result set for this fold
        resultdict = {}
        wardfiles = []
        selectfiles = []
        print("--Running ward %d"%(cvx, ))
        for nc in nclusters:
            # The parcellation is learnt on the training samples only.
            ward = WardAgglomeration(n_clusters=nc, connectivity=connectivity, memory=cachefile)
            ward.fit(fmri_masked[trainidx])
            fmri_reduced_train = ward.transform(fmri_masked[trainidx])
            fmri_reduced_test = ward.transform(fmri_masked[testidx])

            # save the fitted ward and the reduced features
            subwardfile = wardfile+"_D%d_cv%d.pkl"%(nc, cvx,)
            joblib.dump(ward, subwardfile)
            resultdict["Xward_%d_train"%(nc,)] = fmri_reduced_train
            resultdict["Xward_%d_test"%(nc,)] = fmri_reduced_test
            wardfiles.append(subwardfile)

            # additional feature selection, also fitted on training data only
            selector = SelectPercentile(f_classif, percentile=30)
            selector.fit(fmri_reduced_train, y[trainidx])
            fmri_select_train = selector.transform(fmri_reduced_train)
            fmri_select_test = selector.transform(fmri_reduced_test)

            # save the fitted selector and the selected features
            subselectfile = selectfile+"_D%d_cv%d.pkl"%(nc, cvx,)
            joblib.dump(selector, subselectfile)
            resultdict["Xselect_%d_train"%(nc,)] = fmri_select_train
            resultdict["Xselect_%d_test"%(nc,)] = fmri_select_test
            selectfiles.append(subselectfile)

        resultdict["wardfiles"] = wardfiles
        resultdict["selectfiles"] = selectfiles

        # save results
        print("--save cv result")
        np.savez(featurefile+"_cv%d.npz"%(cvx, ), **resultdict)
Пример #10
0
def classify(x, y, classifier='naive_bayes', clustering=True, n_folds=10):
    """
    Cross-validate a single-label classifier, optionally on Ward parcels.

    The labels are integer-encoded, then for each fold of a K-fold split
    the chosen model is fitted on the training part and evaluated on the
    test part through a confusion matrix.

    Parameters
    -----------
    x : `numpy.ndarray`
        (n_samples x n_features) array of features
    y : `numpy.ndarray`
        (1 x n_samples) array of labels
    classifier : str, optional
        model to use: one of 'naive_bayes' | 'svm' | 'logistic_regression' |
        'ensemble'. 'ensemble' fits the three models and keeps the
        predictions on which their summed outputs reach 2. Defaults to
        'naive_bayes'.
    clustering : bool, optional
        whether to reduce the features with Ward clustering first. The
        number of clusters is the global N_CLUSTERS and the brain mask is
        read from 'data/2mm_brain_mask.npy'. Defaults to True.
    n_folds : int
        number of cross-validation folds

    Returns
    -------
    accuracy : list
        One confusion matrix per fold, computed with labels=[0, 1].
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    targets = encoder.transform(y)

    # every available model; which one(s) get used is decided just below
    models = {'naive_bayes': MultinomialNB(),
              'logistic_regression': LogisticRegression(penalty='l2'),
              'svm': GridSearchCV(LinearSVC(), [{'C': [1, 10, 100, 1000]}])}
    use_ensemble = classifier == 'ensemble'
    clf = None
    if use_ensemble:
        # fitting order inside the folds: naive bayes, logistic, svm
        voters = (models['naive_bayes'],
                  models['logistic_regression'],
                  models['svm'])
    else:
        clf = models[classifier]

    # set up the ward clustering when requested
    ward = None
    if clustering:
        mask = np.load('data/2mm_brain_mask.npy')
        dims = mask.shape
        connectivity = image.grid_to_graph(n_x=dims[0], n_y=dims[1],
                                           n_z=dims[2], mask=mask)
        ward = WardAgglomeration(n_clusters=N_CLUSTERS,
                                 connectivity=connectivity)

    # the cross validation itself
    folds = cross_validation.KFold(len(targets), n_folds=n_folds)
    confusion_per_fold = []
    for train_idx, test_idx in folds:
        x_train, y_train = x[train_idx], targets[train_idx]
        x_test, y_test = x[test_idx], targets[test_idx]
        if clustering:
            # the parcellation is learnt on the training fold only
            ward.fit(x_train)
            x_train = ward.transform(x_train)
            x_test = ward.transform(x_test)
        if use_ensemble:
            votes = [voter.fit(x_train, y_train).predict(x_test)
                     for voter in voters]
            tally = votes[0] + votes[1] + votes[2]
            predicted = np.array(tally >= 2, dtype=int)
        else:
            predicted = clf.fit(x_train, y_train).predict(x_test)
        confusion_per_fold.append(
            confusion_matrix(y_test, predicted, labels=[0, 1]))
    return confusion_per_fold
Пример #11
0
def classify(x, y, classifier='naive_bayes', clustering=True, n_folds=10):
    """
    Given the predictors and labels, performs multi-label
    classification with the given classifier using n-fold
    c.v. The 'naive_bayes' and 'logistic_regression' models are wrapped
    in a OvR classifier; 'decision_tree' is used as is.

    Parameters
    -----------
    x : `numpy.ndarray`
        (n_samples x n_features) array of features
    y : `numpy.ndarray`
        (n_samples x n_labels) array of labels
    classifier : str, optional
        which classifier model to use. Must be one of 'naive_bayes'| 'decision_tree' | 'logistic_regression'.
        Any other value raises a KeyError. Defaults to the original naive_bayes.
    clustering : bool, optional
        whether to do Ward clustering or not. The number of clusters is the global N_CLUSTERS and
        the brain mask is read from 'data/MNI152_T1_2mm_brain.nii.gz'; both are only used when
        clustering is enabled. Defaults to True.
    n_folds : int
        the number of fold of cv

    Returns
    -------
    score_per_class, score_per_label : tuple
        Two lists with one entry per fold, holding the outputs of
        `utils.score_results` and `utils.label_scores` respectively.
    """
    lb = preprocessing.LabelBinarizer()
    y_new = lb.fit_transform(y)

    # choose and assign appropriate classifier (done first so that an
    # unknown name fails before any file is read)
    classifier_dict = {'naive_bayes': OneVsRestClassifier(MultinomialNB()),
                       'logistic_regression': OneVsRestClassifier(LogisticRegression(penalty='l2')),
                       'decision_tree': tree.DecisionTreeClassifier()
                       }
    clf = classifier_dict[classifier]

    # specify connectivity for clustering; skipped entirely when clustering
    # is disabled so that the mask file is neither required nor read
    ward = None
    if clustering:
        mask = nb.load('data/MNI152_T1_2mm_brain.nii.gz').get_data().astype('bool')
        shape = mask.shape
        connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1], n_z=shape[2], mask=mask)
        ward = WardAgglomeration(n_clusters=N_CLUSTERS, connectivity=connectivity)

    kf = cross_validation.KFold(len(y_new), n_folds=n_folds)
    score_per_class = []
    score_per_label = []
    for train, test in kf:
        x_train = np.ascontiguousarray(x[train])
        y_train = np.ascontiguousarray(y_new[train])
        x_test = np.ascontiguousarray(x[test])
        y_test = np.ascontiguousarray(y_new[test])
        if clustering:
            # the parcellation is learnt on the training fold only
            ward.fit(x_train)
            x_train = ward.transform(x_train)
            x_test = ward.transform(x_test)
        model = clf.fit(x_train, y_train)
        predicted = model.predict(x_test)
        predict_prob = model.predict_proba(x_test)
        # some models return the probabilities as a list; the scoring
        # helpers are always given an array
        if isinstance(predict_prob, list):
            predict_prob = np.array(predict_prob)
        cls_scores = utils.score_results(y_test, predicted, predict_prob)
        label_scores = utils.label_scores(y_test, predicted, predict_prob)
        score_per_class.append(cls_scores)
        score_per_label.append(label_scores)
    return (score_per_class, score_per_label)