Exemplo n.º 1
0
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    Reduce masked fMRI data to `output_size` features with Ward clustering.

    A voxel adjacency graph is derived from the mask grid and used to
    constrain a Ward feature agglomeration, so only neighbouring voxels
    can be merged.  Each cluster is then summarised by the estimator's
    pooling function (the mean, by scikit-learn's default).

    Parameters
    ----------
    fmri_masked : array_like, 2-D
        The functional dataset that needs to be reduced.  The columns are
        the features being agglomerated, so there must be exactly one
        column per voxel selected by `mask_np` (presumably the layout is
        timepoints x voxels -- confirm against the caller).
    mask_img : object
        Image object of the mask.  It is not used by the computation and
        is only kept so that existing callers keep working.
    mask_np : numpy.ndarray, 3-D
        The mask as an array; its three dimensions define the voxel grid
        from which the adjacency graph is built.
    output_size : integer
        The number of elements (clusters) that the data should be
        reduced to.

    Returns
    -------
    dict
        ``'data'``   : the reduced data, one column per cluster.
        ``'labels'`` : the cluster label assigned to every input column.
    """
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.feature_extraction import image

    # Only voxels adjacent on the 3-D grid (and inside the mask) are linked.
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)

    return {'data': ward.transform(fmri_masked), 'labels': ward.labels_}
Exemplo n.º 2
0
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    """
    Compress masked fMRI data with spatially constrained Ward clustering.

    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
        The functional dataset that needs to be reduced.  Its columns are
        the features that get agglomerated, one per voxel selected by
        `mask_np` (presumably timepoints x voxels -- confirm with caller).
    mask_img : an nibabel img object of the mask
        Not used by this function.
    mask_np : a numpy array of the mask
        Three-dimensional; its shape defines the voxel grid used to build
        the adjacency graph.
    compression_dim : integer
        The number of elements that the data should be reduced to.

    Returns
    -------
    dict
        ``'compressor'`` : the fitted ``FeatureAgglomeration`` estimator.
        ``'compressed'`` : the reduced data, one column per cluster.
        ``'labels'``     : the cluster label of every input column.

    """

    from sklearn.cluster import FeatureAgglomeration
    from sklearn.feature_extraction import image

    # Adjacency graph of the masked voxel grid; it restricts which
    # features the Ward agglomeration is allowed to merge.
    dims = mask_np.shape
    voxel_graph = image.grid_to_graph(dims[0], dims[1], dims[2],
                                      mask=mask_np)

    compressor = FeatureAgglomeration(n_clusters=compression_dim,
                                      connectivity=voxel_graph,
                                      linkage='ward')
    compressor.fit(fmri_masked)

    result = {'compressor': compressor}
    result['compressed'] = compressor.transform(fmri_masked)
    result['labels'] = compressor.labels_
    return result
Exemplo n.º 3
0
def cross_cluster_timeseries(data1, data2, roi_mask_nparray, n_clusters, similarity_metric, affinity_threshold, cluster_method = 'ward'):
    """
    Cluster a timeseries dataset based on its relationship to a second timeseries dataset

    The columns of `data1` are compared with the columns of `data2`
    (``cdist``), turning every column of `data1` into a profile of
    similarities to `data2`.  The columns of `data1` are then clustered
    according to how alike those profiles are.

    Parameters
    ----------
    data1 : array_like, 2-D
        This is the matrix to receive cluster assignment; one label is
        produced per column.
    data2 : array_like, 2-D
        This is the matrix with which distances will be calculated to
        assign clusters to data1.  It must have the same number of rows as
        `data1`.
    roi_mask_nparray : numpy.ndarray (3-D), 'empty' or None
        Mask of the region being clustered.  When given, Ward clustering
        is restricted to spatially adjacent voxels of the mask grid.  The
        string ``'empty'`` (or ``None``) means "no spatial constraint".
        Only used when `cluster_method` is ``'ward'``.
    n_clusters : integer
        Number of clusters
    similarity_metric : str
        Distance metric handed to ``scipy.spatial.distance.cdist``, e.g.
        'euclidean', 'correlation', 'minkowski', 'cityblock', 'seuclidean'.
        Similarity is taken as ``1 - distance``.
    affinity_threshold : float
        Similarities below this value are set to 0.  It is applied both to
        the data1-vs-data2 similarities and to the final similarity matrix.
    cluster_method : str, optional
        ``'ward'`` (default) selects Ward feature agglomeration; any other
        value selects spectral clustering on the precomputed similarity
        matrix.

    Returns
    -------
    y_pred : numpy.ndarray of int
        Predicted cluster labels, one per column of `data1`.

    Examples
    --------
    Illustrative call only (not a doctest)::

        data1 = np.random.randn(30, 700)
        data2 = np.random.randn(30, 500)
        labels = cross_cluster_timeseries(data1, data2, 'empty', 3,
                                          'correlation', 0.0)

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering
    """
    import numpy as np
    from scipy.spatial.distance import cdist, pdist, squareform
    from sklearn.cluster import FeatureAgglomeration, SpectralClustering
    from sklearn.feature_extraction import image
    from sklearn.preprocessing import normalize

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    dist_btwn_data_1_2 = cdist(np.asarray(data1).T, np.asarray(data2).T,
                               metric=similarity_metric)
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < affinity_threshold] = 0

    print("Calculating pairwise distances between voxels in ROI 1 ")
    dist_matrix = squareform(pdist(sim_btwn_data_1_2, metric='euclidean'))
    # Row-wise max-normalisation maps the distances into [0, 1].
    sim_matrix = 1 - normalize(dist_matrix, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if cluster_method == 'ward':
        print("ward")

        # The 'empty' sentinel has to be detected with a type check:
        # comparing an ndarray with a string is element-wise (or an error)
        # on current NumPy and cannot be used as an ``if`` condition.
        no_mask = roi_mask_nparray is None or (
            isinstance(roi_mask_nparray, str) and roi_mask_nparray == 'empty')

        if no_mask:
            print("Calculating Hierarchical Cross-clustering")
            # Euclidean is the estimator's default metric; it is left
            # implicit because the ``affinity`` keyword no longer exists
            # in recent scikit-learn releases.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        else:
            shape = roi_mask_nparray.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_nparray)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')

        ward.fit(sim_matrix)
        # ``np.int`` was only an alias of the builtin and has been removed.
        y_pred = ward.labels_.astype(int)
    else:
        print("spectral")
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      random_state=5,
                                      affinity="precomputed",
                                      assign_labels='discretize')
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    return y_pred
Exemplo n.º 4
0
def cluster_timeseries(X, roi_mask_nparray, n_clusters, similarity_metric, affinity_threshold, cluster_method = 'ward'):
    """
    Cluster a given timeseries

    Pairwise distances between the columns of `X` are converted into a
    thresholded similarity matrix, which is then clustered.

    Parameters
    ----------
    X : array_like, 2-D
        The data to cluster; one label is produced per column.
    roi_mask_nparray : numpy.ndarray (3-D), 'empty' or None
        Mask of the region being clustered.  When given, Ward clustering
        is restricted to spatially adjacent voxels of the mask grid.  The
        string ``'empty'`` (or ``None``) means "no spatial constraint".
        Only used when `cluster_method` is ``'ward'``.
    n_clusters : integer
        Number of clusters
    similarity_metric : str
        Distance metric handed to ``scipy.spatial.distance.pdist``
        (e.g. 'correlation', 'euclidean').
    affinity_threshold : float
        Entries of the similarity matrix below this value are set to 0.
    cluster_method : str, optional
        ``'ward'`` (default) selects Ward feature agglomeration; any other
        value selects spectral clustering on the precomputed similarity
        matrix.

    Returns
    -------
    y_pred : numpy.ndarray of int
        Predicted cluster labels, one per column of `X`.

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering
    """
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    from sklearn.cluster import FeatureAgglomeration, SpectralClustering
    from sklearn.feature_extraction import image
    from sklearn.preprocessing import normalize

    print('Beginning Calculating pairwise distances between voxels')

    X = np.array(X)
    X_dist = pdist(X.T, metric=similarity_metric)

    # Undefined distances (NaN, e.g. correlation with a constant series)
    # are treated as "as far apart as possible".  The maximum has to be
    # taken over the finite entries only, before anything is overwritten.
    undefined = np.isnan(X_dist)
    if undefined.any():
        if undefined.all():
            X_dist[:] = 0
        else:
            X_dist[undefined] = X_dist[~undefined].max()

    X_dist = squareform(X_dist)
    # Row-wise max-normalisation maps the distances into [0, 1].
    sim_matrix = 1 - normalize(X_dist, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if cluster_method == 'ward':
        print("ward")

        # The 'empty' sentinel has to be detected with a type check:
        # comparing an ndarray with a string is element-wise (or an error)
        # on current NumPy and cannot be used as an ``if`` condition.
        no_mask = roi_mask_nparray is None or (
            isinstance(roi_mask_nparray, str) and roi_mask_nparray == 'empty')

        if no_mask:
            print("Calculating Hierarchical Clustering")
            # Euclidean is the estimator's default metric; it is left
            # implicit because the ``affinity`` keyword no longer exists
            # in recent scikit-learn releases.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        else:
            shape = roi_mask_nparray.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_nparray)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')

        ward.fit(sim_matrix)
        # ``np.int`` was only an alias of the builtin and has been removed.
        y_pred = ward.labels_.astype(int)
    else:
        print("spectral")
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      random_state=5,
                                      affinity="precomputed",
                                      assign_labels='discretize')
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    return y_pred
Exemplo n.º 5
0
def cross_cluster_timeseries(data1,
                             data2,
                             roi_mask_data,
                             n_clusters,
                             similarity_metric,
                             affinity_threshold,
                             cluster_method='ward',
                             random_state=None):
    """
    Cluster a timeseries dataset based on its relationship
    to a second timeseries dataset

    The columns of `data1` are compared with the columns of `data2`
    (``cdist``), turning every column of `data1` into a profile of
    distances to `data2`.  The columns of `data1` are then clustered
    according to how alike those profiles are.

    Parameters
    ----------
    data1 : array_like, 2-D
        This is the matrix to receive cluster assignment; one label is
        produced per column.
    data2 : array_like, 2-D
        This is the matrix with which distances will be calculated to
        assign clusters to data1.  It must have the same number of rows as
        `data1`.
    roi_mask_data : array_like or None
        An array that contains a binary mask of the region of interest
        (ROI) being parcellated.  Only used by the 'ward' method, where it
        restricts merging to spatially adjacent voxels; ``None`` disables
        the constraint.
    n_clusters : integer
        Number of clusters
    similarity_metric : str
        Distance metric handed to ``scipy.spatial.distance.cdist``, e.g.
        'euclidean', 'correlation', 'minkowski', 'cityblock', 'seuclidean'.
    affinity_threshold : float
        Entries of the final similarity matrix below this value are set
        to 0.
    cluster_method : {'ward', 'spectral', 'kmeans', 'gaussianmixture'}
        A string that says which cluster method to use (case-insensitive).
    random_state : integer or None
        Seed forwarded to the spectral, k-means and Gaussian-mixture
        estimators; it has no effect on 'ward'.

    Returns
    -------
    y_pred : array_like
        Predicted cluster labels, one per column of `data1`.

    Raises
    ------
    ValueError
        If `cluster_method` is not one of the supported methods.

    Examples
    --------
    Illustrative call only (not a doctest)::

        data1 = np.random.randn(30, 700)
        data2 = np.random.randn(30, 500)
        labels = cross_cluster_timeseries(data1, data2, None, 3,
                                          'correlation', 0.0)

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering

    """
    # Reject an unsupported method before doing any work: otherwise the
    # quadratic distance computation runs first and the call only fails
    # at the very end with an unrelated-looking UnboundLocalError.
    method = str(cluster_method).lower()
    if method not in ('ward', 'spectral', 'kmeans', 'gaussianmixture'):
        raise ValueError(
            "Unknown value for cluster_method: '%s'. Expected one of "
            "'ward', 'spectral', 'kmeans' or 'gaussianmixture'."
            % (cluster_method,))

    import numpy as np
    from scipy.spatial.distance import pdist, cdist, squareform
    from sklearn.preprocessing import normalize
    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration, KMeans, SpectralClustering
    from sklearn.mixture import GaussianMixture

    dist_btwn_data_1_2 = np.array(
        cdist(np.asarray(data1).T, np.asarray(data2).T,
              metric=similarity_metric))

    # Undefined distances (NaN) count as "as far apart as possible".
    max_dist = np.nanmax(dist_btwn_data_1_2)
    dist_btwn_data_1_2[np.isnan(dist_btwn_data_1_2)] = max_dist

    dist_of_1 = pdist(dist_btwn_data_1_2, metric='euclidean')
    dist_matrix = squareform(dist_of_1)
    # Row-wise max-normalisation maps the distances into [0, 1].
    sim_matrix = 1 - normalize(dist_matrix, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if method == 'ward':
        # Euclidean is the estimator's default metric; it is left implicit
        # because the ``affinity`` keyword no longer exists in recent
        # scikit-learn releases.
        ward_kwargs = {'n_clusters': n_clusters, 'linkage': 'ward'}
        if roi_mask_data is not None:
            shape = roi_mask_data.shape
            ward_kwargs['connectivity'] = image.grid_to_graph(
                n_x=shape[0],
                n_y=shape[1],
                n_z=shape[2],
                mask=roi_mask_data)

        ward = FeatureAgglomeration(**ward_kwargs)
        ward.fit(sim_matrix)
        # ``np.int`` was only an alias of the builtin and has been removed.
        y_pred = ward.labels_.astype(int)

    elif method == 'spectral':
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="precomputed",
                                      assign_labels='discretize',
                                      random_state=random_state)
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    elif method == 'kmeans':
        kmeans = KMeans(n_clusters=n_clusters,
                        init='k-means++',
                        n_init=10,
                        random_state=random_state)
        kmeans.fit(sim_matrix)
        y_pred = kmeans.labels_.astype(int)

    else:  # 'gaussianmixture'
        gaussianmixture = GaussianMixture(n_components=n_clusters,
                                          init_params='kmeans',
                                          random_state=random_state)
        y_pred = gaussianmixture.fit_predict(sim_matrix)

    return y_pred
Exemplo n.º 6
0
def cluster_timeseries(X,
                       roi_mask_data,
                       n_clusters,
                       similarity_metric,
                       affinity_threshold,
                       cluster_method='ward',
                       random_state=None):
    """
    Cluster a given timeseries

    Pairwise distances between the columns of `X` are converted into a
    thresholded similarity matrix, which is then clustered.

    Parameters
    ----------
    X : array_like, 2-D
        The data to cluster; one label is produced per column.
    roi_mask_data : array_like or None
        An array that contains a binary mask of the region of interest
        (ROI) being parcellated.  Only used by the 'ward' method, where it
        restricts merging to spatially adjacent voxels; ``None`` disables
        the constraint.
    n_clusters : integer
        Number of clusters
    similarity_metric : str
        Distance metric handed to ``scipy.spatial.distance.pdist``
        (e.g. 'correlation', 'euclidean').
    affinity_threshold : float
        Entries of the similarity matrix below this value are set to 0.
    cluster_method : {'ward', 'spectral', 'kmeans', 'gaussianmixture'}
        A string that says which cluster method to use (case-insensitive).
    random_state : integer or None
        Seed forwarded to the spectral, k-means and Gaussian-mixture
        estimators; it has no effect on 'ward'.

    Returns
    -------
    y_pred : array_like
        Predicted cluster labels, one per column of `X`.

    Raises
    ------
    ValueError
        If `cluster_method` is not one of the supported methods.

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    http://scikit-learn.org/stable/modules/clustering.html

    """
    # Reject an unsupported method before doing any work: otherwise the
    # quadratic distance computation runs first and the call only fails
    # at the very end with an unrelated-looking UnboundLocalError.
    method = str(cluster_method).lower()
    if method not in ('ward', 'spectral', 'kmeans', 'gaussianmixture'):
        raise ValueError(
            "Unknown value for cluster_method: '%s'. Expected one of "
            "'ward', 'spectral', 'kmeans' or 'gaussianmixture'."
            % (cluster_method,))

    import numpy as np
    # Import the submodule members explicitly: ``import scipy`` /
    # ``import sklearn`` alone do not guarantee that ``scipy.spatial`` and
    # ``sklearn.preprocessing`` are loaded as attributes.
    from scipy.spatial.distance import pdist, squareform
    from sklearn.preprocessing import normalize
    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration, SpectralClustering, KMeans
    from sklearn.mixture import GaussianMixture

    X = np.array(X)
    X_dist = pdist(X.T, metric=similarity_metric)
    # Undefined distances (NaN) count as "as far apart as possible".
    max_dist = np.nanmax(X_dist)

    X_dist = squareform(X_dist)
    X_dist[np.isnan(X_dist)] = max_dist

    # Row-wise max-normalisation maps the distances into [0, 1].
    sim_matrix = 1 - normalize(X_dist, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    print("Calculating Hierarchical Clustering")

    if method == 'ward':
        # Euclidean is the estimator's default metric; it is left implicit
        # because the ``affinity`` keyword no longer exists in recent
        # scikit-learn releases.
        ward_kwargs = {'n_clusters': n_clusters, 'linkage': 'ward'}
        if roi_mask_data is not None:
            shape = roi_mask_data.shape
            ward_kwargs['connectivity'] = image.grid_to_graph(
                n_x=shape[0],
                n_y=shape[1],
                n_z=shape[2],
                mask=roi_mask_data)

        ward = FeatureAgglomeration(**ward_kwargs)
        ward.fit(sim_matrix)
        # ``np.int`` was only an alias of the builtin and has been removed.
        y_pred = ward.labels_.astype(int)

    elif method == 'spectral':
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="precomputed",
                                      assign_labels='discretize',
                                      random_state=random_state)
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    elif method == 'kmeans':
        kmeans = KMeans(n_clusters=n_clusters,
                        init='k-means++',
                        n_init=10,
                        random_state=random_state)
        kmeans.fit(sim_matrix)
        y_pred = kmeans.labels_.astype(int)

    else:  # 'gaussianmixture'
        gaussianmixture = GaussianMixture(n_components=n_clusters,
                                          init_params='kmeans',
                                          random_state=random_state)
        y_pred = gaussianmixture.fit_predict(sim_matrix)

    return y_pred