Example #1
File: cluster.py  Project: baothien/tiensy
def view_cluster_number(input_polydata, cluster_number, cluster_indices=None):

    """ Pop up a render window showing the selected cluster.


    Uses cluster_indices to choose the corresponding cells in the
    polydata. If cluster_indices is not provided, the cell data array
    named ClusterNumber is used instead. One of these two inputs must
    be present.

    """

    if cluster_indices is None:
        cluster_indices_vtk = \
            input_polydata.GetCellData().GetArray('ClusterNumber')
        cluster_indices = numpy.zeros(cluster_indices_vtk.GetNumberOfTuples())
        for fidx in range(0, cluster_indices_vtk.GetNumberOfTuples()):
            cluster_indices[fidx] = cluster_indices_vtk.GetTuple(fidx)[0]

    fiber_mask = cluster_indices == cluster_number
    view_polydata = filter.mask(input_polydata, fiber_mask)
    ren = render.render(view_polydata)

    return ren
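
A minimal usage sketch for the function above, assuming a VTK polydata file that already carries a ClusterNumber cell data array. The file name is a placeholder, the reader is plain VTK, and view_cluster_number additionally relies on the module's filter and render helpers being importable:

import vtk

# read any polydata whose cells carry a 'ClusterNumber' array (placeholder file name)
reader = vtk.vtkXMLPolyDataReader()
reader.SetFileName('clustered_whole_brain.vtp')
reader.Update()
pd = reader.GetOutput()

# pop up a render window showing cluster 5, relying on the ClusterNumber array
ren = view_cluster_number(pd, 5)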
Example #2
File: cluster.py  Project: baothien/tiensy
def spectral(input_polydata, number_of_clusters=300,
             number_of_eigenvectors=10, sigma=20, threshold=2,
             number_of_jobs=3, use_nystrom=False, nystrom_mask = None,
             landmarks=None):

    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007.

    Differences from that implementation: fiber distance is defined
    using fixed-length fiber parameterization.

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas() 

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is smaller than 100."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError
        
        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask)
        atlas.nystrom_polydata = polydata_m
        atlas.threshold = threshold
        polydata_n = filter.mask(input_polydata, nystrom_mask == False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:',  sz, '/', number_fibers
        # Determine ordering to get embedding to correspond to original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0], numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask,:,:]
            landmarks_n = landmarks[~nystrom_mask,:,:]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs, landmarks_n, landmarks_m)
        atlas.sigma = sigma
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                    sigma, number_of_jobs, landmarks)

    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if use_nystrom:
        # calculate the sum of the rows we know from the full matrix
        atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)        
        # approximate the sum of the columns
        # weights for approximating row sum use the known columns of B'
        # pseudo inverse of A is needed for atlas (?)
        atlas.pinv_A = numpy.linalg.pinv(A)  

        e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
        print "test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]  

        # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
        # this matrix is needed for atlas.
        atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
        # row sum estimate for current B part of the matrix
        row_sum_2 = numpy.sum(B, axis=0) + \
            numpy.dot(atlas.row_sum_matrix, B)
        print "row sum check", numpy.min(atlas.row_sum_1), \
            numpy.max(atlas.row_sum_1), numpy.min(row_sum_2), \
            numpy.max(row_sum_2)
        print "check 2:", numpy.min(numpy.sum(B.T, axis=0)), numpy.min(atlas.row_sum_matrix)

        # normalized cuts normalization
        dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))

        A = \
            numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
        B = \
            numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))
    else:
        # normalized cuts normalization using row (same as column) sums
        row_sum = numpy.sum(A, axis=0)
        dhat = numpy.divide(1, numpy.sqrt(row_sum))
        A = \
            numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]    
    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print('<cluster.py> Compute embedding using eigenvectors.')
    if use_nystrom:
        # Create embedding vectors using nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2) 
        # See the paper:
        # "Spectral Grouping Using the Nystrom Method"
        # Basically all this does is add in the extra measurements
        # by projecting them onto the original eigenvector basis.
        # A=UVU' => U = AUV^-1 => take new rows of extended A (B') and do
        # the same.  U' = [AUV^-1 ; B'UV^-1] = [U ; B'UV^-1]
        # Note they divide embedding by 1st eigenvector rather
        # than sqrt of row sum, as in this code (below).

        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1 
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
                                  numpy.dot(numpy.dot(B.T, atlas.e_vec), \
                                                numpy.diag(numpy.divide(1.0, atlas.e_val)))))

        # normalize estimated eigenvectors to have length of one
        # matlab was:
        # atlas.eigenvectorLengthToNormalize=sqrt(sum(V.*V));
        # V=V./repmat(atlas.eigenvectorLengthToNormalize,length(V),1);
        atlas.e_vec_norm = numpy.sum(numpy.multiply(V, V),0)
        V = numpy.divide(V, atlas.e_vec_norm)

        # Normalize each embedding vector by first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #    embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since row sums are 1.
        # The other option from the literature was to use this:
        # embedding_i,j = V_i+i,j./sqrt(D_j,j)
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding,i] = numpy.divide(V[:,-(i+2)], V[:,-1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse order of embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]

    atlas.number_of_eigenvectors = number_of_eigenvectors

    # 5) Find clusters using k-means in embedding space.
    print '<cluster.py> K-means clustering in embedding space.'
    atlas.centroids, distortion = scipy.cluster.vq.kmeans(embed, number_of_clusters)
    cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)

    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # set up polydata with clustering output info.
    # for now modify input polydata by adding two arrays
    output_polydata = input_polydata
    output_polydata = \
        _format_output_polydata(output_polydata, cluster_idx, color, embed)

    return output_polydata, cluster_idx, color, embed, distortion, atlas
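
The normalized-cuts step above rescales the affinity matrix as D^(-1/2) W D^(-1/2), where D holds the row sums. A small self-contained numpy sketch of that normalization on a toy symmetric affinity matrix (illustrative only; the Nystrom branch additionally approximates the row sums of the uncomputed block):

import numpy

rng = numpy.random.RandomState(0)
W = rng.rand(6, 6)
W = (W + W.T) / 2.0                       # toy symmetric, positive affinity matrix

row_sum = numpy.sum(W, axis=0)            # degrees D_ii (row sums equal column sums)
dhat = numpy.divide(1.0, numpy.sqrt(row_sum))
W_norm = numpy.multiply(W, numpy.outer(dhat, dhat))   # D^(-1/2) W D^(-1/2)

e_val, e_vec = numpy.linalg.eigh(W_norm)
# the largest eigenvalue is 1; the top eigenvectors provide the spectral embedding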
Example #3
def spectral(input_polydata, number_of_clusters=200,
             number_of_eigenvectors=20, sigma=60, threshold=0.0,
             number_of_jobs=3, use_nystrom=False, nystrom_mask=None,
             landmarks=None, distance_method='Mean', normalized_cuts=True, bilateral=False):

    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007.

    Differences from that implementation: fiber point correspondences are defined
    using fixed-length fiber parameterization (instead of closest point).

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas() 

    # Store all parameters to this function. They must be identical later to label new data.
    # Below, calculated values will also be stored in the atlas.
    atlas.number_of_eigenvectors = number_of_eigenvectors
    atlas.sigma = sigma
    atlas.threshold = threshold
    atlas.use_nystrom = use_nystrom
    atlas.landmarks = landmarks
    atlas.distance_method = distance_method
    atlas.bilateral = bilateral

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is smaller than 100."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError
        
        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask)
        atlas.nystrom_polydata = polydata_m
        polydata_n = filter.mask(input_polydata, nystrom_mask == False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:',  sz, '/', number_fibers
        # Determine ordering to get embedding to correspond to original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0], numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask,:,:]
            landmarks_n = landmarks[~nystrom_mask,:,:]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m, distance_method, bilateral)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs, landmarks_n, landmarks_m, distance_method, bilateral)

        # sanity check
        print "Range of values in A:", numpy.min(A), numpy.max(A)
        print "Range of values in B:", numpy.min(B), numpy.max(B)
        
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                    sigma, number_of_jobs, landmarks, distance_method, bilateral)

        atlas.nystrom_polydata = input_polydata
        # sanity check
        print "Range of values in A:", numpy.min(A), numpy.max(A)
        
    testval = numpy.max(A-A.T) 
    if not testval == 0.0:
        if testval > 1e-10:
            print "<cluster.py> ERROR: A matrix is not symmetric."
            raise AssertionError
        else:
            print "Maximum of A - A^T:", testval
            A = numpy.divide(A+A.T, 2.0)
        
    testval = numpy.min(A)
    if not testval > 0.0:
        print "<cluster.py> ERROR: A matrix is not positive."
        print "Minimum value in A: ", testval
        if testval < 0.0:
            raise AssertionError
   
    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if normalized_cuts:
        if use_nystrom:
            # Form of entire affinity matrix: 
            # A   B
            # B^T   C
            # C is not computed.
            # Calculate the sum of the partial rows we've computed:
            atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)  
            print "A size:", A.shape
            print "B size:", B.shape
            print "row sum size:", atlas.row_sum_1.shape
            test = atlas.row_sum_1
            print "A-B matrix row sums range (should be > 0):", numpy.min(atlas.row_sum_1), numpy.max(atlas.row_sum_1)
            
            # Approximate the sum of the rest of the data (including C)
            # These are weighted sums of the columns we did compute
            # where the weight depends on how similar that fiber 
            # was to each path in A.  This uses the dual basis
            # of the columns in A.
            # Approximate the inverse of A for dual basis

            
            # Use A's top eigenvectors (must have pos eigenvalues)
            # Construct an approximate inverse using the largest eigenvalues of A.
            pos_def_approx = True
            if pos_def_approx:
                print "Using A's top eigenvectors in pinv"
                numA = len(A)
                nvec = 40
                if nvec > numA / 2.0:
                    nvec = numpy.round(numA / 2)
                if nvec < number_of_eigenvectors + 1:
                    nvec = number_of_eigenvectors + 1

                val, vec = numpy.linalg.eigh(A)
                # numpy.dot(numpy.dot(vec,numpy.diag(val)),vec.T)
                ind = numpy.argsort(val)
                mask = ind[-nvec:-1]
                vec2 = vec[:,mask]
                val2 = val[mask]
                #A2 = numpy.dot(numpy.dot(vec2,numpy.diag(val2)),vec2.T)
                atlas.pinv_A = numpy.dot(numpy.dot(vec2,numpy.diag(numpy.divide(1.0,val2))),vec2.T)
            else:
                print "Using numpy linalg pinv A"
                atlas.pinv_A = numpy.linalg.pinv(A)
                
            e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
            print "test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]  
            e_val, e_vec = numpy.linalg.eigh(A)
            print "Was A positive definite? Eigenvalue range of A:", e_val[0], e_val[-1]  


            # row sum formula:
            # dhat = [a_r + b_r; b_c + B^T*A-1*b_r]
            # this matrix is A^-1 * b_r, where b_r are the row sums of B
            # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
            atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
            test = numpy.sum(B.T, axis=0)
            print "B column sums range (should be > 0):", numpy.min(test), numpy.max(test)
            print "Range of row sum weights:", numpy.min(atlas.row_sum_matrix), numpy.max(atlas.row_sum_matrix)
            print "First 10 entries in weight matrix:", atlas.row_sum_matrix[0:10]
            test = numpy.dot(atlas.row_sum_matrix, B)
            print "Test partial sum estimation for B:", numpy.min(test), numpy.max(test)
            
            # row sum estimate for current B part of the matrix
            row_sum_2 = numpy.sum(B, axis=0) + \
                numpy.dot(atlas.row_sum_matrix, B)
            print "Row sum check (min/max, should be > 0) A:", numpy.min(atlas.row_sum_1), \
                numpy.max(atlas.row_sum_1),  "B:", numpy.min(row_sum_2), \
                numpy.max(row_sum_2)

            print atlas.row_sum_1.shape
            print row_sum_2.shape
            
            # normalized cuts normalization
            dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))

            A = \
                numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
            B = \
                numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))
        else:
            # normalized cuts normalization using row (same as column) sums
            row_sum = numpy.sum(A, axis=0)
            dhat = numpy.divide(1, numpy.sqrt(row_sum))
            A = \
                numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]    
    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print('<cluster.py> Compute embedding using eigenvectors.')
    if use_nystrom:
        # Create embedding vectors using nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2) 
        # See the paper:
        # "Spectral Grouping Using the Nystrom Method"
        # Basically all this does is add in the extra measurements
        # by projecting them onto the original eigenvector basis.
        # A=UVU' => U = AUV^-1 => take new rows of extended A (B') and do
        # the same.  U' = [AUV^-1 ; B'UV^-1] = [U ; B'UV^-1]
        # Note they divide embedding by 1st eigenvector rather
        # than sqrt of row sum, as in this code (below).

        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1 
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
                                  numpy.dot(numpy.dot(B.T, atlas.e_vec), \
                                                numpy.diag(numpy.divide(1.0, atlas.e_val)))))

        # normalize estimated eigenvectors to have length of one
        # matlab was:
        # atlas.eigenvectorLengthToNormalize=sqrt(sum(V.*V));
        # V=V./repmat(atlas.eigenvectorLengthToNormalize,length(V),1);
        atlas.e_vec_norm = numpy.sum(numpy.multiply(V, V),0)
        V = numpy.divide(V, atlas.e_vec_norm)

        # Normalize each embedding vector by first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #    embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since row sums are 1.
        # The other option from the literature was to use this:
        # embedding_i,j = V_i+i,j./sqrt(D_j,j)
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding,i] = numpy.divide(V[:,-(i+2)], V[:,-1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse order of embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]


    # Default is always k-means. Other code is just left for testing. Did not improve results.
    #centroid_finder = 'AffinityPropagation'
    centroid_finder = 'K-means'
    
    # 5) Find clusters using k-means in embedding space.
    cluster_metric = None
    if centroid_finder == 'K-means':
        print '<cluster.py> K-means clustering in embedding space.'
        atlas.centroids, cluster_metric = scipy.cluster.vq.kmeans(embed, number_of_clusters)
        cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)
        print "Distortion metric:", cluster_metric
        if 0:
            # This is extremely slow, but leave code here if ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, cluster_idx, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)
 
    else:
        # This found fewer clusters than we need to represent the anatomy well
        # Leave code here in case wanted in future for more testing.
        print '<cluster.py> Affinity Propagation clustering in embedding space.'
        af = AffinityPropagation(preference=-50).fit(embed)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        print('Estimated number of clusters: %d' % n_clusters_)
        cluster_idx = labels
        for k in range(n_clusters_):
            class_members = labels == k
            atlas.centroids = embed[cluster_centers_indices[k]]
        # return metrics
        if 0:
            # This is extremely slow, but leave code here if ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, labels, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)
        
    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    #embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # set up polydata with clustering output info.
    # for now modify input polydata by adding two arrays
    output_polydata = input_polydata
    output_polydata = \
        _format_output_polydata(output_polydata, cluster_idx, color, embed)

    return output_polydata, cluster_idx, color, embed, cluster_metric, atlas
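
The pos_def_approx branch above builds an approximate inverse of A from its largest eigenvalues instead of calling numpy.linalg.pinv directly (and, as written, drops the single largest eigenvalue via ind[-nvec:-1]). A minimal sketch of the same low-rank idea on a toy symmetric positive definite matrix:

import numpy

rng = numpy.random.RandomState(0)
X = rng.randn(30, 30)
A = numpy.dot(X, X.T) + 30.0 * numpy.eye(30)     # toy well-conditioned SPD matrix

nvec = 10
val, vec = numpy.linalg.eigh(A)                  # eigenvalues in ascending order
vec2 = vec[:, -nvec:]                            # nvec largest eigenvectors
val2 = val[-nvec:]
pinv_A_lowrank = numpy.dot(numpy.dot(vec2, numpy.diag(numpy.divide(1.0, val2))), vec2.T)

pinv_A_full = numpy.linalg.pinv(A)               # full pseudoinverse for comparison
# the low-rank version only inverts the dominant part of the spectrum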
Example #4
    def compute(self, input_vtk_polydata):
        """ Actually calculate the laterality index for every input
        fiber.

        Input polydata is required. This polydata is modified by
        adding a cell data array containing laterality indices.

        Output from this class is a struct: <io.py> class
        LateralityResults

        Parameters in the class can also be modified for experiments:
        sigma (in Gaussian on inter-fiber-point distance),
        points_per_fiber (for parameterization), threshold (below
        which inter-fiber-point distance is set to 0).

        Performance options are the number of parallel_jobs, and
        verbose (whether to print progress).

        """

        # internal representation for fast similarity computation
        # this also detects which hemisphere fibers are in
        self.fibers.points_per_fiber = self.points_per_fiber
        # must request hemisphere computation from object
        self.fibers.hemispheres = True
        # Now convert to array with points and hemispheres as above
        self.fibers.convert_from_polydata(input_vtk_polydata)

        # get the same number from each hemisphere if requested
        # -------------------------
        if self.use_equal_fibers:
            num_fibers = min(self.fibers.number_left_hem,
                             self.fibers.number_right_hem)
            if self.fibers_per_hemisphere is not None:
                if self.fibers_per_hemisphere <= num_fibers:
                    num_fibers = self.fibers_per_hemisphere
                else:
                    raise Exception(
                        "Fibers per hemisphere is set too high for the dataset. Current subject maximum is"
                        + str(num_fibers))

            # grab num_fibers fibers from each hemisphere.
            # use the first n since they were randomly sampled from the whole dataset
            selected_right = self.fibers.index_right_hem[0:num_fibers]
            selected_left = self.fibers.index_left_hem[0:num_fibers]
            mask = numpy.zeros(input_vtk_polydata.GetNumberOfLines())
            mask[selected_right] = 1
            mask[selected_left] = 1
            # go back to the input data and use just those fibers
            input_vtk_polydata = filter.mask(input_vtk_polydata, mask)
            # Now convert to array with points and hemispheres as above
            self.fibers.convert_from_polydata(input_vtk_polydata)
            if self.verbose:
                print "<laterality.py> Using ", num_fibers, " fibers per hemisphere."

        # square sigma for later Gaussian
        sigmasq = self.sigma * self.sigma

        # allocate outputs
        nf = self.fibers.number_of_fibers
        laterality_index = numpy.zeros(nf)
        right_hem_total = numpy.zeros(nf)
        left_hem_total = numpy.zeros(nf)
        #right_hem_distance = numpy.zeros([nf, nf])
        #left_hem_distance = numpy.zeros([nf, nf])

        # grab all fibers from each hemisphere
        fiber_array_right = self.fibers.get_fibers(self.fibers.index_right_hem)
        fiber_array_left = self.fibers.get_fibers(self.fibers.index_left_hem)

        # tell user we are doing something
        if self.verbose:
            print "<laterality.py> Fibers in each hemisphere.", \
                "L:", self.fibers.number_left_hem, \
                "R:", self.fibers.number_right_hem, \
                "/ Total:", self.fibers.number_of_fibers
            print "<laterality.py> Starting to compute laterality indices"

        # run the computation, either in parallel or not
        if (USE_PARALLEL & (self.parallel_jobs > 1)):
            if self.verbose:
                print "<laterality.py> Starting parallel code. Processes:", \
                    self.parallel_jobs

            # compare to right hemisphere (reflect fiber first if in left hem)
            ret = Parallel(
                n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                    delayed(similarity.total_similarity_for_laterality)(
                        self.fibers.get_fiber(lidx), fiber_array_right,
                        self.fibers.is_left_hem[lidx], self.threshold, sigmasq)
                    for lidx in self.fibers.index_hem)

            #ret = zip(*ret)
            right_hem_total[self.fibers.index_hem] = ret
            #right_hem_distance = ret[1]

            # compare to left hemisphere (reflect fiber first if in right hem)
            ret = Parallel(
                n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                    delayed(similarity.total_similarity_for_laterality)
                    (self.fibers.get_fiber(lidx), fiber_array_left,
                     self.fibers.is_right_hem[lidx], self.threshold, sigmasq)
                    for lidx in self.fibers.index_hem)
            #ret = zip(*ret)
            left_hem_total[self.fibers.index_hem] = ret
            #left_hem_distance = ret[1]

        else:
            right_hem_distance = numpy.zeros(
                [nf, len(self.fibers.index_right_hem)])
            left_hem_distance = numpy.zeros(
                [nf, len(self.fibers.index_left_hem)])

            # compare to right hemisphere (reflect fiber first if in left hem)
            for lidx in self.fibers.index_hem:
                ret = similarity.total_similarity_for_laterality(
                    self.fibers.get_fiber(lidx), fiber_array_right,
                    self.fibers.is_left_hem[lidx], self.threshold, sigmasq)
                right_hem_total[lidx] = ret
                #right_hem_total[lidx] = ret[0]
                #right_hem_distance[lidx,:] = ret[1]

            # compare to left hemisphere (reflect fiber first if in right hem)
            for lidx in self.fibers.index_hem:
                ret = similarity.total_similarity_for_laterality(
                    self.fibers.get_fiber(lidx), fiber_array_left,
                    self.fibers.is_right_hem[lidx], self.threshold, sigmasq)
                left_hem_total[lidx] = ret
                #left_hem_distance[lidx,:] = ret[1]

        laterality_index = compute_laterality_index(left_hem_total,
                                                    right_hem_total,
                                                    self.fibers.index_hem)

        # output the LI as cell data in the polydata
        # for visualization and/or further analyses
        cell_data = vtk.vtkFloatArray()
        cell_data.SetName('Laterality')
        for lidx in range(0, self.fibers.number_of_fibers):
            cell_data.InsertNextTuple1(laterality_index[lidx])
            input_vtk_polydata.GetCellData().SetScalars(cell_data)

        # output everything
        results = LateralityResults()
        results.laterality_index = laterality_index
        results.polydata = input_vtk_polydata
        #results.right_hem_distance = right_hem_distance
        #results.left_hem_distance = left_hem_distance
        results.sigma = self.sigma
        results.points_per_fiber = self.points_per_fiber
        results.threshold = self.threshold
        results.left_hem_similarity = left_hem_total
        results.right_hem_similarity = right_hem_total
        results.hemisphere = self.fibers.fiber_hemisphere
        return results
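
compute_laterality_index itself is not included in this snippet. A minimal sketch of one common laterality-index definition, (R - L) / (R + L) per fiber, which may differ in detail from the project's implementation:

import numpy

def laterality_index_sketch(left_total, right_total, index_hem):
    # left_total / right_total: per-fiber total similarity to each hemisphere
    # index_hem: indices of fibers assigned to a hemisphere
    li = numpy.zeros(len(left_total))
    idx = numpy.asarray(index_hem)
    R = numpy.asarray(right_total)[idx]
    L = numpy.asarray(left_total)[idx]
    denom = R + L
    ok = denom > 0                        # avoid division by zero for fibers with no similarity
    li[idx[ok]] = (R[ok] - L[ok]) / denom[ok]
    return li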
Example #5
    def compute(self, input_vtk_polydata):
        """ Actually calculate the laterality index for every input
        fiber.

        Input polydata is required. This polydata is modified by
        adding a cell data array containing laterality indices.

        Output from this class is a struct: <io.py> class
        LateralityResults

        Parameters in the class can also be modified for experiments:
        sigma (in Gaussian on inter-fiber-point distance),
        points_per_fiber (for parameterization), threshold (below
        which inter-fiber-point distance is set to 0).

        Performance options are the number of parallel_jobs, and
        verbose (whether to print progress).

        """

        # internal representation for fast similarity computation
        # this also detects which hemisphere fibers are in
        self.fibers.points_per_fiber = self.points_per_fiber
        # must request hemisphere computation from object
        self.fibers.hemispheres = True
        # Now convert to array with points and hemispheres as above
        self.fibers.convert_from_polydata(input_vtk_polydata)
        
        # get the same number from each hemisphere if requested
        # -------------------------
        if self.use_equal_fibers:
            num_fibers = min(self.fibers.number_left_hem, self.fibers.number_right_hem)        
            if self.fibers_per_hemisphere is not None:
                if self.fibers_per_hemisphere <= num_fibers:
                    num_fibers = self.fibers_per_hemisphere
                else:
                    raise Exception("Fibers per hemisphere is set too high for the dataset. Current subject maximum is"+str(num_fibers))
        
            # grab num_fibers fibers from each hemisphere.
            # use the first n since they were randomly sampled from the whole dataset
            selected_right = self.fibers.index_right_hem[0:num_fibers]
            selected_left = self.fibers.index_left_hem[0:num_fibers]
            mask = numpy.zeros(input_vtk_polydata.GetNumberOfLines())
            mask[selected_right] = 1
            mask[selected_left] = 1
            # go back to the input data and use just those fibers
            input_vtk_polydata = filter.mask(input_vtk_polydata, mask)
            # Now convert to array with points and hemispheres as above
            self.fibers.convert_from_polydata(input_vtk_polydata)
            if self.verbose:
                print "<laterality.py> Using ", num_fibers , " fibers per hemisphere."
                
        # square sigma for later Gaussian
        sigmasq = self.sigma * self.sigma

        # allocate outputs
        nf = self.fibers.number_of_fibers
        laterality_index = numpy.zeros(nf)
        right_hem_total = numpy.zeros(nf)
        left_hem_total = numpy.zeros(nf)
        #right_hem_distance = numpy.zeros([nf, nf])
        #left_hem_distance = numpy.zeros([nf, nf])


        # grab all fibers from each hemisphere
        fiber_array_right = self.fibers.get_fibers(self.fibers.index_right_hem)
        fiber_array_left = self.fibers.get_fibers(self.fibers.index_left_hem)

        # tell user we are doing something
        if self.verbose:
            print "<laterality.py> Fibers in each hemisphere.", \
                "L:", self.fibers.number_left_hem, \
                "R:", self.fibers.number_right_hem, \
                "/ Total:", self.fibers.number_of_fibers
            print "<laterality.py> Starting to compute laterality indices"

        # run the computation, either in parallel or not
        if (USE_PARALLEL & (self.parallel_jobs > 1)):
            if self.verbose:
                print "<laterality.py> Starting parallel code. Processes:", \
                    self.parallel_jobs

            # compare to right hemisphere (reflect fiber first if in left hem)
            ret = Parallel(
                n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                delayed(similarity.total_similarity_for_laterality)(
                    self.fibers.get_fiber(lidx),
                    fiber_array_right,
                    self.fibers.is_left_hem[lidx],
                    self.threshold,
                    sigmasq)
                for lidx in self.fibers.index_hem)

            #ret = zip(*ret)
            right_hem_total[self.fibers.index_hem] = ret
            #right_hem_distance = ret[1]

            # compare to left hemisphere (reflect fiber first if in right hem)
            ret = Parallel(
                n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                delayed(similarity.total_similarity_for_laterality)(
                    self.fibers.get_fiber(lidx),
                    fiber_array_left,
                    self.fibers.is_right_hem[lidx],
                    self.threshold,
                    sigmasq)
                for lidx in self.fibers.index_hem)
            #ret = zip(*ret)
            left_hem_total[self.fibers.index_hem] = ret
            #left_hem_distance = ret[1]

        else:
            right_hem_distance = numpy.zeros([nf, len(self.fibers.index_right_hem)])
            left_hem_distance = numpy.zeros([nf, len(self.fibers.index_left_hem)])

            # compare to right hemisphere (reflect fiber first if in left hem)
            for lidx in self.fibers.index_hem:
                ret = similarity.total_similarity_for_laterality(
                    self.fibers.get_fiber(lidx),
                    fiber_array_right,
                    self.fibers.is_left_hem[lidx],
                    self.threshold,
                    sigmasq)
                right_hem_total[lidx] = ret
                #right_hem_total[lidx] = ret[0]
                #right_hem_distance[lidx,:] = ret[1]

            # compare to left hemisphere (reflect fiber first if in right hem)
            for lidx in self.fibers.index_hem:
                ret = similarity.total_similarity_for_laterality(
                    self.fibers.get_fiber(lidx),
                    fiber_array_left,
                    self.fibers.is_right_hem[lidx],
                    self.threshold,
                    sigmasq)
                left_hem_total[lidx] = ret
                #left_hem_distance[lidx,:] = ret[1]

        laterality_index = compute_laterality_index(left_hem_total,
                                                    right_hem_total,
                                                    self.fibers.index_hem)


        # output the LI as cell data in the polydata
        # for visualization and/or further analyses
        cell_data = vtk.vtkFloatArray()
        cell_data.SetName('Laterality')
        for lidx in range(0, self.fibers.number_of_fibers):
            cell_data.InsertNextTuple1(laterality_index[lidx])
            input_vtk_polydata.GetCellData().SetScalars(cell_data)

        # output everything
        results = LateralityResults()
        results.laterality_index = laterality_index
        results.polydata = input_vtk_polydata
        #results.right_hem_distance = right_hem_distance
        #results.left_hem_distance = left_hem_distance
        results.sigma = self.sigma
        results.points_per_fiber = self.points_per_fiber
        results.threshold = self.threshold
        results.left_hem_similarity = left_hem_total
        results.right_hem_similarity = right_hem_total
        results.hemisphere = self.fibers.fiber_hemisphere
        return results
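
The fiber similarity helper (similarity.total_similarity_for_laterality) is not part of this snippet. A minimal sketch of the kind of thresholded Gaussian similarity the docstring describes, with illustrative names only:

import numpy

def thresholded_gaussian_similarity_sketch(distances, threshold, sigmasq):
    # distances: inter-fiber-point distances; values below threshold are set to 0
    d = numpy.asarray(distances, dtype=float)
    d[d < threshold] = 0.0
    # Gaussian on distance with width sigma (sigmasq = sigma * sigma, as in compute() above)
    return numpy.exp(numpy.divide(-d * d, sigmasq))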
Example #6
def output_and_quality_control_cluster_atlas(atlas, output_polydata_s, subject_fiber_list,
                                             input_polydatas, number_of_subjects, outdir,
                                             cluster_numbers_s, color, embed,
                                             number_of_fibers_to_display, testing=False,
                                             verbose=False, render_images=True):

    """Save the output in our atlas format for automatic labeling of clusters.

    First save the atlas.vtp and atlas.p datasets. This is the data used to
    label a new subject.  Also write the polydata with cluster indices
    saved as cell data. This is a one-file output option for clusters.
    Finally, save some quality control metrics and save the atlas
    clusters as individual polydatas. This is used to set up a mrml
    hierarchy file and to visualize the output in Slicer. This data is
    not used to label a new subject.
    """

    # Write additional output for software testing if code has changed (requires random seed to be constant)
    if testing:
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_numbers.pkl')
        pickle.dump(cluster_numbers_s, open(expected_results_file, 'wb'))
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_colors.pkl')
        pickle.dump(color, open(expected_results_file, 'wb'))
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_embeddings.pkl')
        pickle.dump(embed, open(expected_results_file, 'wb'))
         
    # Save the output in our atlas format for automatic labeling of full brain datasets.
    # This is the data used to label a new subject
    atlas.save(outdir,'atlas')

    # Write the polydata with cluster indices saved as cell data
    fname_output = os.path.join(outdir, 'clustered_whole_brain.vtp')
    io.write_polydata(output_polydata_s, fname_output)

    # output summary file to save information about all subjects
    subjects_qc_fname = os.path.join(outdir, 'input_subjects.txt')
    subjects_qc_file = open(subjects_qc_fname, 'w')
    outstr = "Subject_idx\tSubject_ID\tfilename\n"
    subjects_qc_file.write(outstr)
    idx = 1
    for fname in input_polydatas:
        subject_id = os.path.splitext(os.path.basename(fname))[0]
        outstr =  str(idx) + '\t' + str(subject_id) + '\t' + str(fname) + '\n'
        subjects_qc_file.write(outstr)
        idx += 1
    subjects_qc_file.close()

    # output summary file to save information about all clusters
    clusters_qc_fname = os.path.join(outdir, 'cluster_quality_control.txt')
    clusters_qc_file = open(clusters_qc_fname, 'w')

    # Figure out how many subjects in each cluster (ideally, most subjects in most clusters)
    subjects_per_cluster = list()
    percent_subjects_per_cluster = list()
    fibers_per_cluster = list()
    mean_fiber_len_per_cluster = list()
    std_fiber_len_per_cluster = list()
    mean_fibers_per_subject_per_cluster = list()
    std_fibers_per_subject_per_cluster = list()

    # find out length of each fiber
    fiber_length, step_size = filter.compute_lengths(output_polydata_s)

    # loop over each cluster and compute quality control metrics
    cluster_indices = range(atlas.centroids.shape[0])
    for cidx in cluster_indices:
        cluster_mask = (cluster_numbers_s==cidx) 
        subjects_per_cluster.append(len(set(subject_fiber_list[cluster_mask])))
        fibers_per_subject = list()
        for sidx in range(number_of_subjects):
            fibers_per_subject.append(list(subject_fiber_list[cluster_mask]).count(sidx))
        mean_fibers_per_subject_per_cluster.append(numpy.mean(numpy.array(fibers_per_subject)))
        std_fibers_per_subject_per_cluster.append(numpy.std(numpy.array(fibers_per_subject)))
        mean_fiber_len_per_cluster.append(numpy.mean(fiber_length[cluster_mask]))
        std_fiber_len_per_cluster.append(numpy.std(fiber_length[cluster_mask]))

    percent_subjects_per_cluster = numpy.divide(numpy.array(subjects_per_cluster),float(number_of_subjects))

    # Save output quality control information
    print "<cluster.py> Saving cluster quality control information file."
    clusters_qc_file = open(clusters_qc_fname, 'w')
    print >> clusters_qc_file, 'cluster_idx','\t', 'number_subjects','\t', 'percent_subjects','\t', 'mean_length','\t', 'std_length','\t', 'mean_fibers_per_subject','\t', 'std_fibers_per_subject'
    for cidx in cluster_indices:
        print >> clusters_qc_file, cidx + 1,'\t', subjects_per_cluster[cidx],'\t', percent_subjects_per_cluster[cidx] * 100.0,'\t', \
            mean_fiber_len_per_cluster[cidx],'\t', std_fiber_len_per_cluster[cidx],'\t', \
            mean_fibers_per_subject_per_cluster[cidx],'\t', std_fibers_per_subject_per_cluster[cidx]

    clusters_qc_file.close()

    if HAVE_PLT:
        print "<cluster.py> Saving subjects per cluster histogram."
        fig, ax = plt.subplots()
        counts = numpy.zeros(number_of_subjects + 1)
        counts[:numpy.max(subjects_per_cluster)+1] = numpy.bincount(subjects_per_cluster)
        ax.bar(range(number_of_subjects + 1), counts, width=1, align='center')
        ax.set(xlim=[-1, number_of_subjects + 1])
        plt.title('Histogram of Subjects per Cluster')
        plt.xlabel('subjects per cluster')
        plt.ylabel('number of clusters')
        plt.savefig( os.path.join(outdir, 'subjects_per_cluster_hist.pdf'))
        plt.close()
        
    # Save the entire combined atlas as individual clusters for visualization
    # and labeling/naming of structures. This will include all of the data
    # that was clustered to make the atlas.

    # Figure out file name and mean color for each cluster, and write the individual polydatas
    print "<cluster.py> Beginning to save individual clusters as polydata files. TOTAL CLUSTERS:", len(cluster_indices),
    fnames = list()
    cluster_colors = list()
    cluster_sizes = list()
    cluster_fnames = list()
    for c in cluster_indices:
        print c,
        mask = cluster_numbers_s == c
        cluster_size = numpy.sum(mask)
        cluster_sizes.append(cluster_size)
        pd_c = filter.mask(output_polydata_s, mask,verbose=verbose)
        # color by subject so we can see which one it came from
        filter.add_point_data_array(pd_c, subject_fiber_list[mask], "Subject_ID")
        # Save hemisphere information into the polydata
        farray = fibers.FiberArray()
        farray.hemispheres = True
        farray.hemisphere_percent_threshold = 0.90
        farray.convert_from_polydata(pd_c, points_per_fiber=50)
        filter.add_point_data_array(pd_c, farray.fiber_hemisphere, "Hemisphere")
        # The clusters are stored starting with 1, not 0, for user friendliness.
        fname_c = 'cluster_{0:05d}.vtp'.format(c+1)
        # save the filename for writing into the MRML file
        fnames.append(fname_c)
        # prepend the output directory
        fname_c = os.path.join(outdir, fname_c)
        #print fname_c
        io.write_polydata(pd_c, fname_c)
        cluster_fnames.append(fname_c)
        if cluster_size > 0:
            color_c = color[mask,:]
            cluster_colors.append(numpy.mean(color_c,0))
        else:
            cluster_colors.append([0,0,0])
        del pd_c
    print "\n<cluster.py> Finishes saving individual clusters as polydata files."

    # Notify user if some clusters empty
    empty_count = 0
    for sz, fname in zip(cluster_sizes,cluster_fnames):
        if sz == 0:
            print sz, ":", fname
            empty_count += 1
    if empty_count:
        print "<cluster.py> Warning. Empty clusters found:", empty_count

    cluster_sizes = numpy.array(cluster_sizes)
    print "<cluster.py> Mean number of fibers per cluster:", numpy.mean(cluster_sizes), "Range:", numpy.min(cluster_sizes), "..", numpy.max(cluster_sizes)

    # Estimate subsampling ratio to display approximately number_of_fibers_to_display total fibers in 3D Slicer
    number_fibers = len(cluster_numbers_s)
    if number_fibers < number_of_fibers_to_display:
        ratio = 1.0
    else:
        ratio = number_of_fibers_to_display / float(number_fibers)
    print "<cluster.py> Subsampling ratio for display of", number_of_fibers_to_display, "total fibers estimated as:", ratio

    # Write the MRML file into the directory where the polydatas were already stored
    fname = os.path.join(outdir, 'clustered_tracts.mrml')
    mrml.write(fnames, numpy.around(numpy.array(cluster_colors), decimals=3), fname, ratio=ratio)

    # Also write one with 100% of fibers displayed
    fname = os.path.join(outdir, 'clustered_tracts_display_100_percent.mrml')
    mrml.write(fnames, numpy.around(numpy.array(cluster_colors), decimals=3), fname, ratio=1.0)
    
    # View the whole thing in jpg format for quality control
    if render_images:
        print '<cluster.py> Rendering and saving images of cluster atlas.'
        ren = render.render(output_polydata_s, 1000, data_mode='Cell', data_name='EmbeddingColor', verbose=verbose)
        ren.save_views(outdir, verbose=verbose)
        del ren
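
A small self-contained illustration of the per-cluster quality-control counts computed above (subjects per cluster and fibers per subject per cluster), on toy arrays rather than real clustering output:

import numpy

# toy assignments: cluster id and originating subject for 10 fibers
cluster_numbers_s = numpy.array([0, 0, 1, 1, 1, 2, 2, 0, 1, 2])
subject_fiber_list = numpy.array([0, 1, 0, 0, 1, 1, 1, 0, 2, 2])
number_of_subjects = 3

for cidx in range(3):
    cluster_mask = cluster_numbers_s == cidx
    subjects_here = subject_fiber_list[cluster_mask]
    n_subjects_in_cluster = len(set(subjects_here))
    fibers_per_subject = [numpy.sum(subjects_here == s) for s in range(number_of_subjects)]
    # e.g. cluster 0 holds fibers from 2 of the 3 subjects, with counts [2, 1, 0]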
Example #7
def spectral(input_polydata, number_of_clusters=200,
             number_of_eigenvectors=20, sigma=60, threshold=0.0,
             number_of_jobs=3, use_nystrom=False, nystrom_mask=None,
             landmarks=None, distance_method='Mean', normalized_cuts=True,
             outlier_std_threshold = 2.0,
             pos_def_approx=True,
             bilateral=False):

    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007.

    Differences from that implementation: fiber point correspondences are defined
    using fixed-length fiber parameterization (instead of closest point).

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas() 

    # Store all parameters to this function. They must be identical later to label new data.
    # Below, calculated values will also be stored in the atlas.
    atlas.number_of_eigenvectors = number_of_eigenvectors
    atlas.sigma = sigma
    atlas.threshold = threshold
    atlas.use_nystrom = use_nystrom
    atlas.landmarks = landmarks
    atlas.distance_method = distance_method
    atlas.bilateral = bilateral

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask is smaller than 100."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError
        
        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask, verbose=False)
        atlas.nystrom_polydata = polydata_m
        polydata_n = filter.mask(input_polydata, nystrom_mask == False, verbose=False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:',  sz, '/', number_fibers
        # Determine ordering to get embedding to correspond to original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0], numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask,:,:]
            landmarks_n = landmarks[~nystrom_mask,:,:]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m, distance_method, bilateral)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs, landmarks_n, landmarks_m, distance_method, bilateral)

        # sanity check
        print "<cluster.py> Range of values in A:", numpy.min(A), numpy.max(A)
        print "<cluster.py> Range of values in B:", numpy.min(B), numpy.max(B)
        
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                    sigma, number_of_jobs, landmarks, distance_method, bilateral)

        atlas.nystrom_polydata = input_polydata
        # sanity check
        print "<cluster.py> Range of values in A:", numpy.min(A), numpy.max(A)
        
    testval = numpy.max(A-A.T) 
    if not testval == 0.0:
        if testval > 1e-10:
            print "<cluster.py> ERROR: A matrix is not symmetric."
            raise AssertionError
        else:
            print "<cluster.py> Maximum of A - A^T:", testval
        # Ensure that A is symmetric
        A = numpy.divide(A+A.T, 2.0)
        
    testval = numpy.min(A)
    if not testval > 0.0:
        print "<cluster.py> ERROR: A matrix is not positive."
        print "<cluster.py> Minimum value in A: ", testval
        if testval < 0.0:
            raise AssertionError

    # Outliers will have low measured (or estimated) row sums. Detect outliers in A:
    # to turn off for testing: outlier_std_threshold = numpy.inf
    row_sum_A_initial = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
    print "<cluster.py> Initial similarity (row) sum A:", numpy.mean(row_sum_A_initial), numpy.std(row_sum_A_initial), numpy.min(row_sum_A_initial)
    atlas.outlier_std_threshold = outlier_std_threshold
    atlas.row_sum_threshold_for_rejection = numpy.mean(row_sum_A_initial) - outlier_std_threshold*numpy.std(row_sum_A_initial)
    bad_idx = numpy.nonzero(row_sum_A_initial < atlas.row_sum_threshold_for_rejection)[0]
    reject_A = bad_idx
    print "<cluster.py> Rejecting n=", len(bad_idx), "/", sz, "fibers >", outlier_std_threshold, "standard deviations below the mean total fiber similarity"

    A = numpy.delete(A,reject_A,0)
    A = numpy.delete(A,reject_A,1)
    #print A.shape, B.shape
    B = numpy.delete(B,reject_A,0)
    #print A.shape, B.shape, reorder_embedding.shape
                    
    # Ensure that A is positive definite.
    if pos_def_approx:
        e_val, e_vec = numpy.linalg.eigh(A)
        print "<cluster.py> Eigenvalue range of A:", e_val[0], e_val[-1]
        A2 = nearPSD(A)
        e_val, e_vec = numpy.linalg.eigh(A2)
        print "<cluster.py> Eigenvalue range of nearest PSD matrix to A:", e_val[0], e_val[-1]  
        testval = numpy.max(A-A2) 
        if not testval == 0.0:
            print "<cluster.py> A matrix differs by PSD matrix by maximum of:", testval
            if testval > 0.25:
                print "<cluster.py> ERROR: A matrix changed by more than 0.25."
                raise AssertionError
        A = A2
        
    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if normalized_cuts:
        if use_nystrom:
            # Form of entire affinity matrix: 
            # A   B
            # B^T   C
            # C is not computed.
            # Calculate the sum of the partial rows we've computed:
            atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)  
            #print "<cluster.py> A size:", A.shape
            #print "<cluster.py> B size:", B.shape
            #print "<cluster.py> A-B matrix row sums range (should be > 0):", numpy.min(atlas.row_sum_1), numpy.max(atlas.row_sum_1)
            
            # Approximate the sum of the rest of the data (including C)
            # These are weighted sums of the columns we did compute
            # where the weight depends on how similar that fiber 
            # was to each path in A.  This uses the dual basis
            # of the columns in A.
            # Approximate the inverse of A for dual basis
            #print "<cluster.py> Using numpy linalg pinv A"
            atlas.pinv_A = numpy.linalg.pinv(A)

            #e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
            #print "<cluster.py> test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]  

            # row sum formula:
            # dhat = [a_r + b_r; b_c + B^T*A-1*b_r]
            # this matrix is A^-1 * b_r, where b_r are the row sums of B
            # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
            atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
            #test = numpy.sum(B.T, axis=0)
            #print "<cluster.py> B column sums range (should be > 0):", numpy.min(test), numpy.max(test)
            print "<cluster.py> Range of row sum weights:", numpy.min(atlas.row_sum_matrix), numpy.max(atlas.row_sum_matrix)
            #print "<cluster.py> First 10 entries in weight matrix:", atlas.row_sum_matrix[0:10]
            #test = numpy.dot(atlas.row_sum_matrix, B)
            #print "<cluster.py> Test partial sum estimation for B:", numpy.min(test), numpy.max(test)
            #del test
            
            # row sum estimate for current B part of the matrix
            row_sum_2 = numpy.sum(B, axis=0) + \
                numpy.dot(atlas.row_sum_matrix, B)
            print "<cluster.py> Row sum check (min/max, should be > 0) A:", numpy.min(atlas.row_sum_1), numpy.median(atlas.row_sum_1), numpy.max(atlas.row_sum_1),  "B:", numpy.min(row_sum_2), numpy.median(row_sum_2), numpy.max(row_sum_2)

            # reject outliers in B
            bad_idx = numpy.nonzero(row_sum_2 < atlas.row_sum_threshold_for_rejection)[0]
            reject_B = bad_idx
            print "<cluster.py> Rejecting n=", len(bad_idx), "/", B.shape[1], "fibers >", outlier_std_threshold, "standard deviations below the mean total fiber similarity"
            row_sum_2 = numpy.delete(row_sum_2,reject_B)
            B = numpy.delete(B,reject_B,1)

            print "<cluster.py> After outlier rejection A:", A.shape, "B:", B.shape
            print "<cluster.py> Row sum check (min/max, should be > 0) A:", numpy.min(atlas.row_sum_1), numpy.median(atlas.row_sum_1), numpy.max(atlas.row_sum_1),  "B:", numpy.min(row_sum_2), numpy.median(row_sum_2), numpy.max(row_sum_2)

            # Separate the Nystrom sample and the rest of the data after removing outliers.
            # Work on a copy so the caller's nystrom_mask is not modified in place.
            nystrom_mask_2 = numpy.copy(nystrom_mask)
            midxA = numpy.nonzero(nystrom_mask_2)[0]
            nystrom_mask_2[midxA[reject_A]] = False
            not_nystrom_mask = nystrom_mask == False
            not_nystrom_mask[midxA[reject_A]] = False
            midxB = numpy.nonzero(not_nystrom_mask)[0]
            not_nystrom_mask[midxB[reject_B]] = False

            polydata_m = filter.mask(input_polydata, nystrom_mask_2, verbose=False)
            atlas.nystrom_polydata = polydata_m
            polydata_n = filter.mask(input_polydata, not_nystrom_mask, verbose=False)
            output_polydata = filter.mask(input_polydata, numpy.add(nystrom_mask_2, not_nystrom_mask),verbose=False)
            sz = polydata_m.GetNumberOfLines()
            number_fibers = output_polydata.GetNumberOfLines()
            print '<cluster.py> Using Nystrom approximation. Subset size (A):',  sz, '/', number_fibers, "B:", polydata_n.GetNumberOfLines()
            # Determine ordering to get embedding to correspond to original input data.
            # reject outliers from masks
            reject_idx = numpy.concatenate((midxA[reject_A],midxB[reject_B]))
            nystrom_mask_2 = numpy.delete(nystrom_mask_2,reject_idx)
            not_nystrom_mask = numpy.delete(not_nystrom_mask,reject_idx)
            #print "hi after mask:", reorder_embedding.shape, numpy.sum(nystrom_mask_2), numpy.sum(not_nystrom_mask)
            reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask_2)[0], numpy.where(not_nystrom_mask)[0]))
            #print "hi after embed reorder calc:", reorder_embedding.shape, numpy.max(reorder_embedding), numpy.min(reorder_embedding)
           
            # Warn in case of non-positive row sum estimates.
            if any(row_sum_2 <= 0):
                print "<cluster.py> Warning: negative or zero row sum approximations. Consider increasing sigma or using the mean fiber distance."
                print "<cluster.py> Number of non-positive row sums:", numpy.count_nonzero(row_sum_2 <= 0)
                #row_sum_2[row_sum_2<0] = 0.1

            # save for testing
            column_sum = numpy.concatenate((numpy.sum(A, axis=1) , numpy.sum(B.T, axis=1)))

            # normalized cuts normalization
            row_sum = numpy.concatenate((atlas.row_sum_1, row_sum_2))
            dhat = numpy.sqrt(numpy.divide(1, row_sum))
            #dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))
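            # Optional diagnostic (disabled by default): non-positive row sums above
            # yield NaN/Inf entries in dhat, which then propagate into A and B.
            if 0:
                print "<cluster.py> Non-finite entries in dhat:", \
                    numpy.count_nonzero(~numpy.isfinite(dhat))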

            A = \
                numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
            B = \
                numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))

        else:
            # normalized cuts normalization using row (same as column) sums
            row_sum = numpy.sum(A, axis=0)
            print "<cluster.py> A matrix row sums range (should be > 0):", numpy.min(row_sum), numpy.max(row_sum)
            dhat = numpy.divide(1, numpy.sqrt(row_sum))
            A = \
                numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]    
    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print '<cluster.py> Computing embedding using eigenvectors.'
    if use_nystrom:
        # Create embedding vectors using nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2) 
        # See the paper:
        # "Spectral Grouping Using the Nystrom Method"
        # Essentially this adds in the extra (unsampled) fibers by projecting
        # them onto the eigenvector basis computed from the sample:
        # A = U L U' => U = A U L^-1, so the new rows of the extended matrix (B')
        # are treated the same way: U_ext = [A U L^-1 ; B' U L^-1] = [U ; B' U L^-1]
        # Note: the embedding below is divided by the first eigenvector rather
        # than by the square root of the row sums.

        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1]
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
                                  numpy.dot(numpy.dot(B.T, atlas.e_vec), \
                                                numpy.diag(numpy.divide(1.0, atlas.e_val)))))
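        # Optional shape check (disabled by default): the extended eigenvector matrix
        # stacks one row per sampled fiber (from A) on top of one row per remaining
        # fiber (from B).
        if 0:
            assert V.shape[0] == A.shape[0] + B.shape[1]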

        # Normalize the estimated eigenvectors to have length of one.
        # matlab was:
        # atlas.eigenvectorLengthToNormalize=sqrt(sum(V.*V));
        # V=V./repmat(atlas.eigenvectorLengthToNormalize,length(V),1);
        # Note the square root, matching the MATLAB reference above (dividing by
        # the squared length would not give unit-length columns).
        atlas.e_vec_norm = numpy.sqrt(numpy.sum(numpy.multiply(V, V), 0))
        V = numpy.divide(V, atlas.e_vec_norm)
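        # Optional check (disabled by default): each column of V should now have
        # unit Euclidean length (this assumes the square-root normalization above).
        if 0:
            assert numpy.allclose(numpy.sqrt(numpy.sum(numpy.multiply(V, V), 0)), 1.0)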

        # Normalize each embedding vector by the first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #    embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since the normalized row sums are 1.
        # The other option from the literature is embedding_{i,j} = V_{i,j+1} / sqrt(D_{i,i}).
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        #print "Hi 3:", embed.shape, number_fibers, reorder_embedding.shape, V.shape, A.shape, B.shape
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding,i] = numpy.divide(V[:,-(i+2)], V[:,-1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse order of embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]
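    # Optional sanity check (disabled by default): the embedding should be finite;
    # NaNs here usually indicate zeros in the first eigenvector or negative row
    # sums upstream.
    if 0:
        assert numpy.all(numpy.isfinite(embed))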


    # Default is always k-means. Other code is just left for testing. Did not improve results.
    #centroid_finder = 'AffinityPropagation'
    centroid_finder = 'K-means'
    
    # 5) Find clusters using k-means in embedding space.
    cluster_metric = None
    if centroid_finder == 'K-means':
        print '<cluster.py> K-means clustering in embedding space.'
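        # Note: the second value returned by scipy.cluster.vq.kmeans2 is the label
        # assignment from its own run, not a distortion value; the final labels are
        # recomputed with scipy.cluster.vq.vq against the reordered centroids below.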
        centroids, cluster_metric = scipy.cluster.vq.kmeans2(embed, number_of_clusters, minit='points')
        # sort centroids by first eigenvector order
        # centroid_order = numpy.argsort(centroids[:,0])
        # sort centroids according to colormap and save them in this order in atlas
        color = _embed_to_rgb(centroids)
        centroid_order = render.argsort_by_jet_lookup_table(color)
        atlas.centroids = centroids[centroid_order,:]
        cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)
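        # Optional diagnostic (disabled by default): report cluster populations; empty
        # clusters can occur when number_of_clusters is large relative to the sample.
        if 0:
            counts = numpy.bincount(cluster_idx, minlength=atlas.centroids.shape[0])
            print "<cluster.py> Cluster size range:", numpy.min(counts), "-", numpy.max(counts)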
        #print "<cluster.py> Distortion metric:", cluster_metric
        if 0:
            # This is extremely slow, but leave code here if ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, cluster_idx, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)
 
    else:
        print "ERROR: Unknown centroid finder", centroid_finder
        ## # This found fewer clusters than we need to represent the anatomy well
        ## # Leave code here in case wanted in future for more testing.
        ## print '<cluster.py> Affinity Propagation clustering in embedding space.'
        ## af = AffinityPropagation(preference=-50).fit(embed)
        ## cluster_centers_indices = af.cluster_centers_indices_
        ## labels = af.labels_
        ## n_clusters_ = len(cluster_centers_indices)
        ## print('Estimated number of clusters: %d' % n_clusters_)
        ## cluster_idx = labels
        ## for k in range(n_clusters_):
        ##     class_members = labels == k
        ##     atlas.centroids = embed[cluster_centers_indices[k]]
        ## # return metrics
        ## if 0:
        ##     # This is extremely slow, but leave code here if ever wanted for testing
        ##     cluster_metric = metrics.silhouette_score(embed, labels, metric='sqeuclidean')
        ##     print("Silhouette Coefficient: %0.3f" % cluster_metric)

    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    #embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # Set up the output polydata with the clustering results by adding cluster and
    # embedding arrays via _format_output_polydata.
    if use_nystrom:
        # output_polydata was already built above, after outlier rejection.
        output_polydata = \
            _format_output_polydata(output_polydata, cluster_idx, color, embed, row_sum[reorder_embedding], column_sum[reorder_embedding])
    else:
        # No outlier rejection in this branch: annotate the input polydata directly.
        # Row and column sums are identical here, and no reordering is needed.
        output_polydata = input_polydata
        output_polydata = \
            _format_output_polydata(output_polydata, cluster_idx, color, embed, row_sum, row_sum)
            
    return output_polydata, cluster_idx, color, embed, cluster_metric, atlas, reject_idx
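
# A minimal usage sketch (not part of the original module, disabled by default):
# assumes a VTK polydata of fibers has already been loaded as input_pd (hypothetical
# name) and that numpy is the module-level import used above. The Nystrom mask marks
# a random sample of fibers to use as the A block.
if 0:
    number_fibers_in = input_pd.GetNumberOfLines()
    sample_mask = numpy.zeros(number_fibers_in, dtype=bool)
    sample_mask[numpy.random.permutation(number_fibers_in)[0:1500]] = True
    output_pd, cluster_idx, color, embed, cluster_metric, atlas, reject_idx = \
        spectral(input_pd, number_of_clusters=300, number_of_eigenvectors=10,
                 sigma=20, use_nystrom=True, nystrom_mask=sample_mask)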