def view_cluster_number(input_polydata, cluster_number, cluster_indices=None):
    """ Pop up a render window showing the selected cluster.

    Uses cluster_indices to choose the corresponding cells in the
    polydata. If no cluster_indices argument is provided, the cell
    data array named ClusterNumber is used instead. One of these two
    inputs must be present.

    """

    if cluster_indices is None:
        cluster_indices_vtk = \
            input_polydata.GetCellData().GetArray('ClusterNumber')
        cluster_indices = numpy.zeros(cluster_indices_vtk.GetNumberOfTuples())
        for fidx in range(0, cluster_indices_vtk.GetNumberOfTuples()):
            cluster_indices[fidx] = cluster_indices_vtk.GetTuple(fidx)[0]

    fiber_mask = cluster_indices == cluster_number
    view_polydata = filter.mask(input_polydata, fiber_mask)
    ren = render.render(view_polydata)

    return ren
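
# Example usage of view_cluster_number (a hedged sketch kept as a comment so it
# does not run on import; the file name is illustrative, and io.read_polydata is
# assumed to be the reader used elsewhere in this package):
#
#   pd = io.read_polydata('clustered_whole_brain.vtp')
#   # Show cluster 5 using the stored ClusterNumber cell data array:
#   ren = view_cluster_number(pd, 5)
#   # Or pass the cluster indices returned by spectral() directly:
#   ren = view_cluster_number(pd, 5, cluster_indices=cluster_idx)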
def spectral(input_polydata, number_of_clusters=300,
             number_of_eigenvectors=10, sigma=20, threshold=2,
             number_of_jobs=3, use_nystrom=False, nystrom_mask=None,
             landmarks=None):
    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007. Differences from that
    implementation: fiber distance is defined using fixed-length fiber
    parameterization.

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas()

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask selects fewer than 100 fibers."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError

        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask)
        atlas.nystrom_polydata = polydata_m
        atlas.threshold = threshold
        polydata_n = filter.mask(input_polydata, nystrom_mask == False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:', sz, '/', number_fibers
        # Determine ordering to get the embedding to correspond to the original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0],
                                               numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask, :, :]
            landmarks_n = landmarks[~nystrom_mask, :, :]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs,
                                           landmarks_n, landmarks_m)
        atlas.sigma = sigma
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                        sigma, number_of_jobs, landmarks)

    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if use_nystrom:
        # calculate the sum of the rows we know from the full matrix
        atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
        # Approximate the sums for the remaining rows. The weights for
        # approximating each row sum use the known columns of B'.
        # The pseudoinverse of A is needed for the atlas.
        atlas.pinv_A = numpy.linalg.pinv(A)
        e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
        print "test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]
        # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
        # this matrix is needed for the atlas.
        atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
        # row sum estimate for the current B part of the matrix
        row_sum_2 = numpy.sum(B, axis=0) + \
            numpy.dot(atlas.row_sum_matrix, B)
        print "row sum check", numpy.min(atlas.row_sum_1), \
            numpy.max(atlas.row_sum_1), numpy.min(row_sum_2), \
            numpy.max(row_sum_2)
        print "check 2:", numpy.min(numpy.sum(B.T, axis=0)), numpy.min(atlas.row_sum_matrix)

        # normalized cuts normalization
        dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))
        A = \
            numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
        B = \
            numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))
    else:
        # normalized cuts normalization using row (same as column) sums
        row_sum = numpy.sum(A, axis=0)
        dhat = numpy.divide(1, numpy.sqrt(row_sum))
        A = \
            numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]

    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print '<cluster.py> Compute embedding using eigenvectors.'
    if use_nystrom:
        # Create embedding vectors using the Nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2)
        # See the paper "Spectral Grouping Using the Nystrom Method".
        # Basically all this does is add in the extra measurements by
        # projecting them onto the original eigenvector basis.
        # A=UVU' => U = AUV^-1 => take new rows of the extended A (B') and do
        # the same. U' = [AUV^-1 ; B'UV^-1] = [U ; B'UV^-1]
        # Note they divide the embedding by the 1st eigenvector rather
        # than the sqrt of the row sum, as in this code (below).
        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1]
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
            numpy.dot(numpy.dot(B.T, atlas.e_vec), \
            numpy.diag(numpy.divide(1.0, atlas.e_val)))))

        # normalize estimated eigenvectors to have length of one
        # matlab was:
        # atlas.eigenvectorLengthToNormalize = sqrt(sum(V.*V));
        # V = V./repmat(atlas.eigenvectorLengthToNormalize, length(V), 1);
        atlas.e_vec_norm = numpy.sum(numpy.multiply(V, V), 0)
        V = numpy.divide(V, atlas.e_vec_norm)

        # Normalize each embedding vector by the first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #     embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since row sums are 1.
        # The other option from the literature was to use this:
        # embedding_i,j = V_i+1,j./sqrt(D_j,j)
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding, i] = numpy.divide(V[:, -(i+2)], V[:, -1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse the order of the embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]

    atlas.number_of_eigenvectors = number_of_eigenvectors

    # 5) Find clusters using k-means in embedding space.
    print '<cluster.py> K-means clustering in embedding space.'
    atlas.centroids, distortion = scipy.cluster.vq.kmeans(embed, number_of_clusters)
    cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)

    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # set up polydata with clustering output info.
    # for now modify the input polydata by adding two arrays
    output_polydata = input_polydata
    output_polydata = \
        _format_output_polydata(output_polydata, cluster_idx, color, embed)

    return output_polydata, cluster_idx, color, embed, distortion, atlas
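
# The function below is a small self-contained numpy sketch (illustration only,
# not called by the pipeline) of the normalized cuts normalization applied in
# step 2 above: given a symmetric affinity matrix W with row sums d, it forms
# D^(-1/2) W D^(-1/2) with the same outer-product expression used in spectral(),
# and checks it against the direct matrix-product form. All names are local to
# the demo.
def _demo_normalized_cuts_normalization():
    import numpy
    # toy symmetric affinity matrix for 4 items
    W = numpy.array([[1.0, 0.8, 0.1, 0.0],
                     [0.8, 1.0, 0.2, 0.1],
                     [0.1, 0.2, 1.0, 0.9],
                     [0.0, 0.1, 0.9, 1.0]])
    row_sum = numpy.sum(W, axis=0)
    dhat = numpy.divide(1.0, numpy.sqrt(row_sum))
    # outer-product normalization, as in spectral() above
    W_norm = numpy.multiply(W, numpy.outer(dhat, dhat))
    # equivalent direct form: D^(-1/2) W D^(-1/2)
    D_inv_sqrt = numpy.diag(dhat)
    W_direct = numpy.dot(numpy.dot(D_inv_sqrt, W), D_inv_sqrt)
    assert numpy.allclose(W_norm, W_direct)
    return W_norm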
def spectral(input_polydata, number_of_clusters=200,
             number_of_eigenvectors=20, sigma=60, threshold=0.0,
             number_of_jobs=3, use_nystrom=False, nystrom_mask=None,
             landmarks=None, distance_method='Mean',
             normalized_cuts=True, bilateral=False):
    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007. Differences from that
    implementation: fiber point correspondences are defined using
    fixed-length fiber parameterization (instead of closest point).

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas()

    # Store all parameters to this function. They must be identical later to
    # label new data. Below, calculated values will also be stored in the atlas.
    atlas.number_of_eigenvectors = number_of_eigenvectors
    atlas.sigma = sigma
    atlas.threshold = threshold
    atlas.use_nystrom = use_nystrom
    atlas.landmarks = landmarks
    atlas.distance_method = distance_method
    atlas.bilateral = bilateral

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask selects fewer than 100 fibers."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError

        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask)
        atlas.nystrom_polydata = polydata_m
        polydata_n = filter.mask(input_polydata, nystrom_mask == False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:', sz, '/', number_fibers
        # Determine ordering to get the embedding to correspond to the original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0],
                                               numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask, :, :]
            landmarks_n = landmarks[~nystrom_mask, :, :]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m,
                                        distance_method, bilateral)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs,
                                           landmarks_n, landmarks_m,
                                           distance_method, bilateral)
        # sanity check
        print "Range of values in A:", numpy.min(A), numpy.max(A)
        print "Range of values in B:", numpy.min(B), numpy.max(B)
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                        sigma, number_of_jobs, landmarks,
                                        distance_method, bilateral)
        atlas.nystrom_polydata = input_polydata
        # sanity check
        print "Range of values in A:", numpy.min(A), numpy.max(A)

    # sanity check that A is symmetric; tolerate tiny numerical asymmetry
    testval = numpy.max(A - A.T)
    if not testval == 0.0:
        if testval > 1e-10:
            print "<cluster.py> ERROR: A matrix is not symmetric."
            raise AssertionError
        else:
            print "Maximum of A - A^T:", testval
        # make A exactly symmetric
        A = numpy.divide(A + A.T, 2.0)

    # sanity check that A is positive
    testval = numpy.min(A)
    if not testval > 0.0:
        print "<cluster.py> ERROR: A matrix is not positive."
        print "Minimum value in A: ", testval
        if testval < 0.0:
            raise AssertionError

    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if normalized_cuts:
        if use_nystrom:
            # Form of the entire affinity matrix:
            # A   B
            # B^T C
            # C is not computed.
            # Calculate the sum of the partial rows we've computed:
            atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
            print "A size:", A.shape
            print "B size:", B.shape
            print "row sum size:", atlas.row_sum_1.shape
            print "A-B matrix row sums range (should be > 0):", numpy.min(atlas.row_sum_1), numpy.max(atlas.row_sum_1)

            # Approximate the sum of the rest of the data (including C).
            # These are weighted sums of the columns we did compute, where the
            # weight depends on how similar that fiber was to each path in A.
            # This uses the dual basis of the columns in A.
            # Approximate the inverse of A for the dual basis, using A's top
            # eigenvectors (they must have positive eigenvalues).
            pos_def_approx = True
            if pos_def_approx:
                # Construct an approximate inverse using the largest eigenvalues of A.
                print "Using A's top eigenvectors in pinv"
                numA = len(A)
                nvec = 40
                if nvec > numA / 2.0:
                    nvec = int(numpy.round(numA / 2.0))
                if nvec < number_of_eigenvectors + 1:
                    nvec = number_of_eigenvectors + 1
                val, vec = numpy.linalg.eigh(A)
                # A could be reconstructed as numpy.dot(numpy.dot(vec, numpy.diag(val)), vec.T)
                ind = numpy.argsort(val)
                mask = ind[-nvec:-1]
                vec2 = vec[:, mask]
                val2 = val[mask]
                #A2 = numpy.dot(numpy.dot(vec2, numpy.diag(val2)), vec2.T)
                atlas.pinv_A = numpy.dot(numpy.dot(vec2, numpy.diag(numpy.divide(1.0, val2))), vec2.T)
            else:
                print "Using numpy linalg pinv A"
                atlas.pinv_A = numpy.linalg.pinv(A)

            e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
            print "test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]
            e_val, e_vec = numpy.linalg.eigh(A)
            print "Was A positive definite? Eigenvalue range of A:", e_val[0], e_val[-1]

            # row sum formula:
            # dhat = [a_r + b_r; b_c + B^T*A^-1*b_r]
            # this matrix is A^-1 * b_r, where b_r are the row sums of B
            # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
            atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
            test = numpy.sum(B.T, axis=0)
            print "B column sums range (should be > 0):", numpy.min(test), numpy.max(test)
            print "Range of row sum weights:", numpy.min(atlas.row_sum_matrix), numpy.max(atlas.row_sum_matrix)
            print "First 10 entries in weight matrix:", atlas.row_sum_matrix[0:10]
            test = numpy.dot(atlas.row_sum_matrix, B)
            print "Test partial sum estimation for B:", numpy.min(test), numpy.max(test)

            # row sum estimate for the current B part of the matrix
            row_sum_2 = numpy.sum(B, axis=0) + \
                numpy.dot(atlas.row_sum_matrix, B)
            print "Row sum check (min/max, should be > 0) A:", numpy.min(atlas.row_sum_1), \
                numpy.max(atlas.row_sum_1), "B:", numpy.min(row_sum_2), \
                numpy.max(row_sum_2)
            print atlas.row_sum_1.shape
            print row_sum_2.shape

            # normalized cuts normalization
            dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))
            A = \
                numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
            B = \
                numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))
        else:
            # normalized cuts normalization using row (same as column) sums
            row_sum = numpy.sum(A, axis=0)
            dhat = numpy.divide(1, numpy.sqrt(row_sum))
            A = \
                numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]

    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print '<cluster.py> Compute embedding using eigenvectors.'
    if use_nystrom:
        # Create embedding vectors using the Nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2)
        # See the paper "Spectral Grouping Using the Nystrom Method".
        # Basically all this does is add in the extra measurements by
        # projecting them onto the original eigenvector basis.
        # A=UVU' => U = AUV^-1 => take new rows of the extended A (B') and do
        # the same. U' = [AUV^-1 ; B'UV^-1] = [U ; B'UV^-1]
        # Note they divide the embedding by the 1st eigenvector rather
        # than the sqrt of the row sum, as in this code (below).
        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1]
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
            numpy.dot(numpy.dot(B.T, atlas.e_vec), \
            numpy.diag(numpy.divide(1.0, atlas.e_val)))))

        # normalize estimated eigenvectors to have length of one
        # matlab was:
        # atlas.eigenvectorLengthToNormalize = sqrt(sum(V.*V));
        # V = V./repmat(atlas.eigenvectorLengthToNormalize, length(V), 1);
        atlas.e_vec_norm = numpy.sum(numpy.multiply(V, V), 0)
        V = numpy.divide(V, atlas.e_vec_norm)

        # Normalize each embedding vector by the first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #     embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since row sums are 1.
        # The other option from the literature was to use this:
        # embedding_i,j = V_i+1,j./sqrt(D_j,j)
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding, i] = numpy.divide(V[:, -(i+2)], V[:, -1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse the order of the embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]

    # Default is always k-means. Other code is just left for testing.
    # It did not improve results.
    #centroid_finder = 'AffinityPropagation'
    centroid_finder = 'K-means'

    # 5) Find clusters using k-means in embedding space.
    cluster_metric = None
    if centroid_finder == 'K-means':
        print '<cluster.py> K-means clustering in embedding space.'
        atlas.centroids, cluster_metric = scipy.cluster.vq.kmeans(embed, number_of_clusters)
        cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)
        print "Distortion metric:", cluster_metric
        if 0:
            # This is extremely slow, but leave the code here in case it is ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, cluster_idx, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)
    else:
        # This found fewer clusters than we need to represent the anatomy well.
        # Leave the code here in case it is wanted in the future for more testing.
        print '<cluster.py> Affinity Propagation clustering in embedding space.'
        af = AffinityPropagation(preference=-50).fit(embed)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        print('Estimated number of clusters: %d' % n_clusters_)
        cluster_idx = labels
        for k in range(n_clusters_):
            class_members = labels == k
            atlas.centroids = embed[cluster_centers_indices[k]]
        # return metrics
        if 0:
            # This is extremely slow, but leave the code here in case it is ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, labels, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)

    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    #embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # set up polydata with clustering output info.
    # for now modify the input polydata by adding two arrays
    output_polydata = input_polydata
    output_polydata = \
        _format_output_polydata(output_polydata, cluster_idx, color, embed)

    return output_polydata, cluster_idx, color, embed, cluster_metric, atlas
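
# Self-contained sketch (illustration only, not called by the pipeline) of the
# Nystrom row sum estimate used in step 2 above. For a full affinity matrix
# [[A, B], [B^T, C]] where C is never computed, the unknown contribution of C
# to each row sum is approximated as B^T * pinv(A) * b_r, with b_r the row sums
# of B. On a toy matrix where C *is* available, the estimate can be compared
# with the exact row sums. All names here are local to the demo.
def _demo_nystrom_row_sums():
    import numpy
    numpy.random.seed(0)
    # build a toy full affinity matrix from random points (Gaussian kernel)
    points = numpy.random.rand(10, 2)
    d2 = numpy.sum((points[:, numpy.newaxis, :] - points[numpy.newaxis, :, :]) ** 2, axis=2)
    W = numpy.exp(-d2 / 0.5)
    sz = 5  # size of the Nystrom sample
    A = W[0:sz, 0:sz]
    B = W[0:sz, sz:]
    # exact row sums of the full matrix (possible only in this toy setting)
    exact = numpy.sum(W, axis=0)
    # Nystrom estimate, as in spectral(): rows through A and B are known exactly;
    # the C contribution is approximated via the pseudoinverse of A.
    row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
    row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), numpy.linalg.pinv(A))
    row_sum_2 = numpy.sum(B, axis=0) + numpy.dot(row_sum_matrix, B)
    approx = numpy.concatenate((row_sum_1, row_sum_2))
    print "exact row sums: ", exact
    print "approx row sums:", approx
    print "max abs error:", numpy.max(numpy.abs(exact - approx))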
def compute(self, input_vtk_polydata):
    """ Actually calculate the laterality index for every input fiber.

    Input polydata is required. This polydata is modified by adding a
    cell data array containing laterality indices.

    Output from this class is a struct: <io.py> class LateralityResults

    Parameters in the class can also be modified for experiments:
    sigma (in the Gaussian on inter-fiber-point distance),
    points_per_fiber (for parameterization), and threshold (below
    which inter-fiber-point distance is set to 0). Performance options
    are the number of parallel_jobs, and verbose (whether to print
    progress).

    """

    # internal representation for fast similarity computation
    # this also detects which hemisphere fibers are in
    self.fibers.points_per_fiber = self.points_per_fiber
    # must request hemisphere computation from the object
    self.fibers.hemispheres = True
    # Now convert to an array with points and hemispheres as above
    self.fibers.convert_from_polydata(input_vtk_polydata)

    # get the same number from each hemisphere if requested
    # -------------------------
    if self.use_equal_fibers:
        num_fibers = min(self.fibers.number_left_hem,
                         self.fibers.number_right_hem)
        if self.fibers_per_hemisphere is not None:
            if self.fibers_per_hemisphere <= num_fibers:
                num_fibers = self.fibers_per_hemisphere
            else:
                raise Exception(
                    "Fibers per hemisphere is set too high for the dataset. "
                    "Current subject maximum is " + str(num_fibers))

        # grab num_fibers fibers from each hemisphere.
        # use the first n since they were randomly sampled from the whole dataset
        selected_right = self.fibers.index_right_hem[0:num_fibers]
        selected_left = self.fibers.index_left_hem[0:num_fibers]
        mask = numpy.zeros(input_vtk_polydata.GetNumberOfLines())
        mask[selected_right] = 1
        mask[selected_left] = 1
        # go back to the input data and use just those fibers
        input_vtk_polydata = filter.mask(input_vtk_polydata, mask)
        # Now convert to an array with points and hemispheres as above
        self.fibers.convert_from_polydata(input_vtk_polydata)
        if self.verbose:
            print "<laterality.py> Using", num_fibers, "fibers per hemisphere."

    # square sigma for the Gaussian used later
    sigmasq = self.sigma * self.sigma

    # allocate outputs
    nf = self.fibers.number_of_fibers
    laterality_index = numpy.zeros(nf)
    right_hem_total = numpy.zeros(nf)
    left_hem_total = numpy.zeros(nf)
    #right_hem_distance = numpy.zeros([nf, nf])
    #left_hem_distance = numpy.zeros([nf, nf])

    # grab all fibers from each hemisphere
    fiber_array_right = self.fibers.get_fibers(self.fibers.index_right_hem)
    fiber_array_left = self.fibers.get_fibers(self.fibers.index_left_hem)

    # tell the user we are doing something
    if self.verbose:
        print "<laterality.py> Fibers in each hemisphere.", \
            "L:", self.fibers.number_left_hem, \
            "R:", self.fibers.number_right_hem, \
            "/ Total:", self.fibers.number_of_fibers
        print "<laterality.py> Starting to compute laterality indices"

    # run the computation, either in parallel or not
    if USE_PARALLEL and self.parallel_jobs > 1:
        if self.verbose:
            print "<laterality.py> Starting parallel code. Processes:", \
                self.parallel_jobs

        # compare to the right hemisphere (reflect the fiber first if in the left hem)
        ret = Parallel(
            n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                delayed(similarity.total_similarity_for_laterality)(
                    self.fibers.get_fiber(lidx),
                    fiber_array_right,
                    self.fibers.is_left_hem[lidx],
                    self.threshold,
                    sigmasq)
                for lidx in self.fibers.index_hem)
        #ret = zip(*ret)
        right_hem_total[self.fibers.index_hem] = ret
        #right_hem_distance = ret[1]

        # compare to the left hemisphere (reflect the fiber first if in the right hem)
        ret = Parallel(
            n_jobs=self.parallel_jobs, verbose=self.parallel_verbose)(
                delayed(similarity.total_similarity_for_laterality)(
                    self.fibers.get_fiber(lidx),
                    fiber_array_left,
                    self.fibers.is_right_hem[lidx],
                    self.threshold,
                    sigmasq)
                for lidx in self.fibers.index_hem)
        #ret = zip(*ret)
        left_hem_total[self.fibers.index_hem] = ret
        #left_hem_distance = ret[1]
    else:
        right_hem_distance = numpy.zeros(
            [nf, len(self.fibers.index_right_hem)])
        left_hem_distance = numpy.zeros(
            [nf, len(self.fibers.index_left_hem)])

        # compare to the right hemisphere (reflect the fiber first if in the left hem)
        for lidx in self.fibers.index_hem:
            ret = similarity.total_similarity_for_laterality(
                self.fibers.get_fiber(lidx),
                fiber_array_right,
                self.fibers.is_left_hem[lidx],
                self.threshold,
                sigmasq)
            right_hem_total[lidx] = ret
            #right_hem_total[lidx] = ret[0]
            #right_hem_distance[lidx, :] = ret[1]

        # compare to the left hemisphere (reflect the fiber first if in the right hem)
        for lidx in self.fibers.index_hem:
            ret = similarity.total_similarity_for_laterality(
                self.fibers.get_fiber(lidx),
                fiber_array_left,
                self.fibers.is_right_hem[lidx],
                self.threshold,
                sigmasq)
            left_hem_total[lidx] = ret
            #left_hem_distance[lidx, :] = ret[1]

    laterality_index = compute_laterality_index(left_hem_total,
                                                right_hem_total,
                                                self.fibers.index_hem)

    # output the LI as cell data in the polydata
    # for visualization and/or further analyses
    cell_data = vtk.vtkFloatArray()
    cell_data.SetName('Laterality')
    for lidx in range(0, self.fibers.number_of_fibers):
        cell_data.InsertNextTuple1(laterality_index[lidx])
    input_vtk_polydata.GetCellData().SetScalars(cell_data)

    # output everything
    results = LateralityResults()
    results.laterality_index = laterality_index
    results.polydata = input_vtk_polydata
    #results.right_hem_distance = right_hem_distance
    #results.left_hem_distance = left_hem_distance
    results.sigma = self.sigma
    results.points_per_fiber = self.points_per_fiber
    results.threshold = self.threshold
    results.left_hem_similarity = left_hem_total
    results.right_hem_similarity = right_hem_total
    results.hemisphere = self.fibers.fiber_hemisphere

    return results
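
# compute_laterality_index() is defined elsewhere in this package. The function
# below is a hedged, minimal stand-in (illustration only) showing the standard
# normalized asymmetry formula LI = (R - L) / (R + L), evaluated only for the
# fibers in either hemisphere; the actual implementation may differ in details.
def _demo_laterality_index(left_total, right_total, index_hem):
    import numpy
    li = numpy.zeros(len(left_total))
    r = right_total[index_hem]
    l = left_total[index_hem]
    # each fiber's self-similarity makes the totals positive, so the
    # denominator is assumed nonzero here
    li[index_hem] = numpy.divide(r - l, r + l)
    return li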
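
# Hedged usage sketch for the laterality computation above (kept as a comment
# so it does not run on import; the enclosing class name and the file name are
# assumptions, not confirmed by this file):
#
#   analysis = LateralityAnalysis()   # hypothetical enclosing class name
#   analysis.sigma = 10.0
#   analysis.points_per_fiber = 20
#   analysis.parallel_jobs = 4
#   pd = io.read_polydata('subject_tractography.vtp')
#   results = analysis.compute(pd)
#   print results.laterality_index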
def output_and_quality_control_cluster_atlas(atlas, output_polydata_s, subject_fiber_list,
                                             input_polydatas, number_of_subjects, outdir,
                                             cluster_numbers_s, color, embed,
                                             number_of_fibers_to_display, testing=False,
                                             verbose=False, render_images=True):
    """Save the output in our atlas format for automatic labeling of clusters.

    First save the atlas.vtp and atlas.p datasets. This is the data
    used to label a new subject. Also write the polydata with cluster
    indices saved as cell data. This is a one-file output option for
    clusters. Finally, save some quality control metrics and save the
    atlas clusters as individual polydatas. This is used to set up a
    mrml hierarchy file and to visualize the output in Slicer. This
    data is not used to label a new subject.

    """

    # Write additional output for software testing if the code has changed
    # (requires the random seed to be constant)
    if testing:
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_numbers.pkl')
        pickle.dump(cluster_numbers_s, open(expected_results_file, 'wb'))
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_colors.pkl')
        pickle.dump(color, open(expected_results_file, 'wb'))
        expected_results_file = os.path.join(outdir, 'test_cluster_atlas_embeddings.pkl')
        pickle.dump(embed, open(expected_results_file, 'wb'))

    # Save the output in our atlas format for automatic labeling of full brain datasets.
    # This is the data used to label a new subject.
    atlas.save(outdir, 'atlas')

    # Write the polydata with cluster indices saved as cell data
    fname_output = os.path.join(outdir, 'clustered_whole_brain.vtp')
    io.write_polydata(output_polydata_s, fname_output)

    # output a summary file to save information about all subjects
    subjects_qc_fname = os.path.join(outdir, 'input_subjects.txt')
    subjects_qc_file = open(subjects_qc_fname, 'w')
    outstr = "Subject_idx\tSubject_ID\tfilename\n"
    subjects_qc_file.write(outstr)
    idx = 1
    for fname in input_polydatas:
        subject_id = os.path.splitext(os.path.basename(fname))[0]
        outstr = str(idx) + '\t' + str(subject_id) + '\t' + str(fname) + '\n'
        subjects_qc_file.write(outstr)
        idx += 1
    subjects_qc_file.close()

    # output a summary file to save information about all clusters
    clusters_qc_fname = os.path.join(outdir, 'cluster_quality_control.txt')

    # Figure out how many subjects are in each cluster (ideally, most
    # subjects are present in most clusters)
    subjects_per_cluster = list()
    percent_subjects_per_cluster = list()
    fibers_per_cluster = list()
    mean_fiber_len_per_cluster = list()
    std_fiber_len_per_cluster = list()
    mean_fibers_per_subject_per_cluster = list()
    std_fibers_per_subject_per_cluster = list()

    # find out the length of each fiber
    fiber_length, step_size = filter.compute_lengths(output_polydata_s)

    # loop over each cluster and compute quality control metrics
    cluster_indices = range(atlas.centroids.shape[0])
    for cidx in cluster_indices:
        cluster_mask = (cluster_numbers_s == cidx)
        subjects_per_cluster.append(len(set(subject_fiber_list[cluster_mask])))
        fibers_per_subject = list()
        for sidx in range(number_of_subjects):
            fibers_per_subject.append(list(subject_fiber_list[cluster_mask]).count(sidx))
        mean_fibers_per_subject_per_cluster.append(numpy.mean(numpy.array(fibers_per_subject)))
        std_fibers_per_subject_per_cluster.append(numpy.std(numpy.array(fibers_per_subject)))
        mean_fiber_len_per_cluster.append(numpy.mean(fiber_length[cluster_mask]))
        std_fiber_len_per_cluster.append(numpy.std(fiber_length[cluster_mask]))

    percent_subjects_per_cluster = numpy.divide(numpy.array(subjects_per_cluster),
                                                float(number_of_subjects))

    # Save the output quality control information
    print "<cluster.py> Saving cluster quality control information file."
    clusters_qc_file = open(clusters_qc_fname, 'w')
    print >> clusters_qc_file, 'cluster_idx', '\t', 'number_subjects', '\t', 'percent_subjects', '\t', 'mean_length', '\t', 'std_length', '\t', 'mean_fibers_per_subject', '\t', 'std_fibers_per_subject'
    for cidx in cluster_indices:
        print >> clusters_qc_file, cidx + 1, '\t', subjects_per_cluster[cidx], '\t', percent_subjects_per_cluster[cidx] * 100.0, '\t', \
            mean_fiber_len_per_cluster[cidx], '\t', std_fiber_len_per_cluster[cidx], '\t', \
            mean_fibers_per_subject_per_cluster[cidx], '\t', std_fibers_per_subject_per_cluster[cidx]
    clusters_qc_file.close()

    if HAVE_PLT:
        print "<cluster.py> Saving subjects per cluster histogram."
        fig, ax = plt.subplots()
        counts = numpy.zeros(number_of_subjects + 1)
        counts[:numpy.max(subjects_per_cluster) + 1] = numpy.bincount(subjects_per_cluster)
        ax.bar(range(number_of_subjects + 1), counts, width=1, align='center')
        ax.set(xlim=[-1, number_of_subjects + 1])
        plt.title('Histogram of Subjects per Cluster')
        plt.xlabel('subjects per cluster')
        plt.ylabel('number of clusters')
        plt.savefig(os.path.join(outdir, 'subjects_per_cluster_hist.pdf'))
        plt.close()

    # Save the entire combined atlas as individual clusters for visualization
    # and labeling/naming of structures. This will include all of the data
    # that was clustered to make the atlas.
    # Figure out the file name and mean color for each cluster, and write the
    # individual polydatas.
    print "<cluster.py> Beginning to save individual clusters as polydata files. TOTAL CLUSTERS:", len(cluster_indices),
    fnames = list()
    cluster_colors = list()
    cluster_sizes = list()
    cluster_fnames = list()
    for c in cluster_indices:
        print c,
        mask = cluster_numbers_s == c
        cluster_size = numpy.sum(mask)
        cluster_sizes.append(cluster_size)
        pd_c = filter.mask(output_polydata_s, mask, verbose=verbose)
        # color by subject so we can see which one it came from
        filter.add_point_data_array(pd_c, subject_fiber_list[mask], "Subject_ID")
        # Save hemisphere information into the polydata
        farray = fibers.FiberArray()
        farray.hemispheres = True
        farray.hemisphere_percent_threshold = 0.90
        farray.convert_from_polydata(pd_c, points_per_fiber=50)
        filter.add_point_data_array(pd_c, farray.fiber_hemisphere, "Hemisphere")
        # The clusters are stored starting with 1, not 0, for user friendliness.
        fname_c = 'cluster_{0:05d}.vtp'.format(c + 1)
        # save the filename for writing into the MRML file
        fnames.append(fname_c)
        # prepend the output directory
        fname_c = os.path.join(outdir, fname_c)
        #print fname_c
        io.write_polydata(pd_c, fname_c)
        cluster_fnames.append(fname_c)
        if cluster_size > 0:
            color_c = color[mask, :]
            cluster_colors.append(numpy.mean(color_c, 0))
        else:
            cluster_colors.append([0, 0, 0])
        del pd_c
    print "\n<cluster.py> Finished saving individual clusters as polydata files."

    # Notify the user if some clusters are empty
    empty_count = 0
    for sz, fname in zip(cluster_sizes, cluster_fnames):
        if sz == 0:
            print sz, ":", fname
            empty_count += 1
    if empty_count:
        print "<cluster.py> Warning. Empty clusters found:", empty_count

    cluster_sizes = numpy.array(cluster_sizes)
    print "<cluster.py> Mean number of fibers per cluster:", numpy.mean(cluster_sizes), "Range:", numpy.min(cluster_sizes), "..", numpy.max(cluster_sizes)

    # Estimate the subsampling ratio needed to display approximately
    # number_of_fibers_to_display total fibers in 3D Slicer
    number_fibers = len(cluster_numbers_s)
    if number_fibers < number_of_fibers_to_display:
        ratio = 1.0
    else:
        # float() avoids Python 2 integer division, which would give a ratio of 0
        ratio = float(number_of_fibers_to_display) / number_fibers
    print "<cluster.py> Subsampling ratio for display of", number_of_fibers_to_display, "total fibers estimated as:", ratio

    # Write the MRML file into the directory where the polydatas were already stored
    fname = os.path.join(outdir, 'clustered_tracts.mrml')
    mrml.write(fnames, numpy.around(numpy.array(cluster_colors), decimals=3),
               fname, ratio=ratio)

    # Also write one with 100% of fibers displayed
    fname = os.path.join(outdir, 'clustered_tracts_display_100_percent.mrml')
    mrml.write(fnames, numpy.around(numpy.array(cluster_colors), decimals=3),
               fname, ratio=1.0)

    # Render the whole thing and save jpg images for quality control
    if render_images:
        print '<cluster.py> Rendering and saving images of cluster atlas.'
        ren = render.render(output_polydata_s, 1000, data_mode='Cell',
                            data_name='EmbeddingColor', verbose=verbose)
        ren.save_views(outdir, verbose=verbose)
        del ren
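
# Typical calling sequence (a hedged sketch kept as a comment; argument values
# are illustrative, and appended_polydata, subject_fiber_list, input_polydatas,
# and nystrom_mask are assumed to come from an earlier step that appends all
# subjects into one polydata). The seven-value return matches the spectral()
# variant defined below, which also returns the rejected outlier indices:
#
#   output_polydata_s, cluster_numbers_s, color, embed, metric, atlas, reject_idx = \
#       spectral(appended_polydata, number_of_clusters=800,
#                number_of_eigenvectors=10, sigma=60,
#                use_nystrom=True, nystrom_mask=nystrom_mask)
#   output_and_quality_control_cluster_atlas(
#       atlas, output_polydata_s, subject_fiber_list, input_polydatas,
#       number_of_subjects, outdir, cluster_numbers_s, color, embed,
#       number_of_fibers_to_display=10000.0)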
def spectral(input_polydata, number_of_clusters=200,
             number_of_eigenvectors=20, sigma=60, threshold=0.0,
             number_of_jobs=3, use_nystrom=False, nystrom_mask=None,
             landmarks=None, distance_method='Mean', normalized_cuts=True,
             outlier_std_threshold=2.0, pos_def_approx=True, bilateral=False):
    """ Spectral clustering based on pairwise fiber affinity matrix.

    As in O'Donnell and Westin TMI 2007. Differences from that
    implementation: fiber point correspondences are defined using
    fixed-length fiber parameterization (instead of closest point).

    """

    # test pd has lines first
    number_fibers = input_polydata.GetNumberOfLines()
    print "<cluster.py> Starting spectral clustering."
    print "<cluster.py> Number of input fibers:", number_fibers
    print "<cluster.py> Number of clusters:", number_of_clusters

    if number_fibers == 0:
        print "<cluster.py> ERROR: Cannot cluster polydata with 0 fibers."
        return

    atlas = ClusterAtlas()

    # Store all parameters to this function. They must be identical later to
    # label new data. Below, calculated values will also be stored in the atlas.
    atlas.number_of_eigenvectors = number_of_eigenvectors
    atlas.sigma = sigma
    atlas.threshold = threshold
    atlas.use_nystrom = use_nystrom
    atlas.landmarks = landmarks
    atlas.distance_method = distance_method
    atlas.bilateral = bilateral

    # Indices of rejected outlier fibers, returned for bookkeeping.
    # Set below when Nystrom outlier rejection runs.
    reject_idx = None

    # 1) Compute fiber similarities.
    # Nystrom version of the code uses a sample of the data.
    if use_nystrom:
        # make sure it's an array for logic operations
        nystrom_mask = numpy.array(nystrom_mask)
        # make sure it's boolean or 0 and 1
        test = numpy.max(nystrom_mask) == 1.0
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask may not be Boolean. Max value is not 1.0/True."
            raise AssertionError
        # make sure it's large enough
        test = sum(nystrom_mask) >= 100
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask selects fewer than 100 fibers."
            raise AssertionError
        # make sure its size matches the polydata input
        test = len(nystrom_mask) == number_fibers
        if not test:
            print "<cluster.py> ERROR: Nystrom data mask size does not match polydata number of lines."
            raise AssertionError

        # Separate the Nystrom sample and the rest of the data.
        polydata_m = filter.mask(input_polydata, nystrom_mask, verbose=False)
        atlas.nystrom_polydata = polydata_m
        polydata_n = filter.mask(input_polydata, nystrom_mask == False, verbose=False)
        sz = polydata_m.GetNumberOfLines()
        print '<cluster.py> Using Nystrom approximation. Subset size:', sz, '/', number_fibers
        # Determine ordering to get the embedding to correspond to the original input data.
        reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask)[0],
                                               numpy.where(~nystrom_mask)[0]))
        if landmarks is not None:
            landmarks_m = landmarks[nystrom_mask, :, :]
            landmarks_n = landmarks[~nystrom_mask, :, :]
        else:
            landmarks_m = landmarks_n = None

        # Calculate fiber similarities
        A = \
            _pairwise_similarity_matrix(polydata_m, threshold,
                                        sigma, number_of_jobs, landmarks_m,
                                        distance_method, bilateral)
        B = \
            _rectangular_similarity_matrix(polydata_n, polydata_m, threshold,
                                           sigma, number_of_jobs,
                                           landmarks_n, landmarks_m,
                                           distance_method, bilateral)
        # sanity check
        print "<cluster.py> Range of values in A:", numpy.min(A), numpy.max(A)
        print "<cluster.py> Range of values in B:", numpy.min(B), numpy.max(B)
    else:
        # Calculate all fiber similarities
        A = \
            _pairwise_similarity_matrix(input_polydata, threshold,
                                        sigma, number_of_jobs, landmarks,
                                        distance_method, bilateral)
        atlas.nystrom_polydata = input_polydata
        # sanity check
        print "<cluster.py> Range of values in A:", numpy.min(A), numpy.max(A)

    # sanity check that A is symmetric; tolerate tiny numerical asymmetry
    testval = numpy.max(A - A.T)
    if not testval == 0.0:
        if testval > 1e-10:
            print "<cluster.py> ERROR: A matrix is not symmetric."
            raise AssertionError
        else:
            print "<cluster.py> Maximum of A - A^T:", testval
        # Ensure that A is exactly symmetric
        A = numpy.divide(A + A.T, 2.0)

    # sanity check that A is positive
    testval = numpy.min(A)
    if not testval > 0.0:
        print "<cluster.py> ERROR: A matrix is not positive."
        print "<cluster.py> Minimum value in A: ", testval
        if testval < 0.0:
            raise AssertionError

    if use_nystrom:
        # Outliers will have low measured (or estimated) row sums. Detect outliers in A.
        # To turn this off for testing, set outlier_std_threshold = numpy.inf.
        row_sum_A_initial = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
        print "<cluster.py> Initial similarity (row) sum A:", numpy.mean(row_sum_A_initial), numpy.std(row_sum_A_initial), numpy.min(row_sum_A_initial)
        atlas.outlier_std_threshold = outlier_std_threshold
        atlas.row_sum_threshold_for_rejection = numpy.mean(row_sum_A_initial) - \
            outlier_std_threshold * numpy.std(row_sum_A_initial)
        bad_idx = numpy.nonzero(row_sum_A_initial < atlas.row_sum_threshold_for_rejection)[0]
        reject_A = bad_idx
        print "<cluster.py> Rejecting n=", len(bad_idx), "/", sz, "fibers more than", outlier_std_threshold, "standard deviations below the mean total fiber similarity"
        A = numpy.delete(A, reject_A, 0)
        A = numpy.delete(A, reject_A, 1)
        #print A.shape, B.shape
        B = numpy.delete(B, reject_A, 0)
        #print A.shape, B.shape, reorder_embedding.shape

    # Ensure that A is positive definite.
    if pos_def_approx:
        e_val, e_vec = numpy.linalg.eigh(A)
        print "<cluster.py> Eigenvalue range of A:", e_val[0], e_val[-1]
        A2 = nearPSD(A)
        e_val, e_vec = numpy.linalg.eigh(A2)
        print "<cluster.py> Eigenvalue range of nearest PSD matrix to A:", e_val[0], e_val[-1]
        testval = numpy.max(A - A2)
        if not testval == 0.0:
            print "<cluster.py> A matrix differs from the nearest PSD matrix by a maximum of:", testval
            if testval > 0.25:
                print "<cluster.py> ERROR: A matrix changed by more than 0.25."
                raise AssertionError
        A = A2

    # 2) Do Normalized Cuts transform of similarity matrix.
    # See the paper: "Spectral Grouping Using the Nystrom Method"
    # (D^-1/2 W D^-1/2) V = V Lambda
    if normalized_cuts:
        if use_nystrom:
            # Form of the entire affinity matrix:
            # A   B
            # B^T C
            # C is not computed.
            # Calculate the sum of the partial rows we've computed:
            atlas.row_sum_1 = numpy.sum(A, axis=0) + numpy.sum(B.T, axis=0)
            #print "<cluster.py> A size:", A.shape
            #print "<cluster.py> B size:", B.shape
            #print "<cluster.py> A-B matrix row sums range (should be > 0):", numpy.min(atlas.row_sum_1), numpy.max(atlas.row_sum_1)

            # Approximate the sum of the rest of the data (including C).
            # These are weighted sums of the columns we did compute, where the
            # weight depends on how similar that fiber was to each path in A.
            # This uses the dual basis of the columns in A.
            # Approximate the inverse of A for the dual basis.
            #print "<cluster.py> Using numpy linalg pinv A"
            atlas.pinv_A = numpy.linalg.pinv(A)
            #e_val, e_vec = numpy.linalg.eigh(atlas.pinv_A)
            #print "<cluster.py> test of non-normalized A pseudoinverse Eigenvalue range:", e_val[0], e_val[-1]

            # row sum formula:
            # dhat = [a_r + b_r; b_c + B^T*A^-1*b_r]
            # this matrix is A^-1 * b_r, where b_r are the row sums of B
            # matlab was: atlas.approxRowSumMatrix = sum(B',1)*atlas.pseudoInverseA;
            atlas.row_sum_matrix = numpy.dot(numpy.sum(B.T, axis=0), atlas.pinv_A)
            #test = numpy.sum(B.T, axis=0)
            #print "<cluster.py> B column sums range (should be > 0):", numpy.min(test), numpy.max(test)
            print "<cluster.py> Range of row sum weights:", numpy.min(atlas.row_sum_matrix), numpy.max(atlas.row_sum_matrix)
            #print "<cluster.py> First 10 entries in weight matrix:", atlas.row_sum_matrix[0:10]
            #test = numpy.dot(atlas.row_sum_matrix, B)
            #print "<cluster.py> Test partial sum estimation for B:", numpy.min(test), numpy.max(test)
            #del test

            # row sum estimate for the current B part of the matrix
            row_sum_2 = numpy.sum(B, axis=0) + \
                numpy.dot(atlas.row_sum_matrix, B)
            print "<cluster.py> Row sum check (min/median/max, should be > 0) A:", numpy.min(atlas.row_sum_1), numpy.median(atlas.row_sum_1), numpy.max(atlas.row_sum_1), "B:", numpy.min(row_sum_2), numpy.median(row_sum_2), numpy.max(row_sum_2)

            # reject outliers in B
            bad_idx = numpy.nonzero(row_sum_2 < atlas.row_sum_threshold_for_rejection)[0]
            reject_B = bad_idx
            print "<cluster.py> Rejecting n=", len(bad_idx), "/", B.shape[1], "fibers more than", outlier_std_threshold, "standard deviations below the mean total fiber similarity"
            row_sum_2 = numpy.delete(row_sum_2, reject_B)
            B = numpy.delete(B, reject_B, 1)

            print "<cluster.py> After outlier rejection A:", A.shape, "B:", B.shape
            print "<cluster.py> Row sum check (min/median/max, should be > 0) A:", numpy.min(atlas.row_sum_1), numpy.median(atlas.row_sum_1), numpy.max(atlas.row_sum_1), "B:", numpy.min(row_sum_2), numpy.median(row_sum_2), numpy.max(row_sum_2)

            # Separate the Nystrom sample and the rest of the data after removing outliers
            nystrom_mask_2 = nystrom_mask
            midxA = numpy.nonzero(nystrom_mask_2)[0]
            nystrom_mask_2[midxA[reject_A]] = False
            not_nystrom_mask = nystrom_mask == False
            not_nystrom_mask[midxA[reject_A]] = False
            midxB = numpy.nonzero(not_nystrom_mask)[0]
            not_nystrom_mask[midxB[reject_B]] = False

            polydata_m = filter.mask(input_polydata, nystrom_mask_2, verbose=False)
            atlas.nystrom_polydata = polydata_m
            polydata_n = filter.mask(input_polydata, not_nystrom_mask, verbose=False)
            output_polydata = filter.mask(input_polydata,
                                          numpy.add(nystrom_mask_2, not_nystrom_mask),
                                          verbose=False)
            sz = polydata_m.GetNumberOfLines()
            number_fibers = output_polydata.GetNumberOfLines()
            print '<cluster.py> Using Nystrom approximation. Subset size (A):', sz, '/', number_fibers, "B:", polydata_n.GetNumberOfLines()

            # Determine ordering to get the embedding to correspond to the original input data.
            # reject outliers from the masks
            reject_idx = numpy.concatenate((midxA[reject_A], midxB[reject_B]))
            nystrom_mask_2 = numpy.delete(nystrom_mask_2, reject_idx)
            not_nystrom_mask = numpy.delete(not_nystrom_mask, reject_idx)
            #print "after mask:", numpy.sum(nystrom_mask_2), numpy.sum(not_nystrom_mask)
            reorder_embedding = numpy.concatenate((numpy.where(nystrom_mask_2)[0],
                                                   numpy.where(not_nystrom_mask)[0]))
            #print "after embedding reorder calc:", reorder_embedding.shape, numpy.max(reorder_embedding), numpy.min(reorder_embedding)

            # in case of negative row sum estimation
            if any(row_sum_2 <= 0):
                print "<cluster.py> Warning: negative or zero row sum approximations. Consider increasing sigma or using the Mean distance."
                print "Number of non-positive row sums:", numpy.count_nonzero(row_sum_2 <= 0)
                #row_sum_2[row_sum_2 < 0] = 0.1

            # save for testing
            column_sum = numpy.concatenate((numpy.sum(A, axis=1), numpy.sum(B.T, axis=1)))

            # normalized cuts normalization
            row_sum = numpy.concatenate((atlas.row_sum_1, row_sum_2))
            dhat = numpy.sqrt(numpy.divide(1, row_sum))
            #dhat = numpy.sqrt(numpy.divide(1, numpy.concatenate((atlas.row_sum_1, row_sum_2))))
            A = \
                numpy.multiply(A, numpy.outer(dhat[0:sz], dhat[0:sz].T))
            B = \
                numpy.multiply(B, numpy.outer(dhat[0:sz], dhat[sz:].T))
        else:
            # normalized cuts normalization using row (same as column) sums
            row_sum = numpy.sum(A, axis=0)
            print "<cluster.py> A matrix row sums range (should be > 0):", numpy.min(row_sum), numpy.max(row_sum)
            dhat = numpy.divide(1, numpy.sqrt(row_sum))
            A = \
                numpy.multiply(A, numpy.outer(dhat, dhat.T))

    # 3) Compute eigenvectors for use in spectral embedding
    print '<cluster.py> Calculating eigenvectors of similarity matrix A...'
    atlas.e_val, atlas.e_vec = numpy.linalg.eigh(A)
    print '<cluster.py> Done calculating eigenvectors.'
    print "<cluster.py> Eigenvalue range:", atlas.e_val[0], atlas.e_val[-1]

    # Check how well our chosen number of eigenvectors models the data
    power = numpy.cumsum(atlas.e_val[::-1]) / numpy.sum(atlas.e_val)
    print "<cluster.py> Power from chosen number of eigenvectors (", number_of_eigenvectors, ')', power[number_of_eigenvectors]
    print '<cluster.py> Top eigenvalues:', atlas.e_val[::-1][1:number_of_eigenvectors]

    # 4) Compute embedding using eigenvectors
    print '<cluster.py> Compute embedding using eigenvectors.'
    if use_nystrom:
        # Create embedding vectors using the Nystrom approximation to find
        # the approximate top eigenvectors of the matrix
        # L = D^(-1/2) (D - W) D^(-1/2)
        # See the paper "Spectral Grouping Using the Nystrom Method".
        # Basically all this does is add in the extra measurements by
        # projecting them onto the original eigenvector basis.
        # A=UVU' => U = AUV^-1 => take new rows of the extended A (B') and do
        # the same. U' = [AUV^-1 ; B'UV^-1] = [U ; B'UV^-1]
        # Note they divide the embedding by the 1st eigenvector rather
        # than the sqrt of the row sum, as in this code (below).
        # matlab was: % project onto eigenvectors of A:
        # % v' = [v ; B'*v*d^-1]
        # V = [atlas.eigenvectA; B'*atlas.eigenvectA*(diag(1./diag(atlas.eigenvalA)))];
        V = numpy.concatenate((atlas.e_vec, \
            numpy.dot(numpy.dot(B.T, atlas.e_vec), \
            numpy.diag(numpy.divide(1.0, atlas.e_val)))))

        # normalize estimated eigenvectors to have length of one
        # matlab was:
        # atlas.eigenvectorLengthToNormalize = sqrt(sum(V.*V));
        # V = V./repmat(atlas.eigenvectorLengthToNormalize, length(V), 1);
        atlas.e_vec_norm = numpy.sum(numpy.multiply(V, V), 0)
        V = numpy.divide(V, atlas.e_vec_norm)

        # Normalize each embedding vector by the first eigenvector. Matlab code was:
        # for i = 2:embedLength+1
        #     embedding(:,i-1) = V(:,i)./V(:,1);
        # end
        # This eigenvector corresponds to an eigenvalue of 1, since row sums are 1.
        # The other option from the literature was to use this:
        # embedding_i,j = V_i+1,j./sqrt(D_j,j)
        embed = numpy.zeros((number_fibers, number_of_eigenvectors))
        #print embed.shape, number_fibers, reorder_embedding.shape, V.shape, A.shape, B.shape
        for i in range(0, number_of_eigenvectors):
            embed[reorder_embedding, i] = numpy.divide(V[:, -(i+2)], V[:, -1])
    else:
        embed = atlas.e_vec[:, -number_of_eigenvectors - 2: -2]
        embed = numpy.divide(embed.T, atlas.e_vec[:, -1]).T
        # reverse the order of the embedding so highest eigenvalue
        # information is first
        embed = embed[:, ::-1]

    # Default is always k-means. Other code is just left for testing.
    # It did not improve results.
    #centroid_finder = 'AffinityPropagation'
    centroid_finder = 'K-means'

    # 5) Find clusters using k-means in embedding space.
    cluster_metric = None
    if centroid_finder == 'K-means':
        print '<cluster.py> K-means clustering in embedding space.'
        # note: scipy.cluster.vq.kmeans2 returns labels as its second value;
        # cluster_idx is recomputed below against the reordered centroids
        centroids, cluster_metric = scipy.cluster.vq.kmeans2(embed, number_of_clusters, minit='points')
        # Sort centroids according to the colormap and save them in this order
        # in the atlas. (An alternative, sorting by the first eigenvector, is
        # commented out below.)
        #centroid_order = numpy.argsort(centroids[:,0])
        color = _embed_to_rgb(centroids)
        centroid_order = render.argsort_by_jet_lookup_table(color)
        atlas.centroids = centroids[centroid_order, :]
        cluster_idx, dist = scipy.cluster.vq.vq(embed, atlas.centroids)
        #print "<cluster.py> Distortion metric:", cluster_metric
        if 0:
            # This is extremely slow, but leave the code here in case it is ever wanted for testing
            cluster_metric = metrics.silhouette_score(embed, cluster_idx, metric='sqeuclidean')
            print("Silhouette Coefficient: %0.3f" % cluster_metric)
    else:
        print "ERROR: Unknown centroid finder", centroid_finder
        ## This found fewer clusters than we need to represent the anatomy well.
        ## Leave the code here in case it is wanted in the future for more testing.
        ##print '<cluster.py> Affinity Propagation clustering in embedding space.'
        ##af = AffinityPropagation(preference=-50).fit(embed)
        ##cluster_centers_indices = af.cluster_centers_indices_
        ##labels = af.labels_
        ##n_clusters_ = len(cluster_centers_indices)
        ##print('Estimated number of clusters: %d' % n_clusters_)
        ##cluster_idx = labels
        ##for k in range(n_clusters_):
        ##    class_members = labels == k
        ##    atlas.centroids = embed[cluster_centers_indices[k]]
        ### return metrics
        ##if 0:
        ##    # This is extremely slow, but leave the code here in case it is ever wanted for testing
        ##    cluster_metric = metrics.silhouette_score(embed, labels, metric='sqeuclidean')
        ##    print("Silhouette Coefficient: %0.3f" % cluster_metric)

    # 6) Output results.
    print '<cluster.py> Done spectral clustering, returning results.'
    # visualize embedding coordinates as RGB
    embed2 = embed
    #embed2[numpy.isnan(embed)] = 0.0
    color = _embed_to_rgb(embed2)
    # set up polydata with clustering output info.
    # for now modify the input polydata by adding two arrays
    if use_nystrom:
        output_polydata = \
            _format_output_polydata(output_polydata, cluster_idx, color, embed,
                                    row_sum[reorder_embedding],
                                    column_sum[reorder_embedding])
    else:
        # row and column sums are the same. no need to reorder.
        output_polydata = \
            _format_output_polydata(input_polydata, cluster_idx, color, embed,
                                    row_sum, row_sum)

    return output_polydata, cluster_idx, color, embed, cluster_metric, atlas, reject_idx
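
# nearPSD() is referenced above but defined elsewhere in this package. Below is
# a hedged, self-contained sketch of one common way to compute a nearby positive
# semidefinite matrix: symmetrize, then clip negative eigenvalues to zero. This
# is the Frobenius-norm projection of a symmetric matrix onto the PSD cone; the
# actual nearPSD() used above may differ in details such as scaling.
def _near_psd_sketch(A, eps=0.0):
    import numpy
    B = numpy.divide(A + A.T, 2.0)              # symmetrize
    e_val, e_vec = numpy.linalg.eigh(B)         # real spectrum of symmetric B
    e_val_clipped = numpy.maximum(e_val, eps)   # remove negative eigenvalues
    return numpy.dot(numpy.dot(e_vec, numpy.diag(e_val_clipped)), e_vec.T)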