def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  # require Dij, d_c
  P = Profiler("2. calculate rho density")
  print("finished Dij, now calculating rho_i, the density")
  from xfel.clustering import Rodriguez_Laio_clustering_2014
  R = Rodriguez_Laio_clustering_2014(distance_matrix=self.Dij, d_c=self.d_c)
  self.rho = rho = R.get_rho()
  ave_rho = flex.mean(rho.as_double())
  NN = self.Dij.focus()[0]
  print("The average rho_i is %5.2f, or %4.1f%%" % (ave_rho, 100 * ave_rho / NN))
  i_max = flex.max_index(rho)
  P = Profiler("3. transition")
  print("the index with the highest density is %d" % (i_max))
  delta_i_max = flex.max(flex.double([self.Dij[i_max, j] for j in range(NN)]))
  print("delta_i_max", delta_i_max)
  rho_order = flex.sort_permutation(rho, reverse=True)
  rho_order_list = list(rho_order)
  P = Profiler("4. delta")
  self.delta = delta = R.get_delta(rho_order=rho_order, delta_i_max=delta_i_max)
  P = Profiler("5. find cluster maxima")
  #---- Now hunting for clusters
  cluster_id = flex.int(NN, -1)  # default -1 means no cluster
  delta_order = flex.sort_permutation(delta, reverse=True)
  N_CLUST = 10  # maximum of 10 points to be considered as possible clusters
  MAX_PERCENTILE_DELTA = 0.10  # cluster centers have to be in the top 10% of delta
  MAX_PERCENTILE_RHO = 0.75  # cluster centers have to be in the top 75% of rho
  n_cluster = 0
  max_n_delta = min(N_CLUST, int(MAX_PERCENTILE_DELTA * NN))
  for ic in range(max_n_delta):  # test the density, rho
    item_idx = delta_order[ic]
    if delta[item_idx] < 0.25 * delta[delta_order[0]]:  # too low (another heuristic!)
      continue
    item_rho_order = rho_order_list.index(item_idx)
    if item_rho_order / NN < MAX_PERCENTILE_RHO:
      cluster_id[item_idx] = n_cluster
      print(ic, item_idx, item_rho_order, cluster_id[item_idx])
      n_cluster += 1
  print("Found %d clusters" % n_cluster)
  for x in range(NN):
    if cluster_id[x] >= 0:
      print("XC", x, cluster_id[x], rho[x], delta[x])
  self.cluster_id_maxima = cluster_id.deep_copy()
  P = Profiler("6. assign all points")
  R.cluster_assignment(rho_order, cluster_id)
  self.cluster_id_full = cluster_id.deep_copy()
  # assign the halos
  P = Profiler("7. assign halos")
  halo = flex.bool(NN, False)
  border = R.get_border(cluster_id=cluster_id)
  for ic in range(n_cluster):  # loop through all border regions; find highest density
    print("cluster", ic, "in border", border.count(True))
    this_border = (cluster_id == ic) & (border == True)
    print(len(this_border), this_border.count(True))
    if this_border.count(True) > 0:
      highest_density = flex.max(rho.select(this_border))
      halo_selection = (rho < highest_density) & (this_border == True)
      if halo_selection.count(True) > 0:
        cluster_id.set_selected(halo_selection, -1)
      core_selection = (cluster_id == ic) & ~halo_selection
      highest_density = flex.max(rho.select(core_selection))
      too_sparse = core_selection & (rho.as_double() < highest_density / 10.)  # another heuristic
      if too_sparse.count(True) > 0:
        cluster_id.set_selected(too_sparse, -1)
  self.cluster_id_final = cluster_id.deep_copy()
  print("%d in the excluded halo" % ((cluster_id == -1).count(True)))
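
# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only, not part of the original source).
# It exercises the same Rodriguez-Laio rho/delta calls used in __init__ above
# on a toy 4-point distance matrix, assuming scitbx.array_family.flex and
# xfel.clustering are importable in this environment. The function name and
# toy values are hypothetical.
def _demo_rho_delta_on_toy_matrix():
  from scitbx.array_family import flex
  from xfel.clustering import Rodriguez_Laio_clustering_2014
  NN = 4
  Dij = flex.double(flex.grid(NN, NN), 0.0)
  # two tight pairs (0,1) and (2,3), far from each other: expect two clusters
  for i, j, d in [(0, 1, 0.1), (0, 2, 0.9), (0, 3, 1.0),
                  (1, 2, 0.8), (1, 3, 0.9), (2, 3, 0.1)]:
    Dij[i, j] = d
    Dij[j, i] = d  # keep the matrix symmetric
  R = Rodriguez_Laio_clustering_2014(distance_matrix=Dij, d_c=0.5)
  rho = R.get_rho()
  rho_order = flex.sort_permutation(rho, reverse=True)
  i_max = flex.max_index(rho)
  delta_i_max = flex.max(flex.double([Dij[i_max, j] for j in range(NN)]))
  delta = R.get_delta(rho_order=rho_order, delta_i_max=delta_i_max)
  # cluster-center candidates are the points with both high rho and high delta
  print(list(rho), list(delta))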
def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  mandatory = ["ORI", "MILLER", "BEAM", "WAVE", "ICALCVEC", "IOBSVEC"]
  for key in mandatory:
    getattr(self, key)
  self.DSSQ = self.ORI.unit_cell().d_star_sq(self.MILLER)
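
# Hedged sketch (illustration only, not part of the original source) of the
# fail-fast pattern above: probing each mandatory attribute with getattr()
# makes a missing kwarg raise AttributeError at construction time instead of
# deep inside a later calculation. Assumes libtbx.group_args; the field names
# below are illustrative.
def _demo_mandatory_kwargs():
  from libtbx import group_args
  args = group_args(ORI="orientation", MILLER="indices")  # WAVE deliberately omitted
  for key in ["ORI", "MILLER", "WAVE"]:
    try:
      getattr(args, key)
    except AttributeError:
      print("missing mandatory kwarg:", key)  # triggers for WAVE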
def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  print('finished Dij, now calculating rho_i and density')
  from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
  R = RL(distance_matrix=self.Dij, d_c=self.d_c)
  #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
  #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
  if not hasattr(self, 'strategy'):
    self.strategy = 'default'
  self.rho = rho = R.get_rho()
  ave_rho = flex.mean(rho.as_double())
  NN = self.Dij.focus()[0]
  i_max = flex.max_index(rho)
  delta_i_max = flex.max(flex.double([self.Dij[i_max, j] for j in range(NN)]))
  rho_order = flex.sort_permutation(rho, reverse=True)
  rho_order_list = list(rho_order)
  self.delta = delta = R.get_delta(rho_order=rho_order, delta_i_max=delta_i_max)
  cluster_id = flex.int(NN, -1)  # -1 means no cluster
  delta_order = flex.sort_permutation(delta, reverse=True)
  MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
  n_cluster = 0
  #
  #print('Z_DELTA = ', self.Z_delta)

  pick_top_solution = False
  rho_stdev = flex.mean_and_variance(rho.as_double()).unweighted_sample_standard_deviation()
  delta_stdev = flex.mean_and_variance(delta).unweighted_sample_standard_deviation()
  if rho_stdev != 0.0 and delta_stdev != 0.0:
    rho_z = (rho.as_double() - flex.mean(rho.as_double())) / rho_stdev
    delta_z = (delta - flex.mean(delta)) / delta_stdev
  else:
    pick_top_solution = True
    if rho_stdev == 0.0:
      centroids = [flex.first_index(delta, flex.max(delta))]
    elif delta_stdev == 0.0:
      centroids = [flex.first_index(rho, flex.max(rho))]

  significant_delta = []
  significant_rho = []
  # Define the strategy used to decide the cluster centers here. Only one should be true.
  debug_fix_clustering = True
  strategy2 = False  # defaults, so the elif tests below are always defined
  strategy3 = False
  if self.strategy == 'one_cluster':
    debug_fix_clustering = False
    strategy2 = True
  if self.strategy == 'strategy_3':
    debug_fix_clustering = False
    strategy3 = True
    strategy2 = False
  if debug_fix_clustering:
    if not pick_top_solution:
      delta_z_cutoff = min(1.0, max(delta_z))
      rho_z_cutoff = min(1.0, max(rho_z))
      for ic in range(NN):  # test the density & rho
        if delta_z[ic] >= delta_z_cutoff or delta_z[ic] <= -delta_z_cutoff:
          significant_delta.append(ic)
        if rho_z[ic] >= rho_z_cutoff or rho_z[ic] <= -rho_z_cutoff:
          significant_rho.append(ic)
      if True:
        # Use the idea quoted in the Rodriguez-Laio 2014 paper:
        # "Thus, cluster centers are recognized as points for which the value of delta is anomalously large."
        centroid_candidates = list(significant_delta)
        candidate_delta_z = flex.double()
        delta_z_of_rho_order_0 = delta_z[rho_order[0]]  # defined unconditionally; previously only bound if rho_order[0] was a candidate
        for ic in centroid_candidates:
          candidate_delta_z.append(delta_z[ic])
        i_sorted = flex.sort_permutation(candidate_delta_z, reverse=True)
        # Check that, once sorted, the top one is not equal to the 2nd or 3rd position.
        # If there is a tie, assign the centroid to the first one in rho order.
        centroids = []
        # rho_order[0] has to be a centroid
        centroids.append(rho_order[0])
        #centroids.append(centroid_candidates[i_sorted[0]])
        for i in range(len(i_sorted)):
          if centroid_candidates[i_sorted[i]] == rho_order[0]:
            continue
          if delta_z_of_rho_order_0 - candidate_delta_z[i_sorted[i]] > 1.0:
            if i > 1:
              if -candidate_delta_z[i_sorted[i - 1]] + candidate_delta_z[i_sorted[0]] > 1.0:
                centroids.append(centroid_candidates[i_sorted[i]])
            else:
              centroids.append(centroid_candidates[i_sorted[i]])
          else:
            break
      if False:
        centroid_candidates = list(set(significant_delta).intersection(set(significant_rho)))
        # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
        centroids = []
        max_delta_z_candidates = -999.9
        max_rho_z_candidates = -999.9
        for ic in centroid_candidates:
          if delta_z[ic] > max_delta_z_candidates:
            max_delta_z_candidates = delta_z[ic]
          if rho_z[ic] > max_rho_z_candidates:
            max_rho_z_candidates = rho_z[ic]
        for ic in centroid_candidates:
          if max_delta_z_candidates - delta_z[ic] < 1.0 and max_rho_z_candidates - rho_z[ic] < 1.0:
            centroids.append(ic)
    #item_idxs = [delta_order[ic] for ic, centroid in enumerate(centroids)]
    item_idxs = centroids
    for item_idx in item_idxs:
      cluster_id[item_idx] = n_cluster
      print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
      n_cluster += 1
    ####
  elif strategy2:
    # Go through the list of points and find the one with the highest joint rank
    # in both the rho and delta lists. This assigns only one cluster center,
    # based on the highest product of rho and delta.
    product_list_of_ranks = []
    for ic in range(NN):
      rho_tmp = self.rho[ic]
      delta_tmp = self.delta[ic]
      product_list_of_ranks.append(rho_tmp * delta_tmp)
    import numpy as np
    item_idx = np.argmax(product_list_of_ranks)
    cluster_id[item_idx] = n_cluster  # only cluster assigned
    print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
    n_cluster += 1
  elif strategy3:
    # Use the product of delta and rho and pick out the top candidates;
    # a significance z-score filters out the very best.
    product_list_of_ranks = flex.double()
    for ic in range(NN):
      rho_tmp = self.rho[ic]
      delta_tmp = self.delta[ic]
      product_list_of_ranks.append(rho_tmp * delta_tmp)
    import numpy as np
    iid_sorted = flex.sort_permutation(product_list_of_ranks, reverse=True)
    cluster_id[iid_sorted[0]] = n_cluster  # the first point is always a cluster
    n_cluster += 1
    print('CLUSTERING_STATS S3', iid_sorted[0], cluster_id[iid_sorted[0]])
    #product_list_of_ranks[iid_sorted[0]] = 0.0  # set this to 0.0 so that the mean/stdev does not get biased by one point
    vals = product_list_of_ranks.as_numpy_array()  # flex -> numpy for the statistics
    stdev = np.std(vals)
    mean = np.mean(vals)
    n_sorted = 3
    #if stdev == 0.0:
    #  n_sorted = 1
    z_critical = 3.0  # z-score significance threshold for additional centers
    # Only go through, say, 3-4 data points: realistically there won't be
    # more than 2-3 lattices on an image.
    for iid in iid_sorted[1:n_sorted]:
      z_score = (product_list_of_ranks[iid] - mean) / stdev
      if z_score > z_critical:
        cluster_id[iid] = n_cluster
        n_cluster += 1
        print('CLUSTERING_STATS S3', iid, cluster_id[iid])
      else:
        break  # no point going over all points once below the threshold z-score
  else:
    for ic in range(NN):
      item_idx = delta_order[ic]
      if ic != 0:
        if delta[item_idx] <= 0.25 * delta[delta_order[0]]:  # too low to be a medoid
          continue
      item_rho_order = rho_order_list.index(item_idx)
      if item_rho_order / NN < MAX_PERCENTILE_RHO:
        cluster_id[item_idx] = n_cluster
        print('CLUSTERING_STATS', ic, item_idx, item_rho_order, cluster_id[item_idx])
        n_cluster += 1
  ###
  #print('Found %d clusters' % n_cluster)
  for x in range(NN):
    if cluster_id[x] >= 0:
      print("XC", x, cluster_id[x], rho[x], delta[x])
  self.cluster_id_maxima = cluster_id.deep_copy()
  R.cluster_assignment(rho_order, cluster_id, rho)
  self.cluster_id_full = cluster_id.deep_copy()
  # The halo/border exclusion step is disabled in this variant:
  #halo = flex.bool(NN, False)
  #border = R.get_border(cluster_id=cluster_id)
  #for ic in range(n_cluster):  # loop through all border regions; find highest density
  #  this_border = (cluster_id == ic) & (border == True)
  #  if this_border.count(True) > 0:
  #    highest_density = flex.max(rho.select(this_border))
  #    halo_selection = (rho < highest_density) & (this_border == True)
  #    if halo_selection.count(True) > 0:
  #      cluster_id.set_selected(halo_selection, -1)
  #    core_selection = (cluster_id == ic) & ~halo_selection
  #    highest_density = flex.max(rho.select(core_selection))
  #    too_sparse = core_selection & (rho.as_double() < highest_density / 10.)  # another heuristic
  #    if too_sparse.count(True) > 0:
  #      cluster_id.set_selected(too_sparse, -1)
  self.cluster_id_final = cluster_id.deep_copy()
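
# Hedged sketch (illustration only, not part of the original source) of the
# 'strategy_3' rule above, in plain numpy: rank points by the rho*delta
# product, always accept the top point, then accept runners-up only while
# their product stands out beyond z_critical standard deviations. The
# function name and toy values are illustrative.
def _demo_strategy3_rank_product():
  import numpy as np
  rho = np.array([9.0, 8.5, 1.0, 1.2, 7.9])
  delta = np.array([0.9, 0.8, 0.1, 0.1, 0.05])
  product = rho * delta
  order = np.argsort(product)[::-1]  # descending by rho*delta
  centers = [order[0]]               # the top point is always a center
  mean, stdev = product.mean(), product.std()
  z_critical = 3.0
  for iid in order[1:3]:             # realistically <= 2-3 lattices per image
    if (product[iid] - mean) / stdev > z_critical:
      centers.append(iid)
    else:
      break                          # stop once below the threshold z-score
  print("cluster centers:", centers)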
def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  print('finished Dij, now calculating rho_i and density')
  from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
  R = RL(distance_matrix=self.Dij, d_c=self.d_c)
  #from IPython import embed; embed(); exit()
  #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
  #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
  self.rho = rho = R.get_rho()
  ave_rho = flex.mean(rho.as_double())
  NN = self.Dij.focus()[0]
  i_max = flex.max_index(rho)
  delta_i_max = flex.max(flex.double([self.Dij[i_max, j] for j in range(NN)]))
  rho_order = flex.sort_permutation(rho, reverse=True)
  rho_order_list = list(rho_order)
  self.delta = delta = R.get_delta(rho_order=rho_order, delta_i_max=delta_i_max)
  cluster_id = flex.int(NN, -1)  # -1 means no cluster
  delta_order = flex.sort_permutation(delta, reverse=True)
  MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
  n_cluster = 0
  #
  pick_top_solution = False
  rho_stdev = flex.mean_and_variance(rho.as_double()).unweighted_sample_standard_deviation()
  delta_stdev = flex.mean_and_variance(delta).unweighted_sample_standard_deviation()
  if rho_stdev != 0.0 and delta_stdev != 0.0:
    rho_z = (rho.as_double() - flex.mean(rho.as_double())) / rho_stdev
    delta_z = (delta - flex.mean(delta)) / delta_stdev
  else:
    pick_top_solution = True
    if rho_stdev == 0.0:
      centroids = [flex.first_index(delta, flex.max(delta))]
    elif delta_stdev == 0.0:
      centroids = [flex.first_index(rho, flex.max(rho))]
  significant_delta = []
  significant_rho = []
  debug_fix_clustering = True
  if debug_fix_clustering:
    if not pick_top_solution:
      delta_z_cutoff = min(1.0, max(delta_z))
      rho_z_cutoff = min(1.0, max(rho_z))
      for ic in range(NN):  # test the density & rho
        if delta_z[ic] >= delta_z_cutoff:
          significant_delta.append(ic)
        if rho_z[ic] >= rho_z_cutoff:
          significant_rho.append(ic)
      centroid_candidates = list(set(significant_delta).intersection(set(significant_rho)))
      # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
      centroids = []
      max_delta_z_candidates = -999.9
      max_rho_z_candidates = -999.9
      for ic in centroid_candidates:
        if delta_z[ic] > max_delta_z_candidates:
          max_delta_z_candidates = delta_z[ic]
        if rho_z[ic] > max_rho_z_candidates:
          max_rho_z_candidates = rho_z[ic]
      for ic in centroid_candidates:
        if max_delta_z_candidates - delta_z[ic] < 1.0 and max_rho_z_candidates - rho_z[ic] < 1.0:
          centroids.append(ic)
    #item_idxs = [delta_order[ic] for ic, centroid in enumerate(centroids)]  # original: indexed delta_order by list position
    item_idxs = centroids  # the centroid list already holds point indices (matches the fix in the variant above)
    for item_idx in item_idxs:
      cluster_id[item_idx] = n_cluster
      print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
      n_cluster += 1
    ####
  else:
    for ic in range(NN):
      item_idx = delta_order[ic]
      if ic != 0:
        if delta[item_idx] <= 0.25 * delta[delta_order[0]]:  # too low to be a medoid
          continue
      item_rho_order = rho_order_list.index(item_idx)
      if item_rho_order / NN < MAX_PERCENTILE_RHO:
        cluster_id[item_idx] = n_cluster
        print('CLUSTERING_STATS', ic, item_idx, item_rho_order, cluster_id[item_idx])
        n_cluster += 1
  ###
  #print('Found %d clusters' % n_cluster)
  for x in range(NN):
    if cluster_id[x] >= 0:
      print("XC", x, cluster_id[x], rho[x], delta[x])
  self.cluster_id_maxima = cluster_id.deep_copy()
  R.cluster_assignment(rho_order, cluster_id)
  self.cluster_id_full = cluster_id.deep_copy()
  # The halo/border exclusion step is disabled in this variant:
  #halo = flex.bool(NN, False)
  #border = R.get_border(cluster_id=cluster_id)
  #for ic in range(n_cluster):  # loop through all border regions; find highest density
  #  this_border = (cluster_id == ic) & (border == True)
  #  if this_border.count(True) > 0:
  #    highest_density = flex.max(rho.select(this_border))
  #    halo_selection = (rho < highest_density) & (this_border == True)
  #    if halo_selection.count(True) > 0:
  #      cluster_id.set_selected(halo_selection, -1)
  #    core_selection = (cluster_id == ic) & ~halo_selection
  #    highest_density = flex.max(rho.select(core_selection))
  #    too_sparse = core_selection & (rho.as_double() < highest_density / 10.)  # another heuristic
  #    if too_sparse.count(True) > 0:
  #      cluster_id.set_selected(too_sparse, -1)
  self.cluster_id_final = cluster_id.deep_copy()
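
# Hedged sketch (illustration only, not part of the original source) of the
# flex boolean-mask algebra that the commented-out halo step relies on:
# combine masks with & and ~, count and select with them, and overwrite
# flagged entries via set_selected. Assumes scitbx.array_family.flex; the
# toy values are illustrative.
def _demo_flex_mask_algebra():
  from scitbx.array_family import flex
  rho = flex.double([5.0, 4.0, 0.3, 2.0])
  cluster_id = flex.int([0, 0, 0, 1])
  in_cluster0 = (cluster_id == 0)
  highest = flex.max(rho.select(in_cluster0))      # densest member: 5.0
  too_sparse = in_cluster0 & (rho < highest / 10.)  # same heuristic as above
  print(too_sparse.count(True))                     # -> 1 (the 0.3 entry)
  cluster_id.set_selected(too_sparse, -1)           # demote to "no cluster"
  print(list(cluster_id))                           # -> [0, 0, -1, 1]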
def __init__(self, **kwargs):
  group_args.__init__(self, **kwargs)
  # require Dij, d_c
  P = Profiler("2. calculate rho density")
  print("finished Dij, now calculating rho_i, the density")
  from xfel.clustering import Rodriguez_Laio_clustering_2014
  # alternative clustering algorithms: see http://scikit-learn.org/stable/modules/clustering.html
  # also see https://cran.r-project.org/web/packages/dbscan/vignettes/hdbscan.html
  # see also https://en.wikipedia.org/wiki/Hausdorff_dimension
  R = Rodriguez_Laio_clustering_2014(distance_matrix=self.Dij, d_c=self.d_c)
  self.rho = rho = R.get_rho()
  ave_rho = flex.mean(rho.as_double())
  NN = self.Dij.focus()[0]
  print("The average rho_i is %5.2f, or %4.1f%%" % (ave_rho, 100 * ave_rho / NN))
  i_max = flex.max_index(rho)
  P = Profiler("3. transition")
  print("the index with the highest density is %d" % (i_max))
  delta_i_max = flex.max(flex.double([self.Dij[i_max, j] for j in range(NN)]))
  print("delta_i_max", delta_i_max)
  rho_order = flex.sort_permutation(rho, reverse=True)
  rho_order_list = list(rho_order)
  P = Profiler("4. delta")
  self.delta = delta = R.get_delta(rho_order=rho_order, delta_i_max=delta_i_max)
  P = Profiler("5. find cluster maxima")
  #---- Now hunting for clusters --- lots of room for improvement (or simplification) here!
  cluster_id = flex.int(NN, -1)  # default -1 means no cluster
  delta_order = flex.sort_permutation(delta, reverse=True)
  N_CLUST = 10  # maximum of 10 points to be considered as possible clusters
  #MAX_PERCENTILE_DELTA = 0.99  # cluster centers have to be in the top percentile of delta
  MAX_PERCENTILE_RHO = 0.99  # cluster centers have to be in the top percentile of rho
  n_cluster = 0
  #max_n_delta = min(N_CLUST, int(MAX_PERCENTILE_DELTA * NN))
  for ic in range(NN):  # test the density, rho
    item_idx = delta_order[ic]
    if delta[item_idx] > 100:
      print("A: iteration", ic, "delta", delta[item_idx], delta[item_idx] < 0.25 * delta[delta_order[0]])
    if delta[item_idx] < 0.25 * delta[delta_order[0]]:  # too low (another heuristic!)
      continue
    item_rho_order = rho_order_list.index(item_idx)
    if delta[item_idx] > 100:
      print("B: iteration", ic, item_rho_order, item_rho_order / NN, MAX_PERCENTILE_RHO)
    # note: this must be true (float) division; under the original Python 2
    # integer division the test was always satisfied
    if item_rho_order / NN < MAX_PERCENTILE_RHO:
      cluster_id[item_idx] = n_cluster
      print(ic, item_idx, item_rho_order, cluster_id[item_idx])
      n_cluster += 1
  print("Found %d clusters" % n_cluster)
  for x in range(NN):
    if cluster_id[x] >= 0:
      print("XC", x, cluster_id[x], rho[x], delta[x])
  self.cluster_id_maxima = cluster_id.deep_copy()
  P = Profiler("6. assign all points")
  R.cluster_assignment(rho_order, cluster_id)
  self.cluster_id_full = cluster_id.deep_copy()
  # assign the halos
  P = Profiler("7. assign halos")
  halo = flex.bool(NN, False)
  border = R.get_border(cluster_id=cluster_id)
  for ic in range(n_cluster):  # loop through all border regions; find highest density
    print("cluster", ic, "in border", border.count(True))
    this_border = (cluster_id == ic) & (border == True)
    print(len(this_border), this_border.count(True))
    if this_border.count(True) > 0:
      highest_density = flex.max(rho.select(this_border))
      halo_selection = (rho < highest_density) & (this_border == True)
      if halo_selection.count(True) > 0:
        cluster_id.set_selected(halo_selection, -1)
      core_selection = (cluster_id == ic) & ~halo_selection
      highest_density = flex.max(rho.select(core_selection))
      too_sparse = core_selection & (rho.as_double() < highest_density / 10.)  # another heuristic
      if too_sparse.count(True) > 0:
        cluster_id.set_selected(too_sparse, -1)
  self.cluster_id_final = cluster_id.deep_copy()
  print("%d in the excluded halo" % ((cluster_id == -1).count(True)))
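
# Hedged sketch (illustration only, not part of the original source) of the
# two heuristics applied by the center-hunting loop above, in plain numpy:
# reject a candidate whose delta falls below a quarter of the largest delta,
# then require its rho rank to sit inside the top MAX_PERCENTILE_RHO fraction.
# The function name and toy values are illustrative.
def _demo_center_heuristics():
  import numpy as np
  rho = np.array([9.0, 7.0, 2.0, 1.0])
  delta = np.array([0.9, 0.2, 0.8, 0.1])
  NN = len(rho)
  MAX_PERCENTILE_RHO = 0.75
  delta_order = np.argsort(delta)[::-1]   # candidates in descending delta
  rho_order = list(np.argsort(rho)[::-1])  # rank 0 = highest density
  centers = []
  for item_idx in delta_order:
    if delta[item_idx] < 0.25 * delta[delta_order[0]]:
      continue                             # delta too low (heuristic 1)
    if rho_order.index(item_idx) / NN < MAX_PERCENTILE_RHO:
      centers.append(item_idx)             # high delta and high rho rank
  print("centers:", centers)               # -> points 0 and 2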