Example #1
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        # require Dij, d_c
        P = Profiler("2. calculate rho density")
        print("finished Dij, now calculating rho_i, the density")
        from xfel.clustering import Rodriguez_Laio_clustering_2014
        R = Rodriguez_Laio_clustering_2014(distance_matrix=self.Dij,
                                           d_c=self.d_c)
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        print("The average rho_i is %5.2f, or %4.1f%%" %
              (ave_rho, 100 * ave_rho / NN))
        i_max = flex.max_index(rho)

        P = Profiler("3.transition")
        print("the index with the highest density is %d" % (i_max))
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in range(NN)]))
        print("delta_i_max", delta_i_max)
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)

        P = Profiler("4. delta")
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)

        P = Profiler("5. find cluster maxima")
        #---- Now hunting for clusters
        cluster_id = flex.int(NN, -1)  # default -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        N_CLUST = 10  # consider at most 10 points as possible cluster centers
        MAX_PERCENTILE_DELTA = 0.10  # cluster centers have to be in the top 10% of points by delta
        MAX_PERCENTILE_RHO = 0.75  # cluster centers have to be in the top 75% of points by rho
        n_cluster = 0
        max_n_delta = min(N_CLUST, int(MAX_PERCENTILE_DELTA * NN))
        for ic in range(max_n_delta):
            # test the density, rho
            item_idx = delta_order[ic]
            if delta[item_idx] < 0.25 * delta[delta_order[0]]:
                # too low (another heuristic!)
                continue
            item_rho_order = rho_order_list.index(item_idx)
            if item_rho_order / NN < MAX_PERCENTILE_RHO:
                cluster_id[item_idx] = n_cluster
                print(ic, item_idx, item_rho_order, cluster_id[item_idx])
                n_cluster += 1
        print("Found %d clusters" % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()

        P = Profiler("6. assign all points")
        R.cluster_assignment(rho_order, cluster_id)

        self.cluster_id_full = cluster_id.deep_copy()

        # assign the halos
        P = Profiler("7. assign halos")
        halo = flex.bool(NN, False)
        border = R.get_border(cluster_id=cluster_id)

        for ic in range(n_cluster):  # loop through all border regions; find highest density
            print("cluster", ic, "in border", border.count(True))
            this_border = (cluster_id == ic) & (border == True)
            print(len(this_border), this_border.count(True))
            if this_border.count(True) > 0:
                highest_density = flex.max(rho.select(this_border))
                halo_selection = (rho < highest_density) & (this_border == True)
                if halo_selection.count(True) > 0:
                    cluster_id.set_selected(halo_selection, -1)
                core_selection = (cluster_id == ic) & ~halo_selection
                highest_density = flex.max(rho.select(core_selection))
                too_sparse = core_selection & (
                    rho.as_double() < highest_density / 10.)  # another heuristic
                if too_sparse.count(True) > 0:
                    cluster_id.set_selected(too_sparse, -1)
        self.cluster_id_final = cluster_id.deep_copy()
        print("%d in the excluded halo" % ((cluster_id == -1).count(True)))
Example #2

    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        mandatory = ["ORI", "MILLER", "BEAM", "WAVE", "ICALCVEC", "IOBSVEC"]
        for key in mandatory:
            getattr(self, key)  # fail fast: AttributeError if a mandatory kwarg is missing
        self.DSSQ = self.ORI.unit_cell().d_star_sq(self.MILLER)
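
This snippet leans on libtbx's group_args, which stores its keyword arguments as attributes, so the loop over mandatory keys fails fast with an AttributeError if a required argument was omitted. A minimal sketch of the pattern follows; the group_args body here is a simplification and the scorer subclass is hypothetical, not part of the library.

class group_args(object):
    # simplified stand-in: keep every keyword argument as an attribute
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

class scorer(group_args):
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        for key in ["ORI", "MILLER"]:
            getattr(self, key)  # raises AttributeError if the kwarg is missing

scorer(ORI="orientation", MILLER="indices")  # passes the check
# scorer(ORI="orientation")  # AttributeError: 'scorer' object has no attribute 'MILLER'
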
Example #4
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        print('finished Dij, now calculating rho_i and density')
        from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
        R = RL(distance_matrix=self.Dij, d_c=self.d_c)
        #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
        if not hasattr(self, 'strategy'):
            self.strategy = 'default'
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        i_max = flex.max_index(rho)
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in range(NN)]))
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)
        cluster_id = flex.int(NN, -1)  # -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
        n_cluster = 0
        print('Z_DELTA = ', self.Z_delta)

        pick_top_solution = False
        rho_stdev = flex.mean_and_variance(
            rho.as_double()).unweighted_sample_standard_deviation()
        delta_stdev = flex.mean_and_variance(
            delta).unweighted_sample_standard_deviation()
        if rho_stdev != 0.0 and delta_stdev != 0.0:
            rho_z = (rho.as_double() - flex.mean(rho.as_double())) / rho_stdev
            delta_z = (delta - flex.mean(delta)) / delta_stdev
        else:
            pick_top_solution = True
            if rho_stdev == 0.0:
                centroids = [flex.first_index(delta, flex.max(delta))]
            elif delta_stdev == 0.0:
                centroids = [flex.first_index(rho, flex.max(rho))]

        significant_delta = []
        significant_rho = []
        # Decide the strategy for picking cluster centers; exactly one flag should be true.
        # Initialize all flags so the branches below never see an undefined name.
        debug_fix_clustering = True
        strategy2 = False
        strategy3 = False
        if self.strategy == 'one_cluster':
            debug_fix_clustering = False
            strategy2 = True
        if self.strategy == 'strategy_3':
            debug_fix_clustering = False
            strategy3 = True

        if debug_fix_clustering:
            if not pick_top_solution:
                delta_z_cutoff = min(1.0, max(delta_z))
                rho_z_cutoff = min(1.0, max(rho_z))
                for ic in range(NN):
                    # test the density & rho
                    if delta_z[ic] >= delta_z_cutoff or delta_z[ic] <= -delta_z_cutoff:
                        significant_delta.append(ic)
                    if rho_z[ic] >= rho_z_cutoff or rho_z[ic] <= -rho_z_cutoff:
                        significant_rho.append(ic)
                if True:
                    # Use idea quoted in Rodriguez Laio 2014 paper
                    # " Thus, cluster centers are recognized as points for which the value of delta is anomalously large."
                    centroid_candidates = list(significant_delta)
                    # reference value: delta_z of the highest-density point, rho_order[0];
                    # computed directly so it is defined even when rho_order[0] is not a candidate
                    delta_z_of_rho_order_0 = delta_z[rho_order[0]]
                    candidate_delta_z = flex.double()
                    for ic in centroid_candidates:
                        candidate_delta_z.append(delta_z[ic])
                    i_sorted = flex.sort_permutation(candidate_delta_z,
                                                     reverse=True)
                    # Check that once sorted the top one is not equal to the 2nd or 3rd position
                    # If there is a tie, assign centroid to the first one in rho order
                    centroids = []
                    # rho_order[0] has to be a centroid
                    centroids.append(rho_order[0])

                    #centroids.append(centroid_candidates[i_sorted[0]])
                    for i in range(len(i_sorted)):
                        if centroid_candidates[i_sorted[i]] == rho_order[0]:
                            continue
                        if delta_z_of_rho_order_0 - candidate_delta_z[i_sorted[i]] > 1.0:
                            if i > 1:
                                if candidate_delta_z[i_sorted[0]] - candidate_delta_z[i_sorted[i - 1]] > 1.0:
                                    centroids.append(centroid_candidates[i_sorted[i]])
                            else:
                                centroids.append(centroid_candidates[i_sorted[i]])
                        else:
                            break
                if False:
                    # disabled alternative: require candidates to be significant in both delta and rho
                    centroid_candidates = list(
                        set(significant_delta).intersection(set(significant_rho)))
                    # compare against the max delta_z and max rho_z; keep those within 1 stdev of both
                    centroids = []
                    max_delta_z_candidates = -999.9
                    max_rho_z_candidates = -999.9
                    for ic in centroid_candidates:
                        if delta_z[ic] > max_delta_z_candidates:
                            max_delta_z_candidates = delta_z[ic]
                        if rho_z[ic] > max_rho_z_candidates:
                            max_rho_z_candidates = rho_z[ic]
                    for ic in centroid_candidates:
                        if max_delta_z_candidates - delta_z[ic] < 1.0 and max_rho_z_candidates - rho_z[ic] < 1.0:
                            centroids.append(ic)

            #item_idxs = [delta_order[ic] for ic,centroid in enumerate(centroids)]
            item_idxs = centroids
            for item_idx in item_idxs:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
                n_cluster += 1
                ####
        elif strategy2:
            # Pick the point with the highest product of rho and delta values.
            # This assigns exactly one cluster center.
            product_list_of_ranks = []
            for ic in range(NN):
                rho_tmp = self.rho[ic]
                delta_tmp = self.delta[ic]
                product_list_of_ranks.append(rho_tmp * delta_tmp)
            import numpy as np
            item_idx = np.argmax(product_list_of_ranks)
            cluster_id[item_idx] = n_cluster  # Only cluster assigned
            print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
            n_cluster += 1
        elif strategy3:
            # use product of delta and rho and pick out top candidates
            # have to use a significance z_score to filter out the very best
            product_list_of_ranks = flex.double()
            for ic in range(NN):
                rho_tmp = self.rho[ic]
                delta_tmp = self.delta[ic]
                product_list_of_ranks.append(rho_tmp * delta_tmp)
            import numpy as np
            iid_sorted = flex.sort_permutation(product_list_of_ranks,
                                               reverse=True)
            cluster_id[iid_sorted[0]] = n_cluster  # the first point is always a cluster center
            n_cluster += 1
            print('CLUSTERING_STATS S3', iid_sorted[0],
                  cluster_id[iid_sorted[0]])
            #product_list_of_ranks[iid_sorted[0]]=0.0 # set this to 0.0 so that the mean/stdev does not get biased by one point
            stdev = np.std(product_list_of_ranks)
            mean = np.mean(product_list_of_ranks)
            n_sorted = 3
            #if stdev == 0.0:
            #  n_sorted=1

            z_critical = 3.0  # 3 sigma significance cutoff
            # Only check the next few candidates; realistically there won't be
            # more than 2-3 lattices on an image.
            for iid in iid_sorted[1:n_sorted]:
                z_score = (product_list_of_ranks[iid] - mean) / stdev
                if z_score > z_critical:
                    cluster_id[iid] = n_cluster
                    n_cluster += 1
                    print('CLUSTERING_STATS S3', iid, cluster_id[iid])
                else:
                    break  # No point going over all points once below threshold z_score

        else:
            for ic in range(NN):
                item_idx = delta_order[ic]
                if ic != 0:
                    if delta[item_idx] <= 0.25 * delta[delta_order[0]]:
                        # too low to be a medoid
                        continue
                item_rho_order = rho_order_list.index(item_idx)
                if (item_rho_order) / NN < MAX_PERCENTILE_RHO:
                    cluster_id[item_idx] = n_cluster
                    print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                          cluster_id[item_idx])
                    n_cluster += 1
        print('Found %d clusters' % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()
        R.cluster_assignment(rho_order, cluster_id, rho)
        self.cluster_id_full = cluster_id.deep_copy()

        #halo = flex.bool(NN,False)
        #border = R.get_border( cluster_id = cluster_id )

        #for ic in range(n_cluster): #loop thru all border regions; find highest density
        #  this_border = (cluster_id == ic) & (border==True)
        #  if this_border.count(True)>0:
        #    highest_density = flex.max(rho.select(this_border))
        #    halo_selection = (rho < highest_density) & (this_border==True)
        #    if halo_selection.count(True)>0:
        #      cluster_id.set_selected(halo_selection,-1)
        #    core_selection = (cluster_id == ic) & ~halo_selection
        #    highest_density = flex.max(rho.select(core_selection))
        #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
        #    if too_sparse.count(True)>0:
        #      cluster_id.set_selected(too_sparse,-1)
        self.cluster_id_final = cluster_id.deep_copy()
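
The strategy_3 branch in Example #4 ranks every point by the product rho_i * delta_i and keeps accepting further cluster centers only while their z-score against the distribution of products stays above a cutoff. A self-contained sketch of that selection rule (NumPy; the function name and the zero-stdev guard are illustrative additions):

import numpy as np

def pick_centroids_by_product(rho, delta, z_critical=3.0, n_check=3):
    # rank points by the product of density and separation
    product = np.asarray(rho, dtype=float) * np.asarray(delta, dtype=float)
    order = np.argsort(-product)
    centroids = [order[0]]  # the top-ranked point is always a center
    mean, stdev = product.mean(), product.std()
    if stdev == 0.0:
        return centroids  # degenerate distribution: keep only the top point
    for iid in order[1:n_check]:
        if (product[iid] - mean) / stdev <= z_critical:
            break  # products are sorted, so no later point can pass either
        centroids.append(iid)
    return centroids
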
Example #5

    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        print('finished Dij, now calculating rho_i and density')
        from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
        R = RL(distance_matrix=self.Dij, d_c=self.d_c)
        #from IPython import embed; embed(); exit()
        #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        i_max = flex.max_index(rho)
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in range(NN)]))
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)
        cluster_id = flex.int(NN, -1)  # -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
        n_cluster = 0
        #
        pick_top_solution = False
        rho_stdev = flex.mean_and_variance(
            rho.as_double()).unweighted_sample_standard_deviation()
        delta_stdev = flex.mean_and_variance(
            delta).unweighted_sample_standard_deviation()
        if rho_stdev != 0.0 and delta_stdev != 0.0:
            rho_z = (rho.as_double() - flex.mean(rho.as_double())) / rho_stdev
            delta_z = (delta - flex.mean(delta)) / delta_stdev
        else:
            pick_top_solution = True
            if rho_stdev == 0.0:
                centroids = [flex.first_index(delta, flex.max(delta))]
            elif delta_stdev == 0.0:
                centroids = [flex.first_index(rho, flex.max(rho))]

        significant_delta = []
        significant_rho = []
        debug_fix_clustering = True
        if debug_fix_clustering:
            if not pick_top_solution:
                delta_z_cutoff = min(1.0, max(delta_z))
                rho_z_cutoff = min(1.0, max(rho_z))
                for ic in range(NN):
                    # test the density & rho
                    if delta_z[ic] >= delta_z_cutoff:
                        significant_delta.append(ic)
                    if rho_z[ic] >= rho_z_cutoff:
                        significant_rho.append(ic)
                centroid_candidates = list(
                    set(significant_delta).intersection(set(significant_rho)))
                # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
                centroids = []
                max_delta_z_candidates = -999.9
                max_rho_z_candidates = -999.9
                for ic in centroid_candidates:
                    if delta_z[ic] > max_delta_z_candidates:
                        max_delta_z_candidates = delta_z[ic]
                    if rho_z[ic] > max_rho_z_candidates:
                        max_rho_z_candidates = rho_z[ic]
                for ic in centroid_candidates:
                    if max_delta_z_candidates - delta_z[ic] < 1.0 and max_rho_z_candidates - rho_z[ic] < 1.0:
                        centroids.append(ic)

            # Use the centroid indices directly; indexing delta_order by the
            # enumeration position (as this revision originally did) picks the
            # wrong points — note the loop variable `centroid` was never used.
            item_idxs = centroids
            for item_idx in item_idxs:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
                n_cluster += 1
                ####
        else:
            for ic in range(NN):
                item_idx = delta_order[ic]
                if ic != 0:
                    if delta[item_idx] <= 0.25 * delta[delta_order[0]]:
                        # too low to be a medoid
                        continue
                item_rho_order = rho_order_list.index(item_idx)
                if (item_rho_order) / NN < MAX_PERCENTILE_RHO:
                    cluster_id[item_idx] = n_cluster
                    print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                          cluster_id[item_idx])
                    n_cluster += 1
        print('Found %d clusters' % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()
        R.cluster_assignment(rho_order, cluster_id)
        self.cluster_id_full = cluster_id.deep_copy()

        #halo = flex.bool(NN,False)
        #border = R.get_border( cluster_id = cluster_id )

        #for ic in range(n_cluster): #loop thru all border regions; find highest density
        #  this_border = (cluster_id == ic) & (border==True)
        #  if this_border.count(True)>0:
        #    highest_density = flex.max(rho.select(this_border))
        #    halo_selection = (rho < highest_density) & (this_border==True)
        #    if halo_selection.count(True)>0:
        #      cluster_id.set_selected(halo_selection,-1)
        #    core_selection = (cluster_id == ic) & ~halo_selection
        #    highest_density = flex.max(rho.select(core_selection))
        #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
        #    if too_sparse.count(True)>0:
        #      cluster_id.set_selected(too_sparse,-1)
        self.cluster_id_final = cluster_id.deep_copy()
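
Example #5's centroid search z-scores both rho and delta, flags points that clear per-measure cutoffs, intersects the two sets, and then keeps only candidates whose scores are within one standard deviation of the best candidate on both axes. The same selection in compact NumPy form (illustrative; assumes both distributions have nonzero spread):

import numpy as np

def centroid_candidates(rho, delta, cap=1.0, window=1.0):
    rho = np.asarray(rho, dtype=float)
    delta = np.asarray(delta, dtype=float)
    rho_z = (rho - rho.mean()) / rho.std()
    delta_z = (delta - delta.mean()) / delta.std()
    # flag points above the (capped) per-measure cutoffs, then intersect
    mask = (delta_z >= min(cap, delta_z.max())) & (rho_z >= min(cap, rho_z.max()))
    cand = np.flatnonzero(mask)
    # keep candidates within `window` of the best candidate on both measures
    return [ic for ic in cand
            if delta_z[cand].max() - delta_z[ic] < window
            and rho_z[cand].max() - rho_z[ic] < window]
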
Example #6
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        # require Dij, d_c
        P = Profiler("2. calculate rho density")
        print "finished Dij, now calculating rho_i, the density"
        from xfel.clustering import Rodriguez_Laio_clustering_2014
        # alternative clustering algorithms: see http://scikit-learn.org/stable/modules/clustering.html
        # also see https://cran.r-project.org/web/packages/dbscan/vignettes/hdbscan.html
        # see also https://en.wikipedia.org/wiki/Hausdorff_dimension

        R = Rodriguez_Laio_clustering_2014(distance_matrix=self.Dij,
                                           d_c=self.d_c)
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        print "The average rho_i is %5.2f, or %4.1f%%" % (ave_rho,
                                                          100 * ave_rho / NN)
        i_max = flex.max_index(rho)

        P = Profiler("3.transition")
        print "the index with the highest density is %d" % (i_max)
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in xrange(NN)]))
        print "delta_i_max", delta_i_max
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)

        P = Profiler("4. delta")
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)

        P = Profiler("5. find cluster maxima")
        #---- Now hunting for clusters; lots of room for improvement (or simplification) here
        cluster_id = flex.int(NN, -1)  # default -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        N_CLUST = 10  # maximum of 10 points to be considered as possible clusters
        #MAX_PERCENTILE_DELTA = 0.99  # cluster centers would have to be in the top 99% of points by delta
        MAX_PERCENTILE_RHO = 0.99  # cluster centers have to be in the top 99% of points by rho
        n_cluster = 0
        #max_n_delta = min(N_CLUST, int(MAX_PERCENTILE_DELTA*NN))
        for ic in range(NN):
            # test the density, rho
            item_idx = delta_order[ic]
            if delta[item_idx] > 100:
                print("A: iteration", ic, "delta", delta[item_idx],
                      delta[item_idx] < 0.25 * delta[delta_order[0]])
            if delta[item_idx] < 0.25 * delta[delta_order[0]]:
                # too low (another heuristic!)
                continue
            item_rho_order = rho_order_list.index(item_idx)
            if delta[item_idx] > 100:
                print("B: iteration", ic, item_rho_order, item_rho_order / NN,
                      MAX_PERCENTILE_RHO)
            # true division is intended here; under the original Python 2
            # integer division this ratio was 0 for every point
            if item_rho_order / NN < MAX_PERCENTILE_RHO:
                cluster_id[item_idx] = n_cluster
                print(ic, item_idx, item_rho_order, cluster_id[item_idx])
                n_cluster += 1
        print("Found %d clusters" % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()

        P = Profiler("6. assign all points")
        R.cluster_assignment(rho_order, cluster_id)

        self.cluster_id_full = cluster_id.deep_copy()

        # assign the halos
        P = Profiler("7. assign halos")
        halo = flex.bool(NN, False)
        border = R.get_border(cluster_id=cluster_id)

        for ic in range(n_cluster):  # loop through all border regions; find highest density
            print("cluster", ic, "in border", border.count(True))
            this_border = (cluster_id == ic) & (border == True)
            print(len(this_border), this_border.count(True))
            if this_border.count(True) > 0:
                highest_density = flex.max(rho.select(this_border))
                halo_selection = (rho < highest_density) & (this_border == True)
                if halo_selection.count(True) > 0:
                    cluster_id.set_selected(halo_selection, -1)
                core_selection = (cluster_id == ic) & ~halo_selection
                highest_density = flex.max(rho.select(core_selection))
                too_sparse = core_selection & (
                    rho.as_double() < highest_density / 10.)  # another heuristic
                if too_sparse.count(True) > 0:
                    cluster_id.set_selected(too_sparse, -1)
        self.cluster_id_final = cluster_id.deep_copy()
        print "%d in the excluded halo" % ((cluster_id == -1).count(True))