def get_group_symmetry_reference_matched(self, ref_cs):
        """Return the 1-based number of the group closest to a reference cell.

        For every entry of ``self.groups`` the cell returned by
        ``self._average_p1_cell(keys)`` is Niggli-reduced, converted with
        ``xtal.v6cell`` and compared by ``NCDist`` against the Niggli-reduced
        cell of ``ref_cs``.  Each distance is printed as it is computed.

        :param ref_cs: reference object providing ``niggli_cell().unit_cell()``
        :return: 1-based position in ``self.groups`` of the first group with
                 the smallest NCDist to the reference
        :raises ValueError: if ``self.groups`` is empty (``min`` of an empty
                            list)
        """
        ref_v6 = xtal.v6cell(ref_cs.niggli_cell().unit_cell())
        ncdists = []
        for group_no, keys in enumerate(self.groups, 1):
            averaged = uctbx.unit_cell(self._average_p1_cell(keys))
            ncdists.append(NCDist(xtal.v6cell(averaged.niggli_cell()), ref_v6))
            # A single parenthesised argument prints identically whether this
            # is parsed as a Python 2 print statement or a Python 3 call.
            print("Group %d: NCDist to reference: %f" % (group_no, ncdists[-1]))

        # list.index() returns the first occurrence, so ties resolve to the
        # earliest group.
        return ncdists.index(min(ncdists)) + 1
Exemplo n.º 2
0
 def distance_from(self, other_uc):
     """
     Compute the NCDist metric (Andrews and Bernstein, J. Appl. Cryst. 2014)
     between this frame's unit cell and some other unit cell.

     :param other_uc: a 6-tuple of a, b, c, alpha, beta, gamma for some unit
         cell
     :return: the NCDist in A^2 to other_uc
     """
     from cctbx.uctbx.determine_unit_cell import NCDist
     # Convert this frame's cell first, then the other one, to G6 vectors.
     g6_pair = [self.make_g6(cell) for cell in (self.uc, other_uc)]
     return NCDist(*g6_pair)
Exemplo n.º 3
0
def run_one(path):
    """Check NCDist2017 against NCDist on consecutive cells read from a file.

    Unit cells are produced by ``generate_unit_cells_from_text(path)`` and
    converted to G6 vectors with ``SingleFrame.make_g6``.  For each pair of
    neighbouring vectors the function asserts that the old and new
    implementations agree and that the new one gives the same value with its
    arguments swapped.

    :param path: passed unchanged to ``generate_unit_cells_from_text``
    :raises AssertionError: when any of the comparisons fails
    """
    g6 = [SingleFrame.make_g6(u) for u in generate_unit_cells_from_text(path)]

    # for the purpose of this test, cycle through pairs of g6 vectors;
    # zip() over the shifted list replaces the Python 2-only xrange() loop.
    for ix, (a, b) in enumerate(zip(g6, g6[1:])):
        old = NCDist(a, b)
        new = NCDist2017(a, b)
        com = NCDist2017(b, a)
        assert old == new, "Pair %d: NCDist %r != NCDist2017 %r" % (
            ix, old, new)
        assert new == com, "Pair %d: NCDist2017(a,b) %r != NCDist2017(b,a) %r" % (
            ix, new, com)
def run_one(path):
    """Check NCDist2017 against NCDist on consecutive cells read from a file.

    Unit cells come from ``generate_unit_cells_from_text(path)`` and are
    converted to G6 vectors with ``SingleFrame.make_g6``.  For every pair of
    neighbouring vectors the old and new distances must be equal, and the new
    distance must not depend on argument order.

    The NCDist2017 calls run with OpenMP limited to one thread; afterwards the
    thread count is set back to the value of ``OMP_NUM_THREADS`` (1 when the
    variable is unset), even if a call raises.

    :param path: passed unchanged to ``generate_unit_cells_from_text``
    :raises AssertionError: when any of the comparisons fails
    """
    import os
    import omptbx

    g6 = [SingleFrame.make_g6(u) for u in generate_unit_cells_from_text(path)]

    # workaround allows use of the non-thread-safe distance routine, even if
    # openMP is enabled elsewhere in the Python program
    workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))

    # for the purpose of this test, cycle through pairs of g6 vectors
    for ix, (a, b) in enumerate(zip(g6, g6[1:])):
        old = NCDist(a, b)
        omptbx.omp_set_num_threads(1)
        try:
            new = NCDist2017(a, b)
            com = NCDist2017(b, a)
        finally:
            # Always hand the thread setting back, otherwise a failure here
            # would leave the rest of the process single-threaded.
            omptbx.omp_set_num_threads(workaround_nt)
        assert old == new, "Zeldin, AB2017"
        assert new == com, "Pair %d NCDist(a,b) %f != NCDist(b,a) %f" % (
            ix, new, com)
Exemplo n.º 5
0
    def ab_cluster(self,
                   threshold=10000,
                   method='distance',
                   linkage_method='single',
                   log=False,
                   ax=None,
                   write_file_lists=True,
                   schnell=False,
                   doplot=True,
                   labels='default'):
        """
        Hierarchical clustering using the unit cell dimensions.

        :param threshold: the threshold to use for pruning the tree into
            clusters; also used as the dendrogram colour threshold.
        :param method: the ``criterion`` handed to
            ``scipy.cluster.hierarchy.fcluster`` when cutting the tree.
        :param linkage_method: the ``method`` handed to
            ``scipy.cluster.hierarchy.linkage`` when building the tree.
        :param log: if True, use a symlog scale on the y axis.
        :param ax: if a matplotlib axes object is provided, plot to this.
            Otherwise, create a new figure, save it as
            ``<cname>_dendogram.pdf`` and display it on screen.
        :param write_file_lists: if True, write ``<cluster name>.lst`` for
            every cluster with more than one member.
        :param schnell: if True, use simple Euclidean distance, otherwise use
            the Andrews-Bernstein distance from Andrews & Bernstein
            J Appl Cryst 47:346 (2014).
        :param doplot: Boolean flag for if the plotting should be done at all.
            Runs faster if switched off.
        :param labels: 'default' will not display any labels for more than 100
            images, but will display image names for fewer. True / False force
            names on / off. Any other value is used as an attribute name that
            is looked up on every member (empty string when missing).
        :return: a tuple ``(sub_clusters, ax)``. ``sub_clusters`` is sorted by
            ascending number of members and renamed ``cluster_1``,
            ``cluster_2``, ... in that order. ``ax`` is the axes that was
            plotted to, or the ``ax`` argument unchanged when ``doplot`` is
            false.

        .. note::
          Use 'schnell' option with caution, since it can cause strange
          behaviour around symmetry boundaries.
        """

        logging.info("Hierarchical clustering of unit cells")
        import scipy.spatial.distance as dist
        import scipy.cluster.hierarchy as hcluster

        # 1. Create a numpy array of G6 cells
        g6_cells = np.array(
            [SingleFrame.make_g6(image.uc) for image in self.members])

        # 2. Compute the condensed pairwise distance matrix.
        if schnell:
            logging.info("Using Euclidean distance")
            pair_distances = dist.pdist(g6_cells, metric='euclidean')
        else:
            logging.info(
                "Using Andrews-Bernstein distance from Andrews & Bernstein "
                "J Appl Cryst 47:346 (2014)")
            pair_distances = dist.pdist(g6_cells,
                                        metric=lambda a, b: NCDist(a, b))
        logging.info("Distances have been calculated")

        # linkage() only consults ``metric`` when it is given raw observation
        # vectors; for a condensed distance matrix it is ignored, so it is not
        # passed here.
        this_linkage = hcluster.linkage(pair_distances, method=linkage_method)
        cluster_ids = hcluster.fcluster(this_linkage,
                                        threshold,
                                        criterion=method)
        logging.debug("Clusters have been calculated")

        # 3. Create an array of sub-cluster objects from the clustering
        info_string = ('Made using ab_cluster with t={},'
                       ' {} method, and {} linkage').format(
                           threshold, method, linkage_method)
        sub_clusters = []
        for cluster_id in range(1, max(cluster_ids) + 1):
            in_cluster = [
                member
                for member, member_id in zip(self.members, cluster_ids)
                if member_id == cluster_id
            ]
            sub_clusters.append(
                self.make_sub_cluster(in_cluster,
                                      'cluster_{}'.format(cluster_id),
                                      info_string))

        # Smallest cluster first; the names are then reassigned so that they
        # follow this order.
        sub_clusters.sort(key=lambda c: len(c.members))
        for num, cluster in enumerate(sub_clusters):
            cluster.cname = 'cluster_{}'.format(num + 1)

        # 3.5 optionally write out the clusters to files.
        if write_file_lists:
            for cluster in sub_clusters:
                if len(cluster.members) > 1:
                    cluster.dump_file_list(
                        out_file_name="{}.lst".format(cluster.cname))

        if doplot:
            if labels is True:
                labels = [image.name for image in self.members]
            elif labels is False:
                labels = ['' for _ in self.members]
            elif labels == 'default':
                if len(self.members) > 100:
                    labels = ['' for _ in self.members]
                else:
                    labels = [image.name for image in self.members]
            else:
                labels = [getattr(v, labels, '') for v in self.members]

            # 4. Plot a dendrogram; create a figure only when the caller did
            #    not supply an axes object.
            if ax is None:
                fig = plt.figure("Distance Dendogram")
                ax = fig.gca()
                direct_visualisation = True
            else:
                direct_visualisation = False

            hcluster.dendrogram(this_linkage,
                                labels=labels,
                                leaf_font_size=8,
                                leaf_rotation=90.0,
                                color_threshold=threshold,
                                ax=ax)

            if log:
                # The former ``linthreshx=(-1, 1)`` keyword belongs to the
                # x axis and is not a valid threshold value; matplotlib
                # releases that validate scale keywords reject it. The
                # default linear threshold is used instead.
                ax.set_yscale("symlog")
            else:
                ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])

            if direct_visualisation:
                fig.savefig("{}_dendogram.pdf".format(self.cname))
                plt.show()

        return sub_clusters, ax
Exemplo n.º 6
0
def set_chunk_stats(chunk,
                    stats,
                    stat_choice,
                    n_residues=None,
                    ref_cell=None,
                    space_group=None,
                    d_min=None,
                    ref_data=None):
    """Append one value per statistic for ``chunk`` to the lists in ``stats``.

    Exactly one entry is appended to each of ``stats["reslimit"]``,
    ``["pr"]``, ``["ccref"]``, ``["ioversigma"]``, ``["resnatsnr1"]``,
    ``["abdist"]`` and ``["wilsonb"]``.  Statistics not named in
    ``stat_choice`` (and ``ccref`` when the correlation is not well defined)
    are recorded as NaN so that all lists stay the same length.

    :param chunk: object providing ``res_lim``, ``profile_radius``, ``cell``
                  and ``data_array(space_group, False)``
    :param stats: mapping from statistic name to a list; modified in place
    :param stat_choice: collection of statistic names to compute
    :param n_residues: passed to ``ml_iso_absolute_scaling`` for "wilsonb"
    :param ref_cell: reference cell for "abdist" (converted with ``make_G6``)
    :param space_group: passed to ``chunk.data_array``
    :param d_min: if truthy, intensities are resolution-filtered to this
                  limit before ccref / ioversigma / wilsonb are computed
    :param ref_data: reference array for the "ccref" correlation
    :return: None
    """
    stats["reslimit"].append(
        chunk.res_lim if "reslimit" in stat_choice else float("nan"))
    stats["pr"].append(
        chunk.profile_radius if "pr" in stat_choice else float("nan"))

    # Placeholder; replaced below when a well-defined correlation is found.
    stats["ccref"].append(float("nan"))

    # "wilsonb" is part of this set because it reads ``iobs`` further down;
    # without it, requesting only "wilsonb" hit an unbound local variable.
    if {"ioversigma", "resnatsnr1", "ccref",
            "wilsonb"}.intersection(stat_choice):
        iobs = chunk.data_array(space_group, False)
        iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
            use_internal_variance=False).array()
        binner = iobs.setup_binner(auto_binning=True)

        res = float("nan")
        if "resnatsnr1" in stat_choice:
            # Resolution of the first non-empty bin whose mean I/sigma is
            # at most 1.
            for i_bin in binner.range_used():
                tmp = iobs.select(binner.selection(i_bin))
                if tmp.size() == 0:
                    continue
                if flex.mean(tmp.data() / tmp.sigmas()) <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break
        stats["resnatsnr1"].append(res)

        if d_min:
            iobs = iobs.resolution_filter(d_min=d_min)

        if "ccref" in stat_choice:
            corr = iobs.correlation(ref_data, assert_is_similar_symmetry=False)
            if corr.is_well_defined():
                stats["ccref"][-1] = corr.coefficient()

        if "ioversigma" in stat_choice:
            stats["ioversigma"].append(flex.mean(iobs.data() / iobs.sigmas()))
        else:
            stats["ioversigma"].append(float("nan"))

    else:
        stats["ioversigma"].append(float("nan"))
        stats["resnatsnr1"].append(float("nan"))

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = make_G6(ref_cell), make_G6(chunk.cell)
        stats["abdist"].append(NCDist(G6a, G6b))
    else:
        stats["abdist"].append(float("nan"))

    if "wilsonb" in stat_choice:
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"].append(iso_scale_and_b.b_wilson)
    else:
        stats["wilsonb"].append(float("nan"))
Exemplo n.º 7
0
    def ab_cluster(self,
                   threshold=10000,
                   method='distance',
                   linkage_method='single',
                   log=False,
                   plot=False):
        """ Do basic hierarchical clustering using the Andrews-Bernstein
        distance on the Niggli cells.

        A text summary of the clusters and of the singletons is printed to
        standard output.

        :param threshold: threshold handed to scipy's ``fcluster``; also used
            as the dendrogram colour threshold.
        :param method: the ``criterion`` handed to ``fcluster``.
        :param linkage_method: the ``method`` handed to scipy's ``linkage``.
        :param log: if True (and ``plot`` is set), use a log scale on the
            y axis of the dendrogram.
        :param plot: if True, draw the dendrogram, save it as
            ``<cname>_dendogram.pdf`` and show it.
        :return: list of sub-clusters, in cluster-id order.
        """
        print("Hierarchical clustering of unit cells:")
        import scipy.spatial.distance as dist
        # Imported locally, like ``dist``, so the method does not depend on a
        # module-level ``hcluster`` name being present.
        import scipy.cluster.hierarchy as hcluster

        print(
            "Using Andrews-Bernstein Distance from Andrews & Bernstein J Appl Cryst 47:346 (2014)."
        )

        def make_g6(uc):
            """ Take a reduced Niggli Cell, and turn it into the G6 representation """
            # NOTE(review): math.cos() expects radians. If uc[3:6] hold the
            # cell angles in degrees (the usual unit-cell convention) these
            # terms need math.radians() -- confirm against the caller.
            a = uc[0]**2
            b = uc[1]**2
            c = uc[2]**2
            d = 2 * uc[1] * uc[2] * math.cos(uc[3])
            e = 2 * uc[0] * uc[2] * math.cos(uc[4])
            f = 2 * uc[0] * uc[1] * math.cos(uc[5])
            return [a, b, c, d, e, f]

        # 1. Create a numpy array of G6 cells
        g6_cells = np.array([make_g6(image.uc) for image in self.members])

        # 2. Do hierarchical clustering on the condensed distance matrix.
        pair_distances = dist.pdist(g6_cells, metric=lambda a, b: NCDist(a, b))
        logging.debug("Distances have been calculated")
        # linkage() ignores ``metric`` for a condensed distance matrix, so it
        # is not passed.
        this_linkage = hcluster.linkage(pair_distances, method=linkage_method)
        cluster_ids = hcluster.fcluster(this_linkage,
                                        threshold,
                                        criterion=method)
        logging.debug("Clusters have been calculated")

        # Create an array of sub-cluster objects from the clustering
        info_string = ('Made using ab_cluster with t={},'
                       ' {} method, and {} linkage').format(
                           threshold, method, linkage_method)
        sub_clusters = []
        for cluster_id in range(1, max(cluster_ids) + 1):
            in_cluster = [
                member
                for member, member_id in zip(self.members, cluster_ids)
                if member_id == cluster_id
            ]
            sub_clusters.append(
                self.make_sub_cluster(in_cluster,
                                      'cluster_{}'.format(cluster_id),
                                      info_string))

        # 3. print out some information that is useful.
        cluster_row = ("{:^5} {:^14} {:<5.1f}({:<4.1f}) {:<5.1f}({:<4.1f})"
                       " {:<5.1f}({:<4.1f}) {:<6.2f}({:<4.2f}) {:<6.2f}"
                       "({:<4.2f}) {:<6.2f}({:<4.2f})")
        # Column separators match the singleton header below; the space
        # between the c and alpha columns used to be missing.
        singleton_row = ("{:<14} {:<11.1f} {:<11.1f} {:<11.1f}"
                         " {:<12.1f} {:<12.1f} {:<12.1f}")

        summary = [
            "{} clusters have been identified.".format(max(cluster_ids)),
            "{:^5} {:^14} {:<11} {:<11} {:<11} {:<12} {:<12} {:<12}".format(
                "C_id", "Num in cluster", "Med_a", "Med_b", "Med_c",
                "Med_alpha", "Med_beta", "Med_gamma"),
        ]
        singletons = []
        for cluster in sub_clusters:
            if len(cluster.members) != 1:
                # Most populated point group first.
                sorted_pg_comp = sorted(cluster.pg_composition.items(),
                                        key=lambda x: -1 * x[1])
                pg_strings = [
                    "{} in {}".format(pg[1], pg[0]) for pg in sorted_pg_comp
                ]
                summary.append(
                    cluster_row.format(
                        cluster.cname, len(cluster.members),
                        cluster.medians[0], cluster.stdevs[0],
                        cluster.medians[1], cluster.stdevs[1],
                        cluster.medians[2], cluster.stdevs[2],
                        cluster.medians[3], cluster.stdevs[3],
                        cluster.medians[4], cluster.stdevs[4],
                        cluster.medians[5], cluster.stdevs[5]))
                summary.append(", ".join(pg_strings) + ".")
            else:
                uc = cluster.members[0].uc
                singletons.append(
                    singleton_row.format(
                        list(cluster.pg_composition.keys())[0],
                        uc[0], uc[1], uc[2], uc[3], uc[4], uc[5]))
        summary.append("Standard deviations are in brackets.")
        summary.append(str(len(singletons)) + " singletons:")
        summary.append(
            "{:^14} {:<11} {:<11} {:<11} {:<12} {:<12} {:<12}".format(
                "Point group", "a", "b", "c", "alpha", "beta", "gamma"))
        # Every singleton gets its own line; previously the first one was
        # appended directly onto the header line.
        summary.extend(singletons)
        print("\n".join(summary))

        if plot:
            import matplotlib.pyplot as plt

            fig = plt.figure("Distance Dendogram")
            hcluster.dendrogram(this_linkage,
                                labels=[image.name for image in self.members],
                                leaf_font_size=8,
                                color_threshold=threshold)
            ax = fig.gca()
            if log:
                ax.set_yscale("log")
            else:
                ax.set_ylim(-ax.get_ylim()[1] / 100, ax.get_ylim()[1])
            fig.savefig("{}_dendogram.pdf".format(self.cname))
            plt.show()

        return sub_clusters
def calc_stats(xac_file,
               stat_choice,
               n_residues=None,
               ref_v6cell=None,
               min_peak=None,
               min_peak_percentile=None,
               correct_peak=None):
    """Compute the requested per-file statistics for one XDS_ASCII data set.

    :param xac_file: path of the data file; names ending in ``.pkl`` are
                     unpickled, anything else is read with
                     ``xds_ascii.XDS_ASCII``
    :param stat_choice: collection of statistic names; "ioversigma",
                        "resnatsnr1", "abdist" and "wilsonb" are recognised
    :param n_residues: passed to ``ml_iso_absolute_scaling`` for "wilsonb"
    :param ref_v6cell: reference cell vector used for "abdist"
    :param min_peak: if not None, reflections with ``peak`` below this value
                     are removed
    :param min_peak_percentile: used only when ``min_peak`` is None;
                     reflections with ``peak`` below this percentile are
                     removed
    :param correct_peak: if truthy, reflections with ``peak < 1`` are removed
                     and intensities and sigmas are multiplied by
                     ``peak * 0.01``
    :return: dict with "filename" and "cell" plus one entry per computed
             statistic.  When no reflections remain after merging, only
             "filename" and "cell" are present and nothing is printed.
    """
    # Open XDS_ASCII
    if xac_file.endswith(".pkl"):
        # NOTE(review): unpickling runs code embedded in the file; only use
        # this path with trusted .pkl inputs.
        with open(xac_file, "rb") as pkl_in:
            xac = pickle.load(pkl_in)
    else:
        xac = xds_ascii.XDS_ASCII(xac_file)

    sel_remove = flex.bool(xac.iobs.size(), False)
    if min_peak is not None:
        sel_remove |= xac.peak < min_peak
    elif min_peak_percentile is not None:
        q = numpy.percentile(xac.peak, min_peak_percentile)
        print("percentile %.2f %s" % (q, xac))
        sel_remove |= xac.peak < q

    if correct_peak:
        sel_remove |= (xac.peak < 1)  # remove PEAK==0

    xac.remove_selection(sel_remove)

    # Uses the ``correct_peak`` argument, as the selection above does; this
    # used to read a global ``params`` object instead.
    if correct_peak:
        xac.iobs *= xac.peak * .01
        xac.sigma_iobs *= xac.peak * .01

    iobs = xac.i_obs(anomalous_flag=False)
    iobs = iobs.select(iobs.sigmas() > 0).merge_equivalents(
        use_internal_variance=False).array()

    stats = dict(filename=xac_file, cell=iobs.unit_cell().parameters())

    if iobs.size() == 0:
        return stats

    if "ioversigma" in stat_choice or "resnatsnr1" in stat_choice:
        binner = iobs.setup_binner(auto_binning=True)

        if "ioversigma" in stat_choice:
            stats["ioversigma"] = flex.mean(iobs.data() / iobs.sigmas())

        if "resnatsnr1" in stat_choice:
            # Resolution of the first non-empty bin whose mean I/sigma is
            # at most 1; NaN when no bin qualifies.
            res = float("nan")
            for i_bin in binner.range_used():
                tmp = iobs.select(binner.selection(i_bin))
                if tmp.size() == 0:
                    continue
                if flex.mean(tmp.data() / tmp.sigmas()) <= 1:
                    res = binner.bin_d_range(i_bin)[1]
                    break

            stats["resnatsnr1"] = res

    if "abdist" in stat_choice:
        from cctbx.uctbx.determine_unit_cell import NCDist
        G6a, G6b = ref_v6cell, v6cell(iobs.unit_cell().niggli_cell())
        stats["abdist"] = NCDist(G6a, G6b)

    if "wilsonb" in stat_choice:
        iso_scale_and_b = ml_iso_absolute_scaling(iobs, n_residues, 0)
        stats["wilsonb"] = iso_scale_and_b.b_wilson

    print(stats)
    return stats