def check_shape(ndim, cutoff, N=10):
    """Check the shapes returned by ``get_graph_segments`` on random data.

    Fits ``MSTClustering(cutoff=cutoff)`` to ``N`` uniformly random points in
    ``ndim`` dimensions and asserts that both the trimmed and the full graph
    come back as ``ndim`` arrays, of shape ``(2, N - 1 - cutoff)`` and
    ``(2, N - 1)`` respectively.
    """
    points = np.random.rand(N, ndim)
    model = MSTClustering(cutoff=cutoff).fit(points)

    # Trimmed graph: one array per dimension, `cutoff` fewer columns.
    trimmed = model.get_graph_segments()
    print(ndim, cutoff, trimmed[0].shape)
    assert len(trimmed) == ndim
    trimmed_shape = (2, N - 1 - cutoff)
    for seg in trimmed:
        assert seg.shape == trimmed_shape

    # Full graph: one array per dimension, N - 1 columns.
    full = model.get_graph_segments(full_graph=True)
    print(full[0].shape)
    assert len(full) == ndim
    full_shape = (2, N - 1)
    for seg in full:
        assert seg.shape == full_shape
示例#2
0
    def check_shape(ndim, cutoff, N=10):
        """Assert the segment-array shapes for a model fitted to random data.

        ``N`` random points in ``ndim`` dimensions are clustered with
        ``MSTClustering(cutoff=cutoff)``; the default and the
        ``full_graph=True`` segment lists must each hold ``ndim`` arrays with
        shapes ``(2, N - 1 - cutoff)`` and ``(2, N - 1)`` respectively.
        """
        data = np.random.rand(N, ndim)
        fitted = MSTClustering(cutoff=cutoff).fit(data)

        clustered = fitted.get_graph_segments()
        print(ndim, cutoff, clustered[0].shape)
        assert len(clustered) == ndim
        want_clustered = (2, N - 1 - cutoff)
        assert all(piece.shape == want_clustered for piece in clustered)

        complete = fitted.get_graph_segments(full_graph=True)
        print(complete[0].shape)
        assert len(complete) == ndim
        want_complete = (2, N - 1)
        assert all(piece.shape == want_complete for piece in complete)
def check_graph_segments_vals():
    """Check the segment coordinates for the 1-D points 0, 1, 4, 9, 16.

    With ``cutoff=0`` the model must return a single (one per dimension)
    array whose first row holds the segment start coordinates and whose
    second row holds the matching end coordinates listed below.
    """
    squares = (np.arange(5) ** 2)[:, None]
    model = MSTClustering(cutoff=0).fit(squares)

    segments = model.get_graph_segments()
    assert len(segments) == 1

    expected_starts = [0, 4, 4, 9]
    expected_ends = [1, 1, 9, 16]
    assert_allclose(segments[0], [expected_starts, expected_ends])
示例#4
0
def check_graph_segments_vals():
    """Verify ``get_graph_segments`` values on five squared integers.

    The column vector ``[0, 1, 4, 9, 16]`` is fitted with ``cutoff=0``; the
    only returned array must match the expected 2 x 4 coordinate table.
    """
    column = np.square(np.arange(5)).reshape(-1, 1)
    fitted = MSTClustering(cutoff=0).fit(column)
    graph = fitted.get_graph_segments()

    # One array per input dimension, and the input is one-dimensional.
    assert len(graph) == 1

    expected = [[0, 4, 4, 9],
                [1, 1, 9, 16]]
    assert_allclose(graph[0], expected)
示例#5
0
def _cluster_elongation_score(X, labels):
    """Score one clustering of ``X``; lower is better.

    For every cluster (label other than ``-1``) the first three columns of
    its members are projected onto their principal axes, and the spread along
    the second axis is divided by the extent along the first. The mean of
    those ratios is divided by the fraction of points that received a label.

    Returns ``np.inf`` when no cluster was found or the score is not finite,
    so that such a parameter combination can never win the grid search.
    """
    ratios = []
    for label in np.unique(labels):
        if label == -1:
            continue
        members = X[labels == label, 0:3]
        projected = PCA(n_components=3).fit(members).transform(members)
        ratios.append(projected[:, 1].std() / np.ptp(projected[:, 0]))

    if not ratios:
        # No clusters at all: the mean below would be NaN (and the labeled
        # fraction zero), which would poison the minimum search.
        return np.inf

    labeled_fraction = np.count_nonzero(labels != -1) / float(len(labels))
    score = np.mean(ratios) / labeled_fraction
    return score if np.isfinite(score) else np.inf


def cluster_positions(positions, plots=False, cutoff_scale_min=120,
                      cutoff_scale_max=350, cutoff_scale_resolution=150,
                      n_neighbors_max=5, min_cluster_size=4):
    """
    Find clusters in the positions produced by
    `~shampoo.track2d.locate_from_hologram` using a Minimum Spanning Tree.

    The clustering parameters are chosen by a grid search over
    ``(cutoff_scale, n_neighbors)``; the combination with the lowest score
    (see ``_cluster_elongation_score``) is used for the returned labels.

    Parameters
    ----------
    positions : `~numpy.ndarray`
        Positions for each detected specimen in each frame, with values
        specified by `~shampoo.track2d.locate_from_hologram`. Only the first
        four columns are used; the input array is not modified.
    plots : bool (optional)
        Plot the scores for the grid search in ``(cutoff_scale, n_neighbors)``
        space and the best clusters
    cutoff_scale_min, cutoff_scale_max : float (optional)
        First and last ``cutoff_scale`` values of the search grid (both
        included).
    cutoff_scale_resolution : int (optional)
        Number of ``cutoff_scale`` values in the search grid.
    n_neighbors_max : int (optional)
        Exclusive upper bound on ``n_neighbors``: the values tried are
        ``1, ..., n_neighbors_max - 1``.
    min_cluster_size : int (optional)
        Passed to ``MSTClustering`` for both the grid search and the final
        fit.
        NOTE(review): the score fits a 3-component PCA per cluster, which
        presumably needs at least three members per cluster -- confirm
        before lowering this below 3.

    Returns
    -------
    labels : `~numpy.ndarray`
        Cluster labels for each position in ``positions``. `-1` represents
        positions without a cluster.

    Raises
    ------
    ValueError
        If the search grid is empty (``cutoff_scale_resolution < 1`` or
        ``n_neighbors_max <= 1``).
    """
    # Work on a float copy so the in-place scaling below is valid even for
    # integer input and never touches the caller's array.
    X = positions[:, 0:4].astype(np.float64)

    # Scale the time and max_intensity dims similarly to the spatial
    # dimensions. A constant column has zero range and is left unscaled
    # instead of being multiplied by inf/NaN.
    spatial_range = np.ptp(positions[:, 1])
    time_range = np.ptp(positions[:, 0])
    intensity_range = np.ptp(positions[:, 3])
    if time_range != 0:
        X[:, 0] *= 5*spatial_range/time_range
    if intensity_range != 0:
        X[:, 3] *= spatial_range/intensity_range

    # Grid search in (cutoff_scales, n_neighbors) for the best clustering params
    cutoff_scales = np.linspace(cutoff_scale_min, cutoff_scale_max,
                                cutoff_scale_resolution)
    n_neighbors = np.arange(1, n_neighbors_max)
    scores = np.zeros((len(cutoff_scales), len(n_neighbors)), dtype=np.float64)
    if scores.size == 0:
        raise ValueError("Empty parameter grid: need "
                         "cutoff_scale_resolution >= 1 and "
                         "n_neighbors_max >= 2")

    for i, cutoff_scale in enumerate(cutoff_scales):
        for j, k_neighbors in enumerate(n_neighbors):
            model = MSTClustering(cutoff_scale=cutoff_scale,
                                  approximate=True, n_neighbors=k_neighbors,
                                  min_cluster_size=min_cluster_size)
            scores[i, j] = _cluster_elongation_score(X, model.fit_predict(X))

    # With the best clustering parameters, label the clusters. argmin returns
    # the first minimum in row-major order.
    best_i, best_j = np.unravel_index(np.argmin(scores), scores.shape)
    n_neighbors_min = n_neighbors[best_j]
    best_cutoff_scale = cutoff_scales[best_i]
    print(n_neighbors_min, best_cutoff_scale)

    # Use the caller's min_cluster_size here too, so the final fit matches
    # the configuration that was actually scored.
    model = MSTClustering(cutoff_scale=best_cutoff_scale, approximate=True,
                          n_neighbors=n_neighbors_min,
                          min_cluster_size=min_cluster_size)
    labels = model.fit_predict(X)

    if plots:
        # Plot the scores in (cutoff_scales, n_neighbors) space
        fig, score_ax = plt.subplots(figsize=(16, 10))
        score_ax.imshow(np.log(scores).T, interpolation='nearest',
                        origin='lower', cmap=plt.cm.viridis)
        score_ax.set_xticks(range(len(cutoff_scales))[::5])
        score_ax.set_xticklabels(["{0:.2f}".format(cutoff_scale)
                                  for cutoff_scale in cutoff_scales[::5]])

        score_ax.set_yticks(range(len(n_neighbors)))
        score_ax.set_yticklabels([str(k) for k in n_neighbors])

        for tick_label in score_ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')

        score_ax.set_xlabel('cutoff')
        score_ax.set_ylabel('n_neighbors')
        score_ax.set_aspect(10)

        # Plot the best clusters in the (t, x), (t, y) and (x, y) planes
        fig, cluster_axes = plt.subplots(1, 3, figsize=(16, 6))

        scatter_kwargs = dict(s=100, alpha=0.6, edgecolor='none',
                              cmap=plt.cm.Spectral, c=labels)
        cluster_axes[0].scatter(X[:, 0], X[:, 1], **scatter_kwargs)
        cluster_axes[1].scatter(X[:, 0], X[:, 2], **scatter_kwargs)
        cluster_axes[2].scatter(X[:, 1], X[:, 2], **scatter_kwargs)

        cluster_axes[0].set(xlabel='t', ylabel='x')
        cluster_axes[1].set(xlabel='t', ylabel='y')
        cluster_axes[2].set(xlabel='x', ylabel='y')

        # Overlay the edges kept in the trimmed spanning tree
        segments = model.get_graph_segments(full_graph=False)
        cluster_axes[0].plot(segments[0], segments[1], '-k')
        cluster_axes[1].plot(segments[0], segments[2], '-k')
        cluster_axes[2].plot(segments[1], segments[2], '-k')

        fig.tight_layout()

        plt.show()

    return labels
示例#6
0
class MST:
    """ Minimum Spanning Tree Class

    Compute MST for a set of input points using the MSTClustering
    code from jakevdp, calculate branch lengths from the MST and
    generate plots of the MST and cumulative distribution of branch
    lengths.

    ---- Inputs ----
    data frame "df", which has two columns present:
     - ra: right ascension (deg)
     - dec: declination (deg)

    cutoff_scale (float): critical branch length. All edges larger
                          than cutoff_scale will be removed.

    min_cluster_size (int): min number of galaxies in a cluster.

    n_neighbors (int): maximum number of neighbors of each point
    used for approximate Euclidean MST algorithm.

    Any of the three settings above that is left as None is not passed to
    MSTClustering, so that MSTClustering's own default applies.

    set_mst, labels, segments, seps: accepted for backward compatibility
    only; the values passed in are ignored and the attributes of the same
    name are always computed from ``df``.

    ---- Attributes ----
    set_mst: the MSTClustering model fitted to the (ra, dec) positions.

    labels: integer specifying the structure to which a given galaxy
            has been assigned. It will have a -1 if no membership was
            assigned.

    segments: sets of ra, dec coordinates for the MST branch segments
              (full graph)
    seps: base-10 log of branch lengths (in degrees)

    """
    def __init__(self,
                 df,
                 cutoff_scale=None,
                 min_cluster_size=None,
                 n_neighbors=None,
                 set_mst=None,
                 labels=None,
                 segments=None,
                 seps=None):
        self.df = df
        self.cutoff_scale = cutoff_scale
        self.min_cluster_size = min_cluster_size
        self.n_neighbors = n_neighbors

        # Forward only the settings the caller actually provided; an
        # explicit None would override MSTClustering's defaults.
        requested = dict(cutoff_scale=cutoff_scale,
                         min_cluster_size=min_cluster_size,
                         n_neighbors=n_neighbors)
        mst_kwargs = {name: value for name, value in requested.items()
                      if value is not None}
        self.set_mst = MSTClustering(**mst_kwargs)

        # (n_points, 2) array of [ra, dec] rows
        pos = np.column_stack((df.ra, df.dec))
        self.labels = self.set_mst.fit_predict(pos)
        self.segments = self.set_mst.get_graph_segments(full_graph=True)
        self.seps = self.get_sep_mst()

    def get_sep_mst(self):
        """Calculate branch lengths (in base-10 log(degrees))
        from the MST segments.

        ``self.segments[0]`` holds the ra and ``self.segments[1]`` the dec
        coordinates; in each, row 0 is one end of every branch and row 1
        the other end. The separation between the two ends is computed
        with ``SkyCoord.separation``.
        """
        ra_ends, dec_ends = self.segments[0], self.segments[1]
        c0 = SkyCoord(np.asarray(ra_ends[0]), np.asarray(dec_ends[0]),
                      unit=u.deg)
        c1 = SkyCoord(np.asarray(ra_ends[1]), np.asarray(dec_ends[1]),
                      unit=u.deg)
        return np.log10(c0.separation(c1).degree)

    def plot_mst(self, model=None, cmap='rainbow', *args, **kwargs):
        """Plot the MST diagram (left) and the labeled structures
        identified from the MST (right).

        model: fitted MSTClustering instance to draw; defaults to
               ``self.set_mst``. The model is not modified.
        cmap: colormap used for the cluster labels in the right panel.

        Recognized keyword arguments:
         - xlim, ylim: axis limits applied to both panels
         - s: marker size (default 8)
         - savefigure: if true, save the figure to ``figname``
         - figname: output file name (default 'MST_figure.png')
        """
        if model is None:
            model = self.set_mst
        xlim = kwargs.get('xlim', None)
        ylim = kwargs.get('ylim', None)
        ssize = kwargs.get('s', 8)
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_figure.png')
        X = model.X_fit_

        # One little hack to get more clear color differentiation between the
        # points with cluster membership and without: add 50 to the label
        # numbers of cluster members. This is done on a copy -- shifting
        # model.labels_ in place would change the labels again on every call.
        labels = np.asarray(model.labels_)
        member_colors = np.where(labels > -1, labels + 50, labels)

        fig, ax = plt.subplots(1, 2, figsize=(20, 7), sharex=True, sharey=True)
        for axi, full_graph, colors in zip(ax, [True, False],
                                           ['lightblue', member_colors]):
            segments = model.get_graph_segments(full_graph=full_graph)
            axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
            # Use the Axes methods so each panel is labelled/limited, not
            # just whichever axes pyplot currently considers active.
            axi.set_xlabel('Right Ascension (deg)', size=14)
            axi.set_ylabel('Declination (deg)', size=14)
            axi.scatter(X[:, 0],
                        X[:, 1],
                        c=colors,
                        cmap=cmap,
                        zorder=2,
                        s=ssize)
            axi.axis('tight')
            if xlim is not None:
                axi.set_xlim(xlim)
            if ylim is not None:
                axi.set_ylim(ylim)

        ax[0].set_title('Full Minimum Spanning Tree', size=16)
        ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            fig.savefig(figname, bbox_inches='tight', dpi=250)

    def plot_mst_cumul(self, *args, **kwargs):
        """Plot the cumulative distribution of MST branch lengths.

        Recognized keyword arguments:
         - savefigure: if true, save the current figure to ``figname``
         - figname: output file name (default 'MST_cumul_dist.png')
        """
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_cumul_dist.png')
        # NOTE(review): sns.distplot appears to be deprecated in recent
        # seaborn releases -- confirm against the installed version.
        sns.distplot(self.seps,
                     hist_kws=dict(cumulative=False),
                     kde_kws=dict(cumulative=True))
        plt.xlabel('log$_{10}$ (MST branch length)', fontsize=15)
        plt.ylabel('Norm. Counts/Cumul. Dist.', fontsize=15)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            plt.savefig(figname, bbox_inches='tight')