Example #1
from pykeops.torch.cluster import grid_cluster, cluster_ranges_centroids, sort_clusters


def clusterize(α, x, scale=None, labels=None):
    """
    Performs a simple 'voxelgrid' clustering on the input measure,
    putting points into cubic bins of size 'scale' = σ_c.
    The weights are summed, and the centroid position is that of the bin's center of mass.
    Most importantly, the "fine" lists of weights and points are *sorted*
    so that clusters are *contiguous in memory*: this allows us to perform
    kernel truncation efficiently on the GPU.

    If 
        [α_c, α], [x_c, x], [x_ranges] = clusterize(α, x, σ_c),
    then
        α_c[k], x_c[k] correspond to
        α[x_ranges[k,0]:x_ranges[k,1]], x[x_ranges[k,0]:x_ranges[k,1],:]
    """
    if labels is None and scale is None:  # No clustering, single-scale Sinkhorn on the way...
        return [α], [x], []

    else:  # As of today, only two-scale Sinkhorn is implemented:
        # Compute simple (voxel-like) class labels:
        x_lab = grid_cluster(x, scale) if labels is None else labels
        # Compute centroids and weights:
        ranges_x, x_c, α_c = cluster_ranges_centroids(x, x_lab, weights=α)
        # Make clusters contiguous in memory:
        (α, x), x_labels = sort_clusters((α, x), x_lab)

        return [α_c, α], [x_c, x], [ranges_x]
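
# A minimal usage sketch for `clusterize`, following the contract spelled out
# in its docstring (synthetic data; assumes PyTorch and PyKeOps are available):
import torch

α = torch.ones(1000) / 1000  # uniform weights
x = torch.rand(1000, 2)  # 2D point cloud in the unit square
[α_c, α_f], [x_c, x_f], [ranges] = clusterize(α, x, scale=0.1)
# Cluster k is summarized by (α_c[k], x_c[k]); its fine samples are
# α_f[ranges[k, 0]:ranges[k, 1]] and x_f[ranges[k, 0]:ranges[k, 1], :].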
Example #2
    def _kneighbors(self, y):
        """
        Obtain the k nearest neighbors of the query dataset y
        """
        if self.__x is None:
            raise ValueError(
                "Input dataset not fitted yet! Call .fit() first!")
        if self.__device and self.tools.device(y) != self.__device:
            raise ValueError(
                "Input dataset and query dataset must be on same device")
        if len(y.shape) != 2:
            raise ValueError("Query dataset must be a 2D tensor")
        if self.__x.shape[-1] != y.shape[-1]:
            raise ValueError("Query and dataset must have same dimensions")
        if self.__normalise:
            y = y / self.tools.repeat(self.tools.norm(y, 2, -1),
                                      y.shape[1]).reshape(-1, y.shape[1])
        y = self.tools.contiguous(y)
        y_labels = self.__assign(y)

        y_ranges, _, _ = cluster_ranges_centroids(y, y_labels)
        self.__y_ranges = y_ranges
        y, y_labels = self.__sort_clusters(y, y_labels, store_x=False)
        x_LT = self.__LazyTensor(self.tools.unsqueeze(self.__x, 0))
        y_LT = self.__LazyTensor(self.tools.unsqueeze(y, 1))
        D_ij = self.__distance(y_LT, x_LT)
        ranges_ij = from_matrix(y_ranges, self.__x_ranges, self.__keep)
        D_ij.ranges = ranges_ij
        nn = D_ij.argKmin(K=self.__k, axis=1)
        return self.__unsort(nn)
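
# The pattern used above, as a standalone sketch built on the public PyKeOps
# API (hypothetical sizes and cluster radius; this is an approximate k-NN
# search that only compares points drawn from nearby clusters):
import torch
from pykeops.torch import LazyTensor
from pykeops.torch.cluster import (cluster_ranges_centroids, from_matrix,
                                   grid_cluster, sort_clusters)

x = torch.rand(5000, 2)  # data points
y = torch.rand(2000, 2)  # query points
x_lab, y_lab = grid_cluster(x, 0.2), grid_cluster(y, 0.2)
x_ranges, x_c, _ = cluster_ranges_centroids(x, x_lab)
y_ranges, y_c, _ = cluster_ranges_centroids(y, y_lab)
x, x_lab = sort_clusters(x, x_lab)  # make clusters contiguous in memory
y, y_lab = sort_clusters(y, y_lab)
# Keep a (query cluster, data cluster) block iff their centroids are close:
keep = ((y_c[:, None, :] - x_c[None, :, :]) ** 2).sum(-1) < 0.5**2
D_ij = ((LazyTensor(y[:, None, :]) - LazyTensor(x[None, :, :])) ** 2).sum(-1)
D_ij.ranges = from_matrix(y_ranges, x_ranges, keep)
nn = D_ij.argKmin(K=10, axis=1)  # 10 nearest x-indices for each (sorted) y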
Example #3
    def kneighbors(self, y):
        '''
        Obtain the k nearest neighbors of the query dataset y
        '''
        if self.__x is None:
            raise ValueError(
                'Input dataset not fitted yet! Call .fit() first!')
        if not isinstance(y, torch.Tensor):
            raise ValueError("Query dataset must be a torch tensor")
        if y.device != self.__device:
            raise ValueError(
                'Input dataset and query dataset must be on same device')
        if len(y.shape) != 2:
            raise ValueError('Query dataset must be a 2D tensor')
        if self.__x.shape[-1] != y.shape[-1]:
            raise ValueError('Query and dataset must have same dimensions')
        if use_cuda:
            torch.cuda.synchronize()
        y = y.contiguous()
        y_labels = self.__assign(y)

        y_ranges, _, _ = cluster_ranges_centroids(y, y_labels)
        self.__y_ranges = y_ranges
        y, y_labels = self.__sort_clusters(y, y_labels, store_x=False)
        x_LT = LazyTensor(self.__x.unsqueeze(0).to(self.__device).contiguous())
        y_LT = LazyTensor(y.unsqueeze(1).to(self.__device).contiguous())
        D_ij = ((y_LT - x_LT)**2).sum(-1)

        ranges_ij = from_matrix(y_ranges, self.__x_ranges, self.__keep)
        D_ij.ranges = ranges_ij
        nn = D_ij.argKmin(K=self.__k, axis=1)
        return self.__unsort(nn)
Example #4
    def _Gauss_block_sparse_pre(self, x: torch.Tensor, y: torch.Tensor, K_ij: LazyTensor):
        '''
        Helper function to preprocess data for block-sparse reduction
        of the Gaussian kernel

        Args:
            x[torch.Tensor], y[torch.Tensor] = arrays giving rise to the
                                Gaussian kernel K(x,y)
            K_ij[LazyTensor] = symbolic representation of K(x,y)
        Returns:
            K_ij[LazyTensor] = symbolic representation of K(x,y) with
                                its block-sparse ranges set (square bins
                                of size self.eps)
        '''
        if x.shape[1] < 4 or y.shape[1] < 4:
            # labels for low dimensions
            x_labels = grid_cluster(x, self.eps)
            y_labels = grid_cluster(y, self.eps)
            # range and centroid per class
            x_ranges, x_centroids, _ = cluster_ranges_centroids(x, x_labels)
            y_ranges, y_centroids, _ = cluster_ranges_centroids(y, y_labels)
        else:
            # labels for higher dimensions
            x_labels, x_centroids = self._KMeans(x)
            y_labels, y_centroids = self._KMeans(y)
            # compute ranges
            x_ranges = cluster_ranges(x_labels)
            y_ranges = cluster_ranges(y_labels)

        # sort points
        x, x_labels = sort_clusters(x, x_labels)
        y, y_labels = sort_clusters(y, y_labels)
        # Compute a coarse Boolean mask:
        D = torch.sum((x_centroids[:, None, :] - y_centroids[None, :, :]) ** 2, 2)
        keep = D < (self.mask_radius) ** 2
        # mask -> set of integer tensors
        ranges_ij = from_matrix(x_ranges, y_ranges, keep)
        K_ij.ranges = ranges_ij  # block-sparsity pattern

        return K_ij
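
# N.B.: `sort_clusters` above returns re-ordered copies of x and y, so the
# ranges only match a K_ij that is evaluated on the *sorted* points. A hedged
# sketch of how the returned kernel could then be used (assuming K_ij encodes
# a Gaussian kernel between the sorted clouds):
#
#     b = torch.randn(y.shape[0], 1)
#     a = K_ij @ b  # block-sparse kernel-vector product, skipping far blocks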
Example #5
    def _final_brute_force(self, nearest_clusters, query_pts):
        """ Final brute force search over clusters in cluster method"""
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        k = self.k

        x = self.data_orig.to(self.device)
        x_labels = self.clusters.long()
        y = query_pts.to(self.device)
        y_labels = nearest_clusters[:, 0]

        x = x.contiguous()
        y = y.contiguous()
        x_labels = x_labels.to(self.device)
        y_labels = y_labels.to(self.device)

        clusters, a = self.graph.shape
        r = torch.arange(clusters).repeat(a, 1).T.reshape(-1).long()
        keep = torch.zeros([clusters, clusters],
                           dtype=torch.bool).to(self.device)
        keep[r, self.graph.flatten()] = True
        keep |= torch.eye(clusters, dtype=torch.bool, device=self.device)

        x_ranges, x_centroids, _ = cluster_ranges_centroids(x, x_labels)
        y_ranges, y_centroids, _ = cluster_ranges_centroids(y, y_labels)

        x, x_labels = self.__sort_clusters(x, x_labels, store_x=True)
        y, y_labels = self.__sort_clusters(y, y_labels, store_x=False)

        x_LT = LazyTensor(x.unsqueeze(0).to(self.device).contiguous())
        y_LT = LazyTensor(y.unsqueeze(1).to(self.device).contiguous())
        D_ij = self.distance(y_LT, x_LT)

        x_ranges = x_ranges.to(self.device)
        y_ranges = y_ranges.to(self.device)
        ranges_ij = from_matrix(y_ranges, x_ranges, keep)
        D_ij.ranges = ranges_ij
        nn = D_ij.argKmin(K=k, axis=1)
        return self.__unsort(nn)
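
# Illustration of the `keep` mask built above (hypothetical numbers): with
# clusters=3, a=2 and graph = [[0, 1], [1, 2], [2, 0]], we get
# r = [0, 0, 1, 1, 2, 2], so that keep[r, graph.flatten()] marks, for each
# query cluster, its `a` nearest data clusters; the identity matrix is then
# OR-ed in so that every cluster is always compared with itself.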
Example #6
    def kneighbors(self, y):
        if use_cuda:
            torch.cuda.synchronize()
        d = ((y.unsqueeze(1) - self.c.unsqueeze(0))**2).sum(-1)
        y_labels = torch.argmin(d, dim=1)

        y_ranges, _, _ = cluster_ranges_centroids(y, y_labels)

        y, y_labels = sort_clusters(y, y_labels)

        ranges_ij = from_matrix(self.x_ranges, y_ranges, self.keep)

        y_LT = LazyTensor(y.unsqueeze(0))
        D_ij = ((y_LT - self.x)**2).sum(-1)
        D_ij.ranges = ranges_ij
        # D_ij is indexed with x along axis 0 and y along axis 1, so we reduce
        # over axis 0 to get the k nearest dataset points for each query
        # (rows follow the cluster-sorted order of y):
        return D_ij.argKmin(K=self.k, axis=0)
Example #7
    def fit(self, x, use_torch=True, clusters=50, a=5):

        cl, c = KMeans(x, clusters)

        self.c = c
        # update cluster assignment
        if use_torch:
            d = ((x.unsqueeze(1) - c.unsqueeze(0))**2).sum(-1)
            self.cl = torch.argmin(d, dim=1)
        else:
            self.cl = k_argmin(x, c)
        if use_cuda:
            torch.cuda.synchronize()

        # get the KNN graph of the clusters
        if use_torch:
            self.ncl = k_argmin_torch(c, c, k=a)
        else:
            c1 = LazyTensor(c.unsqueeze(1))
            c2 = LazyTensor(c.unsqueeze(0))
            d = ((c1 - c2)**2).sum(-1)
            self.ncl = d.argKmin(K=a, dim=1)  # get the `a` nearest clusters

        # get the ranges and centroids
        self.x_ranges, _, _ = cluster_ranges_centroids(x, self.cl)

        x, x_labels = sort_clusters(x, self.cl)  # sort dataset to match ranges
        self.x = LazyTensor(x.unsqueeze(1))  # store the sorted dataset

        r = torch.arange(clusters).repeat(a, 1).T.reshape(-1).long()
        self.keep = torch.zeros([clusters, clusters], dtype=torch.bool)

        self.keep[r, self.ncl.flatten()] = True

        return self
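
# Hypothetical end-to-end use of this fit()/kneighbors() pair (the class name
# `IVFFlat` is made up for the sketch):
#
#     index = IVFFlat().fit(x, clusters=50, a=5)
#     neighbors = index.kneighbors(y)  # k-NN search restricted to the `a`
#                                      # clusters nearest to each query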
Example #8
    def fit(self, x, clusters=50, a=5, n=15):
        '''
        Fits the main dataset
        '''
        if not isinstance(x, torch.Tensor):
            raise ValueError('Input must be a torch tensor')
        if type(clusters) != int:
            raise ValueError('Clusters must be an integer')
        if clusters >= len(x):
            raise ValueError(
                'Number of clusters must be less than length of dataset')
        if type(a) != int:
            raise ValueError(
                'Number of clusters to search over must be an integer')
        if a > clusters:
            raise ValueError(
                'Number of clusters to search over must be less than total number of clusters'
            )
        if len(x.shape) != 2:
            raise ValueError('Input must be a 2D array')
        x = x.contiguous()
        self.__device = x.device
        cl, c = self.__KMeans(x, clusters, Niter=n)
        self.__c = c

        cl = self.__assign(x)
        if use_cuda:
            torch.cuda.synchronize()

        ncl = self.__k_argmin(c, c, k=a)
        self.__x_ranges, _, _ = cluster_ranges_centroids(x, cl)

        x, x_labels = self.__sort_clusters(x, cl, store_x=True)
        self.__x = x
        r = torch.arange(clusters).repeat(a, 1).T.reshape(-1).long()
        self.__keep = torch.zeros([clusters, clusters],
                                  dtype=torch.bool).to(self.__device)
        self.__keep[r, ncl.flatten()] = True
        return self
Example #9
import torch
from pykeops.torch.cluster import grid_cluster, cluster_ranges_centroids


def clusterize(α, x, scale=None, labels=None):
    """
    Performs a simple 'voxelgrid' clustering on the input measure,
    putting points into cubic bins of size 'scale' = σ_c.
    The weights are summed, and the centroid position is that of the bin's center of mass.
    Most importantly, the "fine" lists of weights and points are *sorted*
    so that clusters are *contiguous in memory*: this allows us to perform
    kernel truncation efficiently on the GPU.

    If
        [α_c, α], [x_c, x], [x_ranges], perm = clusterize(α, x, σ_c),
    then
        α_c[k], x_c[k] correspond to
        α[x_ranges[k,0]:x_ranges[k,1]], x[x_ranges[k,0]:x_ranges[k,1],:],
    and perm is the permutation that was applied to sort the point cloud
    (None if no clustering was performed).
    """
    perm = None  # did we sort the point cloud at some point? Here's the permutation.

    if labels is None and scale is None:  # No clustering, single-scale Sinkhorn on the way...
        return [α], [x], [], perm

    else:  # As of today, only two-scale Sinkhorn is implemented:
        # Compute simple (voxel-like) class labels:
        x_lab = grid_cluster(x, scale) if labels is None else labels
        # Compute centroids and weights:
        ranges_x, x_c, α_c = cluster_ranges_centroids(x, x_lab, weights=α)
        # Make clusters contiguous in memory:
        x_labels, perm = torch.sort(x_lab.view(-1))
        α, x = α[perm], x[perm]

        # N.B.: the lines above were written to replace a call to
        #       'sort_clusters', which does not return the permutation,
        #       information that is needed to de-permute the dual potentials
        #       if they are required by the user:
        # (α, x), x_labels = sort_clusters( (α,x), x_lab)

        return [α_c, α], [x_c, x], [ranges_x], perm
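
# Sketch: with `perm` in hand, quantities computed on the sorted cloud (e.g.
# the dual potentials mentioned above) can be scattered back to the caller's
# ordering. Since the sorted weights are α[perm], a value at sorted position j
# belongs to original index perm[j]:
#
#     f_orig = torch.empty_like(f_sorted)
#     f_orig[perm] = f_sorted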
Example #10
start = time.time()
x_labels = grid_cluster(x, eps)  # class labels
y_labels = grid_cluster(y, eps)  # class labels
if use_cuda:
    torch.cuda.synchronize()
end = time.time()
print("Perform clustering       : {:.4f}s".format(end - start))

###############################################
# Once (integer) cluster labels have been computed,
# we can compute the **centroids** and **memory footprint** of each class:

from pykeops.torch.cluster import cluster_ranges_centroids

# Compute one range and centroid per class:
start = time.time()
x_ranges, x_centroids, _ = cluster_ranges_centroids(x, x_labels)
y_ranges, y_centroids, _ = cluster_ranges_centroids(y, y_labels)
if use_cuda:
    torch.cuda.synchronize()
end = time.time()
print("Compute ranges+centroids : {:.4f}s".format(end - start))

###############################################
# Finally, we can **sort** our points according to their
# labels, making sure that **all clusters are stored contiguously in memory**:

from pykeops.torch.cluster import sort_clusters

start = time.time()
x, x_labels = sort_clusters(x, x_labels)
y, y_labels = sort_clusters(y, y_labels)
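# Close the timing block, mirroring the two measurements above:
if use_cuda:
    torch.cuda.synchronize()
end = time.time()
print("Sort the points          : {:.4f}s".format(end - start))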
Example #11
def kernel_multiscale(α,
                      x,
                      β,
                      y,
                      blur=0.05,
                      kernel=None,
                      name=None,
                      truncate=5,
                      diameter=None,
                      cluster_scale=None,
                      verbose=False,
                      **kwargs):

    if truncate is None or name == "energy":
        return kernel_online(α,
                             x,
                             β,
                             y,
                             blur=blur,
                             kernel=kernel,
                             truncate=truncate,
                             name=name,
                             **kwargs)

    # Renormalize our point cloud so that blur = 1:
    kernel, x, y = kernel_preprocess(kernel, name, x, y, blur)

    # Don't forget to normalize the diameter too!
    if cluster_scale is None:
        D = x.shape[-1]
        if diameter is None:
            diameter = max_diameter(x.view(-1, D), y.view(-1, D))
        else:
            diameter = diameter / blur
        cluster_scale = diameter / (np.sqrt(D) * 2000**(1 / D))

    # Put our points in cubic clusters:
    cell_diameter = cluster_scale * np.sqrt(x.shape[1])
    x_lab = grid_cluster(x, cluster_scale)
    y_lab = grid_cluster(y, cluster_scale)

    # Compute the ranges and centroids of each cluster:
    ranges_x, x_c, α_c = cluster_ranges_centroids(x, x_lab, weights=α)
    ranges_y, y_c, β_c = cluster_ranges_centroids(y, y_lab, weights=β)

    if verbose:
        print("{}x{} clusters, computed at scale = {:2.3f}".format(
            len(x_c), len(y_c), cluster_scale))

    # Sort the clusters, making them contiguous in memory:
    (α, x), x_lab = sort_clusters((α, x), x_lab)
    (β, y), y_lab = sort_clusters((β, y), y_lab)

    with torch.no_grad():  # Compute our block-sparse reduction ranges:
        # Compute pairwise distances between clusters:
        C_xx = squared_distances(x_c, x_c)
        C_yy = squared_distances(y_c, y_c)
        C_xy = squared_distances(x_c, y_c)

        # Compute the boolean masks:
        keep_xx = (C_xx <= (truncate + cell_diameter)**2)
        keep_yy = (C_yy <= (truncate + cell_diameter)**2)
        keep_xy = (C_xy <= (truncate + cell_diameter)**2)

        # Compute the KeOps reduction ranges:
        ranges_xx = from_matrix(ranges_x, ranges_x, keep_xx)
        ranges_yy = from_matrix(ranges_y, ranges_y, keep_yy)
        ranges_xy = from_matrix(ranges_x, ranges_y, keep_xy)

    return kernel_keops(kernel,
                        α,
                        x,
                        β,
                        y,
                        ranges_xx=ranges_xx,
                        ranges_yy=ranges_yy,
                        ranges_xy=ranges_xy)
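
# N.B.: after `kernel_preprocess`, all lengths are expressed in units of
# `blur`, so a pair of clusters is pruned as soon as its centroids lie more
# than `truncate + cell_diameter` apart: up to the cell diameter, all of
# their points are then beyond the truncation radius of the kernel.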
Example #12
    a_i.requires_grad = True
    x_i.requires_grad = True
    b_j.requires_grad = True

    # Compute the loss + gradients:
    Loss_xy = Loss(a_i, x_i, b_j, y_j)
    [F_i, G_j, dx_i] = grad(Loss_xy, [a_i, b_j, x_i])

    # The generalized "Brenier map" is (minus) the gradient of the Sinkhorn
    # loss with respect to the Wasserstein metric:
    BrenierMap = -dx_i / (a_i.view(-1, 1) + 1e-7)

    # Compute the coarse measures for display ----------------------------------

    x_lab = grid_cluster(x_i, cluster_scale)
    _, x_c, a_c = cluster_ranges_centroids(x_i, x_lab, weights=a_i)

    y_lab = grid_cluster(y_j, cluster_scale)
    _, y_c, b_c = cluster_ranges_centroids(y_j, y_lab, weights=b_j)

    # Fancy display: -----------------------------------------------------------

    ax = plt.subplot(((Nits - 1) // 3 + 1), 3, i + 1)
    ax.scatter([10], [10])  # shameless hack to prevent a slight change of axis...

    display_potential(ax, G_j, "#E2C5C5")
    display_potential(ax, F_i, "#C8DFF9")

    if blur > cluster_scale:
        display_samples(ax, y_j, b_j, [(0.55, 0.55, 0.95, 0.2)])
        display_samples(ax, x_i, a_i, [(0.95, 0.55, 0.55, 0.2)], v=BrenierMap)
Example #13
    def _fit(
        self,
        x,
        clusters=50,
        a=5,
        Niter=15,
        device=None,
        backend=None,
        approx=False,
        n=50,
    ):
        """
        Fits the main dataset
        """
        if type(clusters) != int:
            raise ValueError("Clusters must be an integer")
        if clusters >= len(x):
            raise ValueError(
                "Number of clusters must be less than length of dataset")
        if type(a) != int:
            raise ValueError(
                "Number of clusters to search over must be an integer")
        if a > clusters:
            raise ValueError(
                "Number of clusters to search over must be less than total number of clusters"
            )
        if len(x.shape) != 2:
            raise ValueError("Input must be a 2D array")
        if self.__normalise:
            x = x / self.tools.repeat(self.tools.norm(x, 2, -1),
                                      x.shape[1]).reshape(-1, x.shape[1])

        # if we use the KMeans approximation with an angular metric, switch to the full angular metric:
        if approx and self.__metric == "angular":
            self.__update_metric("angular_full")

        x = self.tools.contiguous(x)
        self.__device = device
        self.__backend = backend

        cl, c = self.tools.kmeans(
            x,
            self.__distance,
            clusters,
            Niter=Niter,
            device=self.__device,
            approx=approx,
            n=n,
        )

        self.__c = c
        cl = self.__assign(x)

        ncl = self.__k_argmin(c, c, k=a)
        self.__x_ranges, _, _ = cluster_ranges_centroids(x, cl)

        x, x_labels = self.__sort_clusters(x, cl, store_x=True)
        self.__x = x
        r = self.tools.repeat(
            self.tools.arange(clusters, device=self.__device), a)
        self.__keep = self.tools.to(
            self.tools.zeros([clusters, clusters], dtype=bool), self.__device)
        self.__keep[r, ncl.flatten()] = True

        return self
Example #14
def kernel_multiscale(α,
                      x,
                      β,
                      y,
                      blur=0.05,
                      kernel=None,
                      name=None,
                      truncate=5,
                      diameter=None,
                      cluster_scale=None,
                      potentials=False,
                      verbose=False,
                      **kwargs):

    if truncate is None or name == "energy":
        return kernel_online(α.unsqueeze(0),
                             x.unsqueeze(0),
                             β.unsqueeze(0),
                             y.unsqueeze(0),
                             blur=blur,
                             kernel=kernel,
                             truncate=truncate,
                             name=name,
                             potentials=potentials,
                             **kwargs)

    # Renormalize our point cloud so that blur = 1:
    # Center the point clouds just in case, to prevent numeric overflows:
    center = (x.mean(-2, keepdim=True) + y.mean(-2, keepdim=True)) / 2
    x, y = x - center, y - center
    x_ = x / blur
    y_ = y / blur

    # Don't forget to normalize the diameter too!
    if cluster_scale is None:
        D = x.shape[-1]
        if diameter is None:
            diameter = max_diameter(x_.view(-1, D), y_.view(-1, D))
        else:
            diameter = diameter / blur
        cluster_scale = diameter / (np.sqrt(D) * 2000**(1 / D))

    # Put our points in cubic clusters:
    cell_diameter = cluster_scale * np.sqrt(x_.shape[-1])
    x_lab = grid_cluster(x_, cluster_scale)
    y_lab = grid_cluster(y_, cluster_scale)

    # Compute the ranges and centroids of each cluster:
    ranges_x, x_c, α_c = cluster_ranges_centroids(x_, x_lab, weights=α)
    ranges_y, y_c, β_c = cluster_ranges_centroids(y_, y_lab, weights=β)

    if verbose:
        print("{}x{} clusters, computed at scale = {:2.3f}".format(
            len(x_c), len(y_c), cluster_scale))

    # Sort the clusters, making them contiguous in memory:
    (α, x), x_lab = sort_clusters((α, x), x_lab)
    (β, y), y_lab = sort_clusters((β, y), y_lab)

    with torch.no_grad():  # Compute our block-sparse reduction ranges:
        # Compute pairwise distances between clusters:
        C_xx = squared_distances(x_c, x_c)
        C_yy = squared_distances(y_c, y_c)
        C_xy = squared_distances(x_c, y_c)

        # Compute the boolean masks:
        keep_xx = C_xx <= (truncate + cell_diameter)**2
        keep_yy = C_yy <= (truncate + cell_diameter)**2
        keep_xy = C_xy <= (truncate + cell_diameter)**2

        # Compute the KeOps reduction ranges:
        ranges_xx = from_matrix(ranges_x, ranges_x, keep_xx)
        ranges_yy = from_matrix(ranges_y, ranges_y, keep_yy)
        ranges_xy = from_matrix(ranges_x, ranges_y, keep_xy)

    return kernel_loss(
        α,
        x,
        β,
        y,
        blur=blur,
        kernel=kernel,
        name=name,
        potentials=potentials,
        use_keops=True,
        ranges_xx=ranges_xx,
        ranges_yy=ranges_yy,
        ranges_xy=ranges_xy,
    )
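
# Typical invocation (hypothetical shapes; `name` selects a built-in kernel
# such as "gaussian", "laplacian" or "energy"):
#
#     loss = kernel_multiscale(α, x, β, y, blur=0.05, name="gaussian",
#                              truncate=5)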