Exemplo n.º 1
0
    def transform(self, X):
        """Map X into a space of distances to the fitted centroids.

        ``self.filters_`` and ``self.centroids_`` are walked in lockstep.
        For each ``(selector, centroid)`` pair, the columns of X picked by
        ``selector`` are compared against the same entries of ``centroid``
        using the ``self.distance`` metric.  The per-pair results are then
        stacked horizontally.

        Parameters
        ----------

        X : array-like, shape [n_samples, n_features]
            Data to transform.  Its rows are passed through
            ``normalize_rows`` first when ``self._needs_normalization()``
            is true.

        Returns
        -------

        X_new : array
            One block of distances per ``(selector, centroid)`` pair, laid
            side by side.  Assuming every centroid is a 1-D vector, each
            block is a single column and the shape is
            [n_samples, number of pairs].
        """
        check_is_fitted(self)
        if self._needs_normalization():
            X = normalize_rows(X)
        blocks = []
        for mask, center in zip(self.filters_, self.centroids_):
            # Restrict both the samples and the centroid to the features
            # selected for this cluster; np.newaxis turns the centroid into
            # a one-row matrix as required by cdist.
            restricted_center = center[np.newaxis, mask]
            blocks.append(dist.cdist(X[:, mask], restricted_center,
                                     self.distance))
        return np.hstack(blocks)
Exemplo n.º 2
0
    def predict(self, X):
        """Assign a cluster label to every sample in X.

        Each item obtained by iterating over X is handed to
        ``_predict_path`` together with ``result=self.result_``.  The path
        it returns is then translated into a label through
        ``self.reverse_paths_``.  The per-sample calls are dispatched with
        the ``map`` method of the pool yielded by
        ``maybe_pool(self.n_jobs)``.

        Parameters
        ----------

        X : array-like, shape [n_samples, n_features]
            New data to predict.  Its rows are passed through
            ``normalize_rows`` first when ``self._needs_normalization()``
            is true.

        Returns
        -------

        labels : array of int32, shape [n_samples,]
            Label looked up for the path of each sample.
        """
        check_is_fitted(self)
        if self._needs_normalization():
            X = normalize_rows(X)
        path_for_sample = partial(_predict_path, result=self.result_)
        with maybe_pool(self.n_jobs) as workers:
            sample_paths = workers.map(path_for_sample, X)
        found = []
        for sample_path in sample_paths:
            found.append(self.reverse_paths_[sample_path])
        return np.array(found, dtype=np.int32)
Exemplo n.º 3
0
def _sampled_dispersion(
    seed: int, sampler: BaseSampler, kmeans: KMeans, fit: bool = True
) -> float:
    """Return the dispersion of a sample drawn from ``sampler``.

    The sample is obtained with ``sampler.get_sample(seed)`` and passed
    through ``normalize_rows`` when ``kmeans`` has a truthy
    ``normalize_rows`` attribute.  Labels come from ``kmeans.fit_predict``
    when ``fit`` is true and from ``kmeans.predict`` otherwise.

    The dispersion is the mean, over clusters, of the mean pairwise
    distance (``kmeans.distance`` metric) between the members of a
    cluster.  Clusters holding exactly one sample are skipped.  If no
    cluster is left, the result is NumPy's mean of an empty list.

    Parameters
    ----------
    seed : int
        Seed forwarded to ``sampler.get_sample``.
    sampler : BaseSampler
        Source of the sample.
    kmeans : KMeans
        Clustering object used to label the sample.
    fit : bool, default True
        Whether to refit ``kmeans`` on the sample before labelling it.

    Returns
    -------
    float
        Mean of the per-cluster mean pairwise distances.
    """
    logging.debug(f"Sampling with seed {seed}.")
    sample = sampler.get_sample(seed)
    logging.debug(f"Sample shape {sample.shape}")
    if getattr(kmeans, "normalize_rows", False):
        logging.debug("Normalizing rows.")
        sample = normalize_rows(sample)
    if not fit:
        logging.debug("Predicting labels for sample.")
        assignments = kmeans.predict(sample)
    else:
        logging.debug("Fitting kmeans for sample.")
        assignments = kmeans.fit_predict(sample)
    logging.debug("Computing dispersion for clustered sample.")
    per_cluster = []
    for _, members in pd.DataFrame(sample).groupby(assignments):
        if members.shape[0] == 1:
            # A single point has no pairwise distances to average.
            continue
        pairwise = dist.pdist(members.values, kmeans.distance)
        per_cluster.append(np.mean(pairwise))
    return float(np.mean(per_cluster))
Exemplo n.º 4
0
    def predict(self, X):
        """Return, for each sample in X, the index of its nearest center.

        Distances between the samples and ``self.cluster_centers_`` are
        computed with the ``self.distance`` metric.  The label of a sample
        is the position of the smallest distance in its row.

        Parameters
        ----------

        X : array-like, shape [n_samples, n_features]
            New data to predict.  Its rows are passed through
            ``normalize_rows`` first when ``self.normalize_rows`` is truthy.

        Returns
        -------

        labels : array, shape [n_samples,]
            Index into ``self.cluster_centers_`` of the closest center.
        """
        check_is_fitted(self)
        if self.normalize_rows:
            X = normalize_rows(X)
        to_centers = dst.cdist(X, self.cluster_centers_, self.distance)
        # Row-wise argmin: column index of the closest center per sample.
        return to_centers.argmin(axis=1)
Exemplo n.º 5
0
 def __call__(self, data: Data,
              number_of_clusters: int) -> Tuple[IntLabels, Centroids]:
     """Cluster ``data`` into ``number_of_clusters`` groups.

     The input is first checked with ``_validate_kmeans_input``.  For a
     single cluster the answer is returned immediately: an integer array
     of zeros with shape ``(n_samples, 1)`` and the column-wise mean of
     ``data`` (kept two-dimensional).  Row normalization is not applied
     on that path.

     Otherwise centroids come from ``self.initialize`` and labels from
     ``self.labeling``.  Up to ``self.number_of_iterations`` rounds
     follow, each of which:

     1. calls ``self._fix_labels`` when fewer than ``number_of_clusters``
        distinct labels are in use;
     2. stops if the labels equal those of the previous round;
     3. recomputes the centroids with ``redefine_centroids`` and
        relabels the data.

     Parameters
     ----------
     data : Data
         Samples to cluster; the first axis indexes samples.
     number_of_clusters : int
         Number of clusters requested.

     Returns
     -------
     labels, centroids
         The last labels assigned and the centroids they were computed
         against.
     """
     _validate_kmeans_input(data, number_of_clusters)
     if number_of_clusters == 1:
         single_label = np.zeros((data.shape[0], 1), dtype=int)
         overall_mean = np.mean(data, axis=0, keepdims=True)
         return (single_label, overall_mean)
     # Same shape, re-read with C-style index order.
     data = data.reshape(data.shape, order="C")
     if self.normalize_rows:
         _validate_normalizable(data)
         data = normalize_rows(data)
     label_set = np.arange(number_of_clusters)
     logging.debug("Initializing KMeans centroids.")
     centroids = self.initialize(data, number_of_clusters)
     logging.debug("First centroids found.")
     # All-NaN sentinel: it can never compare equal to real labels, so the
     # first round is never mistaken for a stable state.
     previous = np.full(data.shape[0], np.nan)
     labels = self.labeling(data, centroids)
     logging.debug("Labels assigned.")
     for _ in range(self.number_of_iterations):
         labels_in_use = np.unique(labels).size
         if labels_in_use != number_of_clusters:
             centroids, labels = self._fix_labels(
                 data, centroids, labels, number_of_clusters)
         if np.all(labels == previous):
             logging.debug("Stability achieved.")
             break
         previous = labels
         centroids = redefine_centroids(data, previous, label_set,
                                        self.allow_dask)
         labels = self.labeling(data, centroids)
     return labels, centroids
Exemplo n.º 6
0
def _dispersion(data: Data, kmeans: KMeans) -> float:
    """Return the dispersion of ``data`` under the labels of ``kmeans``.

    ``data`` is passed through ``normalize_rows`` when ``kmeans`` has a
    truthy ``normalize_rows`` attribute, then grouped by
    ``kmeans.labels_``.  The dispersion is the mean, over clusters, of the
    mean pairwise distance (``kmeans.distance`` metric) between the
    members of a cluster.  Clusters holding exactly one sample are
    skipped.  If no cluster is left, the result is NumPy's mean of an
    empty list.

    Parameters
    ----------
    data : Data
        Samples; must have as many rows as ``kmeans.labels_`` has entries.
    kmeans : KMeans
        Clustering object providing ``labels_`` and ``distance``.

    Returns
    -------
    float
        Mean of the per-cluster mean pairwise distances.
    """
    assert data.shape[0] == kmeans.labels_.size, "kmeans not fit on this data"
    if getattr(kmeans, "normalize_rows", False):
        data = normalize_rows(data)
    per_cluster = []
    for _, members in pd.DataFrame(data).groupby(kmeans.labels_):
        if members.shape[0] == 1:
            # A single point has no pairwise distances to average.
            continue
        pairwise = dist.pdist(members.values, kmeans.distance)
        per_cluster.append(np.mean(pairwise))
    return float(np.mean(per_cluster))
Exemplo n.º 7
0
    def transform(self, X):
        """Express each sample of X by its distances to the cluster centers.

        The output has one column per row of ``self.cluster_centers_``.
        Entry ``[i, j]`` is the distance, under the ``self.distance``
        metric, between sample ``i`` and center ``j``.

        Parameters
        ----------

        X : array-like, shape [n_samples, n_features]
            New data to transform.  Its rows are passed through
            ``normalize_rows`` first when ``self.normalize_rows`` is truthy.

        Returns
        -------

        X_new : array, shape [n_samples, k]
            Sample-to-center distances, where ``k`` is the number of rows
            in ``self.cluster_centers_``.

        """
        check_is_fitted(self)
        samples = normalize_rows(X) if self.normalize_rows else X
        return dst.cdist(samples, self.cluster_centers_, self.distance)