예제 #1
0
def _labels_inertia_skl(X,
                        sample_weight,
                        x_squared_norms,
                        centers,
                        precompute_distances=True,
                        distances=None):
    """
    Assign every sample to a center and compute the resulting inertia
    (E step of the K-means EM algorithm, euclidean case).

    Three helpers do the actual work, selected from the input:

    * sparse *X*: ``_assign_labels_csr``;
    * dense *X* with *precompute_distances* truthy:
      ``_labels_inertia_precompute_dense`` called with ``norm='l2'``
      (*x_squared_norms* is not used on this path);
    * dense *X* otherwise: ``_assign_labels_array``.

    :param X: float64 array-like or CSR sparse matrix,
        shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X. It is normalised with
        ``_check_sample_weight`` before use.
    :param x_squared_norms: array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param precompute_distances: boolean, default: True
        Precompute distances (faster but takes more memory).
    :param distances: float array, shape (n_samples,)
        Pre-allocated array handed to the helpers so that they can store
        each sample's distance to its closest center. When None, an empty
        (length 0) array with the dtype of X is passed instead.
    :return: labels, int array of shape(n)
        The resulting assignment
    :return: inertia, float
        Sum of squared distances of samples to their closest cluster center.
    """
    sample_count = X.shape[0]
    sample_weight = _check_sample_weight(sample_weight, X)

    # Every label starts at -1 so that a sample left unassigned by the
    # helper is easy to spot.
    assigned = numpy.full(sample_count, -1, numpy.int32)

    if distances is None:
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)

    sparse_input = issparse(X)

    # Dense shortcut: the precompute helper returns (labels, inertia) itself.
    if not sparse_input and precompute_distances:
        return _labels_inertia_precompute_dense(
            norm='l2',
            X=X,
            sample_weight=sample_weight,
            centers=centers,
            distances=distances)

    # Both remaining helpers share the same signature; they receive the
    # label array and the distances buffer and return the inertia.
    assign = _assign_labels_csr if sparse_input else _assign_labels_array
    inertia = assign(X,
                     sample_weight,
                     x_squared_norms,
                     centers,
                     assigned,
                     distances=distances)
    return assigned, inertia
예제 #2
0
def _labels_inertia(norm,
                    X,
                    sample_weight,
                    centers,
                    precompute_distances=True,
                    distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    The *distances* buffer, when provided, is handed to the underlying
    helper for both norms so that it can be updated in place.

    :param norm: 'l1' or 'l2'
        ``'l2'`` delegates to ``_labels_inertia_skl``; any other value
        follows the L1 path below.
    :param X: float64 array-like or CSR sparse matrix,
        shape (n_samples, n_features)
        The input samples to assign to the labels. Sparse input is only
        supported for ``norm='l2'``.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param precompute_distances: boolean, default: True
        Precompute distances (faster but takes more memory). Must be true
        for the L1 path.
    :param distances: existing distances buffer, or None. When None, an
        empty (length 0) array with the dtype of X is used instead.
    :return: labels : int array of shape(n)
        The resulting assignment
    :return: inertia : float
        Sum of squared distances of samples to their closest cluster center.
    :raises NotImplementedError: on the L1 path when X is sparse or when
        *precompute_distances* is false.
    """
    if norm == 'l2':
        # Forward the caller's buffer: dropping it here would leave a
        # pre-allocated ``distances`` array untouched for the L2 norm while
        # the L1 path below receives it.
        # NOTE(review): x_squared_norms is passed as None; this looks safe
        # only when the dense precompute path is taken - confirm for
        # precompute_distances=False or sparse X.
        return _labels_inertia_skl(X,
                                   sample_weight=sample_weight,
                                   x_squared_norms=None,
                                   centers=centers,
                                   precompute_distances=precompute_distances,
                                   distances=distances)

    sample_weight = _check_sample_weight(sample_weight, X)
    if distances is None:
        # Empty placeholder so the helper always receives an array.
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)
    if issparse(X):
        raise NotImplementedError(  # pragma no cover
            "Sparse matrix is not implemented for norm 'l1'.")
    if precompute_distances:
        return _labels_inertia_precompute_dense(norm=norm,
                                                X=X,
                                                sample_weight=sample_weight,
                                                centers=centers,
                                                distances=distances)
    raise NotImplementedError(  # pragma no cover
        "precompute_distances is False, not implemented for norm 'l1'.")
예제 #3
0
def _kmeans_single_lloyd(norm,
                         X,
                         sample_weight,
                         n_clusters,
                         max_iter=300,
                         init='k-means++',
                         verbose=False,
                         random_state=None,
                         tol=1e-4):
    """
    A single run of k-means, assumes preparation completed prior.

    :param norm: 'l1' or 'l2'
    :param X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    :param max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
        Must be at least 1.
    :param init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        and a random state and return an initialization.

    :param verbose: boolean, optional
        Verbosity mode
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :param tol: float, optional
        Convergence threshold: iterations stop as soon as the sum of the
        absolute differences between the previous and the new centers is
        lower than or equal to this value.
    :return: label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    :return: inertia : float
        The final value of the inertia criterion returned by the E-step
        for the selected centers.
    :return: centroid : float ndarray with shape (k, n_features)
        Centroids kept from the iteration with the lowest inertia.
    :return: n_iter : int
        Number of iterations run.
    :raises ValueError: if *max_iter* is lower than 1.
    """
    # Without at least one iteration neither the center shift nor the
    # iteration counter would exist after the loop; fail with a clear message
    # instead of an UnboundLocalError.
    if max_iter < 1:
        raise ValueError(
            "max_iter must be at least 1, got %r." % (max_iter, ))

    random_state = check_random_state(random_state)

    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(norm,
                              X,
                              n_clusters,
                              init,
                              random_state=random_state)
    if verbose:  # pragma no cover
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = numpy.zeros(shape=(X.shape[0], ), dtype=X.dtype)
    # Per-column sort order of X, computed once and handed to the M-step.
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(norm,
                                          X,
                                          sample_weight,
                                          centers,
                                          distances=distances)

        # computation of the new centers is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        # Keep the state of the iteration with the lowest inertia. The
        # stored centers are the ones produced by the M-step above, i.e.
        # computed from the labels stored next to them.
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        # Total absolute displacement of the centers during this iteration.
        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(norm,
                                                    X,
                                                    sample_weight,
                                                    best_centers,
                                                    distances=distances)

    return best_labels, best_inertia, best_centers, i + 1