def test_labels_assignment_and_inertia():
    # pure numpy implementation as an easily auditable reference ("gold")
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = -np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
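The same "gold" reference assignment can be cross-checked with the public scikit-learn API; the sketch below is not part of the original test and only assumes a data matrix and a set of centers as inputs.

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

def reference_labels_inertia(X, centers):
    # index of the closest center and the (euclidean) distance to it, per sample
    labels, dists = pairwise_distances_argmin_min(X, centers)
    # inertia is the sum of squared distances to the assigned centers
    return labels, np.sum(dists ** 2)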
Example #2
def test_labels_assignment_and_inertia():
    # pure numpy implementation as an easily auditable reference ("gold")
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = np.full(n_samples, -1, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id])**2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert (mindist >= 0.0).all()
    assert (labels_gold != -1).all()

    sample_weight = None

    # perform label assignment using the dense array input
    x_squared_norms = (X**2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(X, sample_weight,
                                                  x_squared_norms,
                                                  noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(X_csr, sample_weight,
                                              x_squared_norms_from_csr,
                                              noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
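A hedged aside, not part of the original test: with non-uniform weights the reference inertia becomes a weighted sum of the per-sample squared distances, which is why sample_weight=None (uniform weights) still reproduces the unweighted gold values above.

import numpy as np

mindist = np.array([0.5, 2.0, 1.0])          # squared distance to the closest center
sample_weight = np.array([1.0, 0.5, 2.0])
weighted_inertia = np.dot(sample_weight, mindist)   # 0.5 + 1.0 + 2.0 = 3.5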
def test__labels_constrained_kmeans_parity():
    X = np.array([
        [0, 0],
        [1, 2],
        [1, 4],
        [1, 0],
        [4, 2],
        [4, 4],
        [4, 0],
        [4, 4]
    ]).astype('float')
    centers = np.array([
        [0, 0],
        [4, 4]
    ]).astype('float')
    size_min, size_max = 0, len(X)  # No restrictions and so should be the same as K-means

    x_squared_norms = row_norms(X, squared=True)

    distances_constrained = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    labels_constrained, inertia_constrained = _labels_constrained(X, centers, size_min, size_max, distances_constrained)

    distances_kmeans = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    labels_kmeans, inertia_kmeans = \
        _labels_inertia(X, x_squared_norms, centers, precompute_distances=False, distances=distances_kmeans)

    assert_array_equal(labels_constrained, labels_kmeans)
    assert_almost_equal(distances_constrained, distances_kmeans)
    assert inertia_constrained == inertia_kmeans
Example #4
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
       New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(
        X, '__array__')  # or sp.isspmatrix_csr(X)

    if daal_ready:
        return _daal4py_k_means_dense(X, self.n_clusters, 0, 0.0,
                                      self.cluster_centers_, 1, None)[1]
    else:
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
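For context, a minimal public-API usage sketch of predict (assumed example data, not from the source):

import numpy as np
from sklearn.cluster import KMeans

X_train = np.random.RandomState(0).rand(100, 3)
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X_train)
labels = km.predict(X_train[:5])   # index of the closest center for each sample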
Example #5
    def fit_predict_score(self, X, weights, init, maxIter=1000):
        self.init_fit(X, weights, init)
        for i in range(maxIter):
            self._e_step()
            self._m_step()
        labels, base_inertia = _labels_inertia(self.X, self.x_squared_norms, self.ukList)
        # inertia = base_inertia + len(self.X) * self.K * self.X.shape[1] * self.PENALIZATION_CLUSTER
        inertia = 2 * np.log(base_inertia) - np.log(len(self.X)) * self.K
        return labels, -inertia, self.ukList
Example #6
    def inertie(self, uk):
        _, base_inertie = _labels_inertia(self.X, self.x_squared_norms, uk, precompute_distances=True)
        # s = 0
        # for u in uk:
        #     s += np.square(uk - u).sum(axis=1).sum()
        s = set([s for u in uk for s in np.square(uk - u).sum(axis=1)])
        if len(s) == 1:
            s = 0
        else:
            s.discard(0)
            s = min(s)
        base_inertie -= s / 200
        return base_inertie
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4,
                  tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms)

    new_labels, new_inertia, new_centers = None, None, None

    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape, d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(X=X_values, Y=centers_c, metric='euclidean',
                                                  metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[labels == it] - centers[it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, _ = np.linalg.eigh(Sigma)
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[labels == j] - centers[j], squared=True).sum()

        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break

    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms, new_centers,
                            precompute_distances=False,
                            distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
Example #8
    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        In the vector quantization literature, `cluster_centers_` is called
        the code book and each value returned by `predict` is the index of
        the closest code in the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, 'cluster_centers_')

        # X = self._check_test_data(X)
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
Example #9
    def _mini_batch_step(self, X, x_squared_norms, X_weighted, weights,
                         centers, counts, distances):

        nearest_center, inertia = _labels_inertia(X,
                                                  np.ones(X.shape[0]),
                                                  x_squared_norms,
                                                  centers,
                                                  distances=distances)
        loss = 4 * np.sum((centers[nearest_center] - X)**2, axis=1)

        k = centers.shape[0]
        for center_idx in range(k):

            center_mask = nearest_center == center_idx
            count = (center_mask * weights).sum()

            if count > 0:
                centers[center_idx] *= counts[center_idx]
                centers[center_idx] += np.sum(X_weighted[center_mask], axis=0)
                counts[center_idx] += count
                centers[center_idx] /= counts[center_idx]

        return inertia, loss
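A tiny worked example of the running weighted-mean update performed in the loop above (assumed numbers, not from the source): the center is rescaled by its previous count, the batch members are summed in, and the result is divided by the new count.

import numpy as np

old_center = np.array([1.0, 1.0])
old_count = 3.0
new_points = np.array([[2.0, 0.0], [4.0, 2.0]])   # batch members assigned to this center

center = old_center * old_count
center += new_points.sum(axis=0)
new_count = old_count + len(new_points)
center /= new_count
# center is now the mean of the 3 previous points and the 2 new ones: [1.8, 1.0]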
Example #10
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None,
                                   random_state=None, tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a,b)) when a, b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
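A quick numerical check of the identity referenced in the TODO above (a sketch, not part of the original function): for unit-normalized vectors, squared euclidean distance and cosine similarity give the same ranking.

import numpy as np

rng = np.random.RandomState(0)
a = rng.rand(5); a /= np.linalg.norm(a)
b = rng.rand(5); b /= np.linalg.norm(b)
lhs = np.sum((a - b) ** 2)
rhs = 2 * (1 - np.dot(a, b))
assert np.isclose(lhs, rhs)   # ||a - b||^2 == 2 * (1 - cos(a, b))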
Example #11
def _spherical_kmeans_single_lloyd(X,
                                   n_clusters,
                                   max_iter=300,
                                   init='k-means++',
                                   verbose=False,
                                   x_squared_norms=None,
                                   random_state=None,
                                   tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a,b)) when a, b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
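A rough public-API approximation of the spherical variant above (an assumption, not the original code): L2-normalize the rows of X and fit an ordinary KMeans; the difference is that the centers are not re-projected onto the unit sphere after each M-step.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

X = np.abs(np.random.RandomState(0).rand(200, 16))
Xn = normalize(X)                                        # unit-norm rows
km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(Xn)
spherical_like_centers = normalize(km.cluster_centers_)  # project centers back to the sphere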
Example #12
    def partial_fit(self, D):
        """Apply one iteration of VR_MBKM.

        Input: self, dataset
        Output: self

        Updated:
            - self.curr_iter
            - self.curr_inner_iter
            - self.tot_inner_iter
            - self.cluster_centers_
        """
        ## perform checks on dataset
        D = check_array(D, accept_sparse='csr')

        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        if self.curr_inner_iter == 0:
            self.inner_loop = 0

        if self.curr_iter == 0 or self.inner_loop == 0 or self.update_freq == 0:
            ## OUTER LOOP
            # use the entire dataset
            X = D
            x_squared_norms = row_norms(X, squared=True)
            self.random_state_ = getattr(self, "random_state_",
                                         check_random_state(self.random_state))

            if self.curr_iter == 0:
                ## initialize centers
                if hasattr(self.init, '__array__'):
                    self.cluster_centers_ = self.init
                else:
                    self.cluster_centers_ = k_means_._init_centroids(
                        X,
                        self.n_clusters,
                        self.init,
                        random_state=self.random_state_,
                        x_squared_norms=x_squared_norms,
                        init_size=self.init_size)

                _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                                   self.cluster_centers_)
                #print "Cost of current initial centers on the mini-batch is %r " % cost

                ## initialize counts
                self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)

            ## this ensures the benchmark centers are either the seeds
            ## or obtained from the last iterate of inner loop
            self.benchmark_centers = self.cluster_centers_.copy()

            ## run Lloyd's update with entire data
            distances = np.zeros(X.shape[0], dtype=np.float64)
            self.benchmark_updates, _, self.squared_diff = _kmeans_step(
                X=X,
                x_squared_norms=x_squared_norms,
                centers=self.benchmark_centers.copy(),
                distances=distances,
                precompute_distances=self.precompute_distances,
                n_clusters=self.n_clusters)

            self.cluster_centers_ = self.benchmark_updates.copy()
            self.curr_outer_iter += 1
            self.inner_loop = 1

        else:
            ## INNER LOOP:
            # use a mini-batch of data
            sample_idx = random.sample(range(D.shape[0]), self.mbsize)
            X = D[sample_idx, :]
            #x_squared_norms = row_norms(X, squared=True)
            self.set_eta()
            ## run VRMB_step with entire data
            distances = np.zeros(X.shape[0], dtype=np.float64)

            self.cluster_centers_, self.squared_diff, _ = VR_MB_step(
                X,
                None,
                self.cluster_centers_.copy(),
                self.benchmark_centers.copy(),
                self.benchmark_updates.copy(),
                self.counts_,
                self.curr_iter,
                np.zeros(0, np.double),
                0,
                distances,
                random_reassign=False,
                random_state=self.random_state_,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose,
                learn_rate=self.set_eta())

            # increment inner loop counts
            self.curr_inner_iter = (self.curr_inner_iter +
                                    1) % self.update_freq

        # increment global loop count
        self.curr_iter += 1
Example #13
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results"""
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = csr_row_norm_l2(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(X_mb, x_mb_squared_norms,
                                                     new_centers, counts,
                                                     buffer, 1)
    assert_true(old_inertia > 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms,
                                          new_centers)
    assert_true(new_inertia > 0.0)
    assert_true(new_inertia < old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers)**2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1)
    assert_true(old_inertia_csr > 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr,
                                                  x_mb_squared_norms_csr,
                                                  new_centers_csr)
    assert_true(new_inertia_csr > 0.0)
    assert_true(new_inertia_csr < old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers)**2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
Example #14
    def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                           random_state):
        random_state = check_random_state(random_state)
        sample_weight = _check_sample_weight(X, sample_weight)
        best_labels, best_inertia, best_centers = None, None, None

        distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)
        centers = _init_centroids(X,
                                  self.n_clusters,
                                  init='k-means++',
                                  random_state=random_state,
                                  x_squared_norms=x_squared_norms)

        d = X.shape[1]  # dimensionality of original space
        m = d // 2  # dimensionality of clustered space
        SD = np.dot(X.T,
                    X)  # scatter matrix of the dataset in the original space

        # orthonormal matrix of a rigid transformation
        V, _ = np.linalg.qr(random_state.random_sample(d**2).reshape(d, d),
                            mode='complete')
        for i in range(self.max_iter):
            centers_old = centers.copy()

            # get the clusters' labels
            labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

            # compute new centers and sum the clusters' scatter matrices
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              self.n_clusters, distances)
            S = self.update_step_(X, centers, labels)

            # sorted eigenvalues and eigenvectors of SIGMA=S-SD
            V, m = self.eigen_decomposition_(S - SD)
            if m == 0:
                raise ValueError('Might be a single cluster (m = 0).')

            # inertia - sum of squared distances of samples to their closest cluster center
            inertia = sum([
                row_norms(X[labels == j] - centers[j], squared=True).sum()
                for j in range(self.n_clusters)
            ])

            # print("Iteration %2d, inertia %.3f" % (i, inertia))
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia

            center_shift_total = squared_norm(centers_old - centers)
            if center_shift_total <= tol:
                # print("Converged at iteration %d: center shift %e within tolerance %e" % (i, center_shift_total, tol))
                break

        if center_shift_total > 0:
            # rerun E-step in case of non-convergence so that predicted labels match cluster centers
            best_labels, best_inertia = _labels_inertia(
                X,
                sample_weight,
                x_squared_norms,
                best_centers,
                precompute_distances=False,
                distances=distances)

        return best_centers, best_labels, best_inertia
Example #15
def _k_means_minus_minus(
    X,
    sample_weight,
    n_clusters,
    prop_outliers,
    max_iter=300,
    init="k-means++",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    prop_outliers : float
        What proportion of the training dataset X to treat as outliers, and
        to exclude in each iteration of Lloyd's algorithm.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        and a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """

    n_outliers = int(X.shape[0] * prop_outliers)
    random_state = check_random_state(random_state)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

        # the "minus-minus" modification step - filter out n_outliers # of
        # datapoints that are farthest from their assigned cluster centers
        X_subset, sample_weight_subset, labels_subset, distances_subset = (
            X,
            sample_weight,
            labels,
            distances,
        )
        if n_outliers > 0:
            outlier_indices = np.argpartition(
                distances,
                -n_outliers)[-n_outliers:]  # ~20x faster than np.argsort()

            X_subset, sample_weight_subset, labels_subset, distances_subset = (
                np.delete(X, outlier_indices, axis=0),
                np.delete(sample_weight, outlier_indices, axis=0),
                np.delete(labels, outlier_indices, axis=0),
                np.delete(distances, outlier_indices, axis=0),
            )

            # indices_to_refit = np.argsort(distances) < (X.shape[0] - n_outliers)
        # X_subset, sample_weight_subset = X[indices_to_refit], sample_weight[indices_to_refit]

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X_subset, sample_weight_subset,
                                               labels_subset, n_clusters,
                                               distances_subset)
        else:
            centers = _k_means._centers_dense(X_subset, sample_weight_subset,
                                              labels_subset, n_clusters,
                                              distances_subset)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            best_centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

    return best_labels, best_inertia, best_centers, i + 1
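A small illustration (assumed values, not from the source) of the np.argpartition trick used above to pick the n_outliers samples farthest from their assigned centers without a full sort:

import numpy as np

distances = np.array([0.1, 5.0, 0.3, 9.0, 2.0])
n_outliers = 2
outlier_idx = np.argpartition(distances, -n_outliers)[-n_outliers:]
# outlier_idx holds the indices of the two largest distances (1 and 3, in some
# order); only the partition point is ordered, which is what makes this cheaper
# than np.argsort on large arrays.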
Example #16
    def mbkmean(self, options, n_clusters, n_init, batch_size, n_iter,
                n_samples, labels_true, k_means, X):
        #to do with online MBK_mean
        #Compute clustering with MiniBatchKMeans
        mbk = cluster.MiniBatchKMeans(init=self.init,
                                      n_clusters=n_clusters,
                                      batch_size=batch_size,
                                      n_init=10,
                                      max_no_improvement=n_iter,
                                      verbose=0)

        #INIT THREADs
        try:
            if options[2] == '-pp' or options[3] == '-pp':
                thread_1 = afficheur('starting threads', labels_true, mbk,
                                     k_means, X, n_clusters)
                thread_1.start()
        except IndexError:
            pass

        try:
            if options[2] == '-s':

                #init state
                n_batches = int(np.ceil(float(n_samples) / batch_size))
                max_iter = 100

                tol = 0
                _, n_features = X.shape
                old_center_buffer = np.zeros(n_features, dtype=X.dtype)
                random_state = check_random_state(None)
                init_size = 3 * batch_size
                if init_size > n_samples:
                    init_size = n_samples

                validation_indices = random_state.randint(
                    0, n_samples, init_size)
                X_valid = X[validation_indices]
                x_squared_norms = row_norms(X, squared=True)
                x_squared_norms_valid = x_squared_norms[validation_indices]
                counts = np.zeros(n_clusters, dtype=np.int32)
                best_inertia = None
                cluster_centers = None

                for init_idx in range(n_init):

                    cluster_centers = cluster._init_centroids(
                        X,
                        n_clusters,
                        self.init,
                        random_state=random_state,
                        x_squared_norms=x_squared_norms,
                        init_size=init_size)
                    batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                        X_valid,
                        x_squared_norms[validation_indices],
                        cluster_centers,
                        counts,
                        old_center_buffer,
                        False,
                        distances=None,
                        verbose=False)
                    _, inertia = cluster._labels_inertia(
                        X_valid, x_squared_norms_valid, cluster_centers)
                    if best_inertia is None or inertia < best_inertia:
                        mbk.cluster_centers_ = cluster_centers
                        mbk.counts_ = counts
                        best_inertia = inertia
                        print('best inertia %d' % best_inertia)

                while (True):
                    thread_1 = afficheur('starting threads', labels_true, mbk,
                                         k_means, X, n_clusters)
                    thread_1.start()
                    t0 = time.time()

                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        thread_1.update(mbk)

                    t_mini_batch = time.time() - t0
                    thread_1.stop()
                    thread_1.join()

                    n_iter = self.input_num("Next iterations: ")

                    if n_iter == "stop":
                        return mbk, t_mini_batch
                    if not isinstance(n_iter, int):
                        print('error: an integer is required, got %s' %
                              type(n_iter))
                        break

        except IndexError:
            pass

        try:
            if options[2] == '-pp':

                random_state = check_random_state(None)
                t0 = time.time()
                # Sample a minibatch from the full dataset
                for iteration_idx in range(n_iter - 1):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])

                    thread_1.update(mbk)
                t_mini_batch = time.time() - t0
                thread_1.stop()
                thread_1.join()
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] == '-p':

                random_state = check_random_state(None)
                t0 = time.time()
                for iteration_idx in range(n_iter):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])

                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] == '-n':
                t0 = time.time()
                mbk = mbk.fit(X)
                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch

        except IndexError:
            pass

        try:
            if options[2] is None:
                random_state = check_random_state(None)
                # Sample a minibatch from the full dataset
                t0 = time.time()
                for iteration_idx in range(n_iter - 1):
                    minibatch_indices = random_state.randint(
                        0, n_samples, self.batch_size)
                    mbk = mbk.partial_fit(X,
                                          minibatch_indices=minibatch_indices)
                t_mini_batch = time.time() - t0
                return mbk, t_mini_batch
        except IndexError:
            pass

        try:
            if options[2] == '-o':
                n_batches = int(np.ceil(float(n_samples) / batch_size))
                max_iter = 100

                n_iter = int(max_iter * n_batches)
                tol = 0
                _, n_features = X.shape
                old_center_buffer = np.zeros(n_features, dtype=X.dtype)
                try:
                    #  print('self.max_iter %d , n_batches %d '%(n_iter,n_batches))
                    if options[3] == '-pp':
                        #init state

                        random_state = check_random_state(None)
                        init_size = 3 * batch_size

                        if init_size > n_samples:
                            init_size = n_samples

                        validation_indices = random_state.randint(
                            0, n_samples, init_size)
                        X_valid = X[validation_indices]
                        x_squared_norms = row_norms(X, squared=True)
                        x_squared_norms_valid = x_squared_norms[
                            validation_indices]
                        counts = np.zeros(n_clusters, dtype=np.int32)
                        best_inertia = None
                        cluster_centers = None

                        #Random init with minimum inertia
                        for init_idx in range(n_init):
                            cluster_centers = cluster._init_centroids(
                                X,
                                n_clusters,
                                self.init,
                                random_state=random_state,
                                x_squared_norms=x_squared_norms,
                                init_size=init_size)
                            batch_inertia, centers_squared_diff = cluster._mini_batch_step(
                                X_valid,
                                x_squared_norms[validation_indices],
                                cluster_centers,
                                counts,
                                old_center_buffer,
                                False,
                                distances=None,
                                verbose=False)
                            _, inertia = cluster._labels_inertia(
                                X_valid, x_squared_norms_valid,
                                cluster_centers)
                            if best_inertia is None or inertia < best_inertia:
                                mbk.cluster_centers_ = cluster_centers
                                mbk.counts_ = counts
                                best_inertia = inertia
                                print('best inertia %d' % best_inertia)

                        convergence_context = {}
                        mbk.batch_inertia = batch_inertia
                        mbk.centers_squared_diff = centers_squared_diff
                        t0 = time.time()
                        for iteration_idx in range(n_iter):
                            minibatch_indices = random_state.randint(
                                0, n_samples, batch_size)
                            mbk = mbk.partial_fit(X[minibatch_indices])
                            tol = self._tolerance(X, tol)
                            thread_1.update(mbk)

                            # Monitor convergence and do early stopping if necessary
                            if cluster._mini_batch_convergence(
                                    mbk,
                                    iteration_idx,
                                    n_iter,
                                    tol,
                                    n_samples,
                                    mbk.centers_squared_diff,
                                    mbk.batch_inertia,
                                    convergence_context,
                                    verbose=mbk.verbose):
                                t_mini_batch = time.time() - t0
                                thread_1.stop()
                                thread_1.join()
                                return mbk, t_mini_batch

                    elif options[3] == '-p':
                        random_state = check_random_state(None)
                        convergence_context = {}
                        t0 = time.time()
                        for iteration_idx in range(n_iter):
                            minibatch_indices = random_state.randint(
                                0, n_samples, batch_size)
                            mbk = mbk.partial_fit(X[minibatch_indices])
                            tol = self._tolerance(X, tol)

                            # Monitor convergence and do early stopping if necessary
                            if cluster._mini_batch_convergence(
                                    mbk,
                                    iteration_idx,
                                    n_iter,
                                    tol,
                                    n_samples,
                                    mbk.centers_squared_diff,
                                    mbk.batch_inertia,
                                    convergence_context,
                                    verbose=False):
                                t_mini_batch = time.time() - t0
                                return mbk, t_mini_batch
                except IndexError:
                    pass
        except IndexError:
            pass
Example #17
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    weight_sums = np.zeros(new_centers.shape[0], dtype=np.double)
    weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double)

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(X_mb,
                                                     sample_weight_mb,
                                                     x_mb_squared_norms,
                                                     new_centers,
                                                     weight_sums,
                                                     buffer,
                                                     1,
                                                     None,
                                                     random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(X_mb, sample_weight_mb,
                                          x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers)**2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr,
        sample_weight_mb,
        x_mb_squared_norms_csr,
        new_centers_csr,
        weight_sums_csr,
        buffer_csr,
        1,
        None,
        random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr, sample_weight_mb,
                                                  x_mb_squared_norms_csr,
                                                  new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers)**2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
    def partial_fit(self, X, y=None, sample_weight=None):
        """Update k means estimate on a single mini-batch X.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Coordinates of the data points to cluster. It must be noted that
            X will be copied if it is not C-contiguous.
        y : Ignored
            Not used, present here for API consistency by convention.
        sample_weight : array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).
        Returns
        -------
        self
        """

        X = check_array(X,
                        accept_sparse="csr",
                        order="C",
                        dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=X.dtype)

        if n_samples == 0:
            return self

        # unit-normalize for spherical k-means
        X = normalize(X)

        sample_weight = _check_normalize_sample_weight(sample_weight, X)

        x_squared_norms = row_norms(X, squared=True)
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))
        if (not hasattr(self, 'counts_')
                or not hasattr(self, 'cluster_centers_')):
            # this is the first call to partial_fit on this object:
            # initialize the cluster centers
            self.cluster_centers_ = _init_centroids(
                X,
                self.n_clusters,
                self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms,
                init_size=self.init_size)

            self.counts_ = np.zeros(self.n_clusters, dtype=sample_weight.dtype)
            random_reassign = False
            distances = None
        else:
            # The lower the minimum count is, the more we do random
            # reassignment, however, we don't want to do random
            # reassignment too often, to allow for building up counts
            random_reassign = self.random_state_.randint(
                10 * (1 + self.counts_.min())) == 0
            distances = np.zeros(X.shape[0], dtype=X.dtype)

        self.cluster_centers_ = normalize(self.cluster_centers_)

        _mini_batch_spherical_step(X,
                                   sample_weight,
                                   x_squared_norms,
                                   self.cluster_centers_,
                                   self.counts_,
                                   np.zeros(0, dtype=X.dtype),
                                   0,
                                   random_reassign=random_reassign,
                                   distances=distances,
                                   random_state=self.random_state_,
                                   reassignment_ratio=self.reassignment_ratio,
                                   verbose=self.verbose)
        self.cluster_centers_ = normalize(self.cluster_centers_)

        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, sample_weight, x_squared_norms, self.cluster_centers_)

        return self
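A runnable analogue using the public MiniBatchKMeans (an assumption, not the spherical class above): rows are L2-normalized before each partial_fit call, which approximates the spherical behaviour except that the centers themselves are not re-projected onto the unit sphere.

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
mbk = MiniBatchKMeans(n_clusters=8, batch_size=64, n_init=3, random_state=0)
for _ in range(50):
    batch = normalize(rng.rand(64, 32))   # one unit-normalized mini-batch
    mbk.partial_fit(batch)
labels = mbk.predict(normalize(rng.rand(10, 32)))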
def _mini_batch_spherical_step(X,
                               sample_weight,
                               x_squared_norms,
                               centers,
                               weight_sums,
                               old_center_buffer,
                               compute_squared_diff,
                               distances,
                               random_reassign=False,
                               random_state=None,
                               reassignment_ratio=.01,
                               verbose=False):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE
    weight_sums : array, shape (k,)
        The vector in which we keep track of the accumulated sample weights in
        each cluster. This array is MODIFIED IN PLACE
    distances : array, dtype float, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization and to
        pick new clusters amongst observations with uniform probability. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.
    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.
    verbose : bool, optional, default False
        Controls the verbosity.
    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.
    old_center_buffer : int
        Copy of old centers for monitoring convergence.
    Returns
    -------
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = _labels_inertia(X,
                                              sample_weight,
                                              x_squared_norms,
                                              centers,
                                              distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low weight
        to_reassign = weight_sums < reassignment_ratio * weight_sums.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = \
                np.argsort(weight_sums)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = random_state.choice(X.shape[0],
                                              replace=False,
                                              size=n_reassigns)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers." %
                      n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(
                    X, new_centers.astype(np.intp, copy=False),
                    np.where(to_reassign)[0].astype(np.intp, copy=False),
                    centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        weight_sums[to_reassign] = np.min(weight_sums[~to_reassign])

    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        return inertia, _mini_batch_update_csr(X, sample_weight,
                                               x_squared_norms, centers,
                                               weight_sums, nearest_center,
                                               old_center_buffer,
                                               compute_squared_diff)

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    squared_diff = 0.0
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        wsum = sample_weight[center_mask].sum()

        if wsum > 0:
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            centers[center_idx] *= weight_sums[center_idx]

            # inplace sum with new points members of this cluster
            centers[center_idx] += \
                np.sum(X[center_mask] *
                       sample_weight[center_mask, np.newaxis], axis=0)

            # unit-normalize for spherical k-means
            centers[center_idx] = normalize(centers[center_idx, None])[:, 0]

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return inertia, squared_diff
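Tiny numeric illustration (assumed values) of the reassignment test used above: centers whose accumulated weight falls below reassignment_ratio times the largest weight become candidates for random reassignment.

import numpy as np

weight_sums = np.array([120.0, 3.0, 90.0, 0.5])
reassignment_ratio = 0.01
to_reassign = weight_sums < reassignment_ratio * weight_sums.max()
# the threshold is 1.2, so only the last center (weight 0.5) is flagged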
Example #20
    def partial_fit(self, X, y=None):
        """Override partial_fit() in MiniBatchKMeans class 
           (Jan-16: added a return var: squared_diff)
           (April-16: changed set_eta as an internal step)
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Coordinates of the data points to cluster.
        """

        X = check_array(X, accept_sparse="csr")
        n_samples, n_features = X.shape
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        if n_samples == 0:
            return self

        x_squared_norms = row_norms(X, squared=True)
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))
        if (not hasattr(self, 'counts_')
                or not hasattr(self, 'cluster_centers_')):
            # this is the first call to partial_fit on this object:
            # initialize the cluster centers
            # pdb.set_trace()
            if hasattr(self.init, '__array__'):
                self.cluster_centers_ = self.init

            else:
                self.cluster_centers_ = k_means_._init_centroids(
                    X,
                    self.n_clusters,
                    self.init,
                    random_state=self.random_state_,
                    x_squared_norms=x_squared_norms,
                    init_size=self.init_size)

            _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                               self.cluster_centers_)
            print "Cost of current initial centers on the mini-batch is %r " % cost

            self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
            #random_reassign = False
            distances = None
            self.curr_iter = 1
        else:
            # The lower the minimum count is, the more we do random
            # reassignment, however, we don't want to do random
            # reassignment too often, to allow for building up counts
            #random_reassign = self.random_state_.randint(
            #    10 * (1 + self.counts_.min())) == 0
            distances = np.zeros(X.shape[0], dtype=np.float64)
            """ modification HERE  """
            #self.set_eta()
            self.cluster_centers_, self.squared_diff, _ = MB_step(
                X,
                x_squared_norms,
                self.cluster_centers_,
                self.counts_,
                self.curr_iter,
                np.zeros(0, np.double),
                0,
                random_reassign=False,
                distances=distances,
                random_state=self.random_state_,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose,
                learn_rate=self.set_eta())
            self.curr_iter = self.curr_iter + 1
        if self.compute_labels:
            self.labels_, self.inertia_ = k_means_._labels_inertia(
                X, x_squared_norms, self.cluster_centers_)

        return self
Example #21
def _kmeans_step(X, x_squared_norms, centers,
                    distances,
                    precompute_distances,
                    n_clusters,
                    random_state=None):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        The original data array.
        x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
        centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE
        distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
        random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
        
        Returns
        -------
        inertia : float
        Sum of distances of samples to their closest cluster center.
        squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.
    """
    centers_old = centers.copy()
    # labels assignment is also called the E-step of EM
    labels, inertia = k_means_._labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    # computation of the means is also called the M-step of EM
    if sp.issparse(X):
        centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
    else:
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
    """       if best_inertia is None or inertia < best_inertia:
              best_labels = labels.copy()
              best_centers = centers.copy()
              best_inertia = inertia
    """
    shift = squared_norm(centers_old - centers)
    """        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)

            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
    """                                                                                                
    return centers,inertia, shift
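
_kmeans_step above is just one E-step (label assignment) followed by one M-step (mean recomputation), plus a measure of how far the centers moved. A toy, dependency-free numpy sketch of the same pair of steps (names are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(50, 2))
centers_toy = X_toy[rng.choice(50, size=3, replace=False)].copy()

# E-step: assign each sample to its nearest center
d2 = ((X_toy[:, None, :] - centers_toy[None, :, :]) ** 2).sum(axis=2)
labels_toy = d2.argmin(axis=1)
inertia_toy = d2[np.arange(50), labels_toy].sum()

# M-step: recompute each center as the mean of its assigned samples
old = centers_toy.copy()
for k in range(3):
    if (labels_toy == k).any():
        centers_toy[k] = X_toy[labels_toy == k].mean(axis=0)

shift_toy = ((centers_toy - old) ** 2).sum()   # same quantity as squared_norm(centers_old - centers)
print(inertia_toy, shift_toy)
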
def subspace_kmeans_single(X,
                           sample_weight,
                           n_clusters,
                           init='k-means++',
                           max_iter=300,
                           tol=1e-4,
                           tol_eig=-1e-10,
                           verbose=False,
                           x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X,
                              n_clusters,
                              init,
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===

        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean', metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)

        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters,
                                          distances)

        # === Beginning of original implementation of M-step of EM ===

        S = np.zeros((d, d))
        # use a dedicated loop variable so the outer iteration counter `i`
        # is not shadowed
        for cluster_idx in range(n_clusters):
            X_c = X[labels == cluster_idx] - centers[cluster_idx]
            S += np.dot(X_c.T, X_c)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)  # ascending order of eigenvalues
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for cluster_idx in range(n_clusters):
            inertia += row_norms(X[labels == cluster_idx] - centers[cluster_idx],
                                 squared=True).sum()

        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e" %
                      (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
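
The subspace-specific part of the loop above reduces to: build the within-cluster scatter S, subtract the dataset scatter S_D, and keep the eigenvectors of S - S_D whose eigenvalues fall below tol_eig as the clustered subspace. A small self-contained numpy sketch of just that update (toy data, illustrative names):

import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.normal(size=(40, 4))
labels_toy = rng.randint(0, 2, size=40)
centers_toy = np.vstack([X_toy[labels_toy == k].mean(axis=0) for k in range(2)])

S_D = X_toy.T @ X_toy                          # scatter of the whole dataset
S = np.zeros((4, 4))
for k in range(2):                             # within-cluster scatter
    Xk = X_toy[labels_toy == k] - centers_toy[k]
    S += Xk.T @ Xk

evals, evecs = np.linalg.eigh(S - S_D)         # eigh returns eigenvalues in ascending order
V = evecs                                      # rotation into the new basis
m = int((evals < -1e-10).sum())                # dimensionality of the clustered space
P_C = np.eye(m, M=4).T                         # projection onto the first m axes
print(m, (X_toy @ V @ P_C).shape)
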
Example #23
def MB_step(X,
            x_squared_norms,
            centers,
            counts,
            curr_iter,
            old_center_buffer,
            compute_squared_diff,
            distances,
            random_reassign=False,
            random_state=None,
            reassignment_ratio=.01,
            verbose=False,
            learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.
    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.
    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.
    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE
    counts : array, shape (k,)
         The vector in which we keep track of the numbers of elements in a
         cluster. This array is MODIFIED IN PLACE
    distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to store
        the distances of each sample to its closest center.
        May not be None when random_reassign is True.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.
    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.
    verbose : bool, optional, default False
        Controls the verbosity.
    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.
    old_center_buffer : array, shape (n_features,)
        Copy of the old centers for monitoring convergence.
    curr_iter : int
        Index of the current minibatch iteration (forwarded to the CSR update).
    learn_rate : float, optional (default=0.0)
        Learning rate for the center updates; if 0, a count-based rate
        (this_count / new_count) is used instead.

    Returns
    -------
    centers : array, shape (k, n_features)
        The updated cluster centers.
    squared_diff : float
        Sum of squared distances between previous and updated cluster
        centers (0.0 when compute_squared_diff is False).
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X,
                                                       x_squared_norms,
                                                       centers,
                                                       distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = choice(X.shape[0],
                                 replace=False,
                                 size=n_reassigns,
                                 random_state=random_state)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers." %
                      n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, astype(new_centers, np.intp),
                                astype(np.where(to_reassign)[0], np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0
    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        if compute_squared_diff:
            # keep an actual copy (not an alias) of the old centers so the
            # squared diff below compares against the pre-update values
            old_center_buffer = centers.copy()
        #rand_vec = make_rand_vector(X.shape[1])
        #learn_rate = 0.0
        centers = _MB_step._mini_batch_update_csr(X, x_squared_norms, centers,
                                                  counts, nearest_center,
                                                  old_center_buffer,
                                                  compute_squared_diff,
                                                  curr_iter, learn_rate)

        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()

        return centers, squared_diff, inertia

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()
        counts[center_idx] += this_count  # update counts

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            #centers[center_idx] *= counts[center_idx]

            # inplace sum with new points members of this cluster
            #centers[center_idx] += np.sum(X[center_mask], axis=0)

            # update the count statistics for this center
            #counts[center_idx] += count

            # inplace rescale to compute mean of all points (old and new)
            #centers[center_idx] /= counts[center_idx]
            new_center = np.sum(X[center_mask], axis=0)
            if learn_rate == 0.0:
                learn_rate = (new_count - old_count) / float(new_count)

            centers[center_idx] = centers[center_idx] + learn_rate * (
                new_center / (new_count - old_count) - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
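
In the dense branch of MB_step above a center is not recomputed as a running mean; it is moved toward the minibatch mean of its assigned points with step size learn_rate, falling back to the count-based rate this_count / new_count when learn_rate is 0. A toy numpy sketch of that single-center update (illustrative names, not from the function above):

import numpy as np

rng = np.random.RandomState(0)
batch_members = rng.normal(size=(10, 3))       # minibatch points assigned to this center
center = rng.normal(size=3)

old_count = 20                                  # points seen by this center so far
this_count = batch_members.shape[0]
new_count = old_count + this_count

learn_rate = 0.0
if learn_rate == 0.0:
    learn_rate = (new_count - old_count) / float(new_count)   # == this_count / new_count

batch_mean = batch_members.sum(axis=0) / (new_count - old_count)
center = center + learn_rate * (batch_mean - center)
print(learn_rate, center)
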
Example #24
 def func(dat_matrix):
     x_squared_norms = row_norms(dat_matrix, squared=True)
     inertias = _labels_inertia(dat_matrix, x_squared_norms,
                                km.cluster_centers_)[1]
     return inertias
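
The helper above only forwards to _labels_inertia with fixed centers; the quantity it returns is the sum of squared distances from each row to its nearest center. A tiny numpy equivalent on toy data (illustrative names) that can serve as a cross-check:

import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=(30, 4))
cents = data[:3].copy()                        # pretend these are fitted centers

d2 = ((data[:, None, :] - cents[None, :, :]) ** 2).sum(axis=2)
inertia = d2.min(axis=1).sum()                 # distance to the closest center, summed over samples
print(inertia)
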
Example #25
    def fit(self, X, y=None):
        """Compute the centroids on X by chunking it into mini-batches.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        """
        random_state = check_random_state(self.random_state)
        X = check_array(X,
                        accept_sparse="csr",
                        order='C',
                        dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples smaller than number "
                             "of clusters.")

        n_init = self.n_init
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
            if n_init != 1:
                warnings.warn(
                    'Explicit initial center position passed: '
                    'performing only one init in MiniBatchKMeans instead of '
                    'n_init=%d' % self.n_init,
                    RuntimeWarning,
                    stacklevel=2)
                n_init = 1

        x_squared_norms = k_means_.row_norms(X, squared=True)

        if self.tol > 0.0:
            tol = k_means_._tolerance(X, self.tol)

            # using tol-based early stopping needs the allocation of a
            # dedicated buffer, which can be expensive for high dim data:
            # hence we allocate it outside of the main loop
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
        else:
            tol = 0.0
            # no need for the center buffer if tol-based early stopping is
            # disabled
            old_center_buffer = np.zeros(0, dtype=X.dtype)

        distances = np.zeros(self.batch_size, dtype=X.dtype)
        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        n_iter = int(self.max_iter * n_batches)

        init_size = self.init_size
        if init_size is None:
            init_size = 3 * self.batch_size
        if init_size > n_samples:
            init_size = n_samples
        self.init_size_ = init_size

        validation_indices = random_state.randint(0, n_samples, init_size)
        X_valid = X[validation_indices]
        x_squared_norms_valid = x_squared_norms[validation_indices]

        # perform several inits with random sub-sets
        best_inertia = None
        for init_idx in range(n_init):
            if self.verbose:
                print("Init %d/%d with method: %s" %
                      (init_idx + 1, n_init, self.init))
            counts = np.zeros(self.n_clusters, dtype=np.int32)

            # TODO: once the `k_means` function works with sparse input we
            # should refactor the following init to use it instead.

            # Initialize the centers using only a fraction of the data as we
            # expect n_samples to be very large when using MiniBatchKMeans
            cluster_centers = k_means_._init_centroids(
                X,
                self.n_clusters,
                self.init,
                random_state=random_state,
                x_squared_norms=x_squared_norms,
                init_size=init_size)

            # Compute the label assignment on the init dataset
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X_valid,
                x_squared_norms[validation_indices],
                cluster_centers,
                counts,
                old_center_buffer,
                False,
                distances=None,
                verbose=self.verbose)

            # Keep only the best cluster centers across independent inits on
            # the common validation set
            _, inertia = k_means_._labels_inertia(X_valid,
                                                  x_squared_norms_valid,
                                                  cluster_centers)
            if self.verbose:
                print("Inertia for init %d/%d: %f" %
                      (init_idx + 1, n_init, inertia))
            if best_inertia is None or inertia < best_inertia:
                self.cluster_centers_ = cluster_centers
                self.counts_ = counts
                best_inertia = inertia

        # Empty context to be used inplace by the convergence check routine
        convergence_context = {}

        # Perform the iterative optimization until the final convergence
        # criterion
        for iteration_idx in range(n_iter):
            # Sample a minibatch from the full dataset
            minibatch_indices = random_state.randint(0, n_samples,
                                                     self.batch_size)

            # Perform the actual update step on the minibatch data
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X[minibatch_indices],
                x_squared_norms[minibatch_indices],
                self.cluster_centers_,
                self.counts_,
                old_center_buffer,
                tol > 0.0,
                distances=distances,
                # Here we randomly choose whether to perform
                # random reassignment: the choice is done as a function
                # of the iteration index, and the minimum number of
                # counts, in order to force this reassignment to happen
                # every once in a while
                random_reassign=((iteration_idx + 1) %
                                 (10 + self.counts_.min()) == 0),
                random_state=random_state,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose)

            # Monitor convergence and do early stopping if necessary
            if k_means_._mini_batch_convergence(self,
                                                iteration_idx,
                                                n_iter,
                                                tol,
                                                n_samples,
                                                centers_squared_diff,
                                                batch_inertia,
                                                convergence_context,
                                                verbose=self.verbose):
                break

        self.n_iter_ = iteration_idx + 1

        if self.compute_labels:
            self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

        return self
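
The comment in the main loop above explains when random reassignment is triggered: on iterations where (iteration_idx + 1) is a multiple of (10 + counts.min()), so centers are only shaken up once in a while, and more often when some cluster has a very low count. A toy illustration of that schedule:

import numpy as np

counts = np.array([3, 40, 25])                 # toy per-cluster counts
period = 10 + counts.min()                     # 13 iterations between reassignment attempts
reassign_iters = [it for it in range(40) if (it + 1) % period == 0]
print(reassign_iters)                          # [12, 25, 38]
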
Example #26
 def _e_step(self):
     labels, _ = _labels_inertia(self.X, self.x_squared_norms, self.ukList, precompute_distances=True)
     self.rkn = np.array([labels == k for k in range(self.K)])
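
The _e_step above turns the hard label vector into a boolean K x n_samples indicator matrix (rkn). A tiny toy equivalent (illustrative names):

import numpy as np

labels_toy = np.array([0, 2, 1, 2, 0])
K = 3
rkn_toy = np.array([labels_toy == k for k in range(K)])   # shape (K, n_samples), boolean
print(rkn_toy.astype(int))
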
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts,
        buffer, 1, None, random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1, None, random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
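
The consistency test above relies on feeding the exact same minibatch through the dense and the CSR code paths. A minimal sketch of how such a mirrored pair can be built and checked (toy data, illustrative names):

import numpy as np
import scipy.sparse as sp
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X_dense_toy = rng.normal(size=(10, 4))
X_csr_toy = sp.csr_matrix(X_dense_toy)         # same values, CSR layout

assert np.allclose(X_csr_toy.toarray(), X_dense_toy)
print((X_dense_toy ** 2).sum(axis=1)[:3])      # dense squared norms
print(row_norms(X_csr_toy, squared=True)[:3])  # identical values from the CSR path
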
Example #28
## X = np.random.normal(size=(n_samples, n_features))

tol = 1e-4

## print("\n-- scipy.cluster.vq")
## ratio = 1.
## np.random.seed(random_state)
## sc, _ = utils.timeit(profile(kmeans))(X, n_clusters, iter=2,
##                                       thresh=tol / ratio)
## ## utils.cache_value(sc, 'prof_kmeans/scipy_kmeans_%d_%d'
## ##                   % (n_samples, n_features))
## inertia1 = _labels_inertia(X, (X ** 2).sum(axis=-1), sc)[1]
## print('scipy inertia: %.1f' % np.sqrt(inertia1))

print("\n-- sklearn.cluster")
ratio = 1. #np.mean(np.var(X, axis=0))  # just to make the comparison fair.

np.random.seed(random_state)
sk, _, _ = utils.timeit(profile(k_means))(X, n_clusters, n_init=2,
                                          tol=tol / ratio,
                                          init="random",
                                          random_state=random_state)
## utils.cache_value(sk, 'prof_kmeans/sklearn_kmeans_%d_%d' %
##                   (n_samples, n_features))
inertia2 = _labels_inertia(X, (X ** 2).sum(axis=-1), sk)[1]
print('inertia: %.1f' % np.sqrt(inertia2))

## print ('\nsklearn - scipy inertia: %.1f. Relative variation: %.1e' %
##        ((inertia2 - inertia1), (inertia2 - inertia1) / (
##            2. * (inertia1 + inertia2))))