def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference ("gold")
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = np.full(n_samples, -1, dtype=np.intp)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert (mindist >= 0.0).all()
    assert (labels_gold != -1).all()

    sample_weight = None

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, sample_weight, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, sample_weight, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
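
# A minimal, self-contained sketch of the "gold" assignment rule the test
# above audits: each sample gets the index of its nearest center and the
# inertia is the sum of squared distances to those nearest centers. The
# demo data shapes below are illustrative assumptions, not the fixtures
# used by the test module.
import numpy as np


def brute_force_labels_inertia(X, centers):
    """Assign each row of X to its closest center (squared Euclidean)."""
    # distances has shape (n_samples, n_clusters)
    distances = ((X[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
    labels = distances.argmin(axis=1)
    inertia = distances.min(axis=1).sum()
    return labels, inertia


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X_demo = rng.rand(50, 3)
    centers_demo = rng.rand(4, 3)
    labels, inertia = brute_force_labels_inertia(X_demo, centers_demo)
    assert labels.shape == (50,) and inertia >= 0.0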
def test_minibatch_reassign(data):
    # Check the reassignment part of the minibatch step with very high or very
    # low reassignment ratio.
    perfect_centers = np.empty((n_clusters, n_features))
    for i in range(n_clusters):
        perfect_centers[i] = X[true_labels == i].mean(axis=0)

    x_squared_norms = row_norms(data, squared=True)
    sample_weight = np.ones(n_samples)
    centers_new = np.empty_like(perfect_centers)

    # Give a perfect initialization, but a large reassignment_ratio, as a
    # result many centers should be reassigned and the model should no longer
    # be good
    score_before = -_labels_inertia(data, sample_weight, x_squared_norms,
                                    perfect_centers, 1)[1]
    _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers,
                     centers_new, np.zeros(n_clusters),
                     np.random.RandomState(0), random_reassign=True,
                     reassignment_ratio=1)
    score_after = -_labels_inertia(data, sample_weight, x_squared_norms,
                                   centers_new, 1)[1]
    assert score_before > score_after

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned.
    _mini_batch_step(data, x_squared_norms, sample_weight, perfect_centers,
                     centers_new, np.zeros(n_clusters),
                     np.random.RandomState(0), random_reassign=True,
                     reassignment_ratio=1e-15)
    assert_allclose(centers_new, perfect_centers)
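
# Hedged sketch of the same effect through the public estimator: starting
# MiniBatchKMeans from the true blob centers, an extreme reassignment_ratio
# makes well-placed centers eligible for random reassignment, while a tiny
# one leaves the perfect initialization essentially untouched. The dataset
# sizes and parameters below are illustrative assumptions.
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs

X_demo, y_demo, centers_demo = make_blobs(
    n_samples=300, centers=5, random_state=0, return_centers=True)

for ratio in (1e-15, 1.0):
    mbk = MiniBatchKMeans(n_clusters=5, init=centers_demo, n_init=1,
                          max_iter=1, reassignment_ratio=ratio,
                          random_state=0).fit(X_demo)
    print("reassignment_ratio=%g -> inertia=%.2f" % (ratio, mbk.inertia_))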
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)

    centers_old = centers + rng.normal(size=centers.shape)
    centers_old_csr = centers_old.copy()

    centers_new = np.zeros_like(centers_old)
    centers_new_csr = np.zeros_like(centers_old_csr)

    weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype)
    weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    sample_weight = np.ones(X.shape[0], dtype=X.dtype)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]
    sample_weight_mb = sample_weight[:10]

    # step 1: compute the dense minibatch update
    old_inertia = _mini_batch_step(
        X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new,
        weight_sums, np.random.RandomState(0), random_reassign=False)
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, centers_new)
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # step 2: compute the sparse minibatch update
    old_inertia_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr,
        centers_new_csr, weight_sums_csr, np.random.RandomState(0),
        random_reassign=False)
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr)
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_allclose(centers_new, centers_new_csr)
    assert_allclose(old_inertia, old_inertia_csr)
    assert_allclose(new_inertia, new_inertia_csr)
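
# Illustrative companion using the public estimator: with an identical
# deterministic init and seed, fitting on a dense array and on its CSR copy
# should land on (numerically) matching centers, mirroring the private-API
# consistency asserted above. The tolerance is loose since summation order
# may differ between the dense and sparse code paths; sizes are assumptions.
import numpy as np
import scipy.sparse as sp
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(42)
X_demo = rng.rand(100, 4)
X_demo_csr = sp.csr_matrix(X_demo)

dense = MiniBatchKMeans(n_clusters=3, init=X_demo[:3], n_init=1,
                        random_state=0).fit(X_demo)
sparse = MiniBatchKMeans(n_clusters=3, init=X_demo[:3], n_init=1,
                         random_state=0).fit(X_demo_csr)
np.testing.assert_allclose(dense.cluster_centers_, sparse.cluster_centers_,
                           rtol=1e-5)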
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = self._check_test_data(X)
    daal_ready = sample_weight is None and hasattr(X, '__array__')
    # or sp.isspmatrix_csr(X)
    if daal_ready:
        return _daal4py_k_means_dense(
            X, self.n_clusters, 0, 0.0, self.cluster_centers_, 1, None)[1]
    else:
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
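
# Why the fallback path precomputes row_norms(X, squared=True): the label
# assignment can expand ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, so the
# per-sample squared norms are computed once and reused for every center.
# A minimal numpy sketch of that expansion (illustrative, not the actual
# _labels_inertia internals):
import numpy as np


def labels_via_norm_expansion(X, centers):
    x_sq = (X ** 2).sum(axis=1)        # ||x||^2 per sample, shape (n,)
    c_sq = (centers ** 2).sum(axis=1)  # ||c||^2 per center, shape (k,)
    # full (n, k) squared-distance matrix; the cross term is a single GEMM
    d2 = x_sq[:, None] - 2.0 * (X @ centers.T) + c_sq[None, :]
    return d2.argmin(axis=1)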
def _predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = _daal4py_check_test_data(self, X)

    _patching_status = PatchingConditionsChain(
        "sklearn.cluster.KMeans.predict")
    _patching_status.and_conditions([
        (sample_weight is None, "Sample weights are not supported."),
        (hasattr(X, '__array__'), "X does not have '__array__' attribute.")
    ])
    _dal_ready = _patching_status.or_conditions([
        (sp.isspmatrix_csr(X), "X is not sparse.")
    ])
    _patching_status.write_log()

    if _dal_ready:
        return _daal4py_k_means_predict(
            X, self.n_clusters, self.cluster_centers_)[0]
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, sample_weight, x_squared_norms,
                           self.cluster_centers_)[0]
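
# A simplified toy model of the gating logic above (assumption: this only
# mimics the boolean semantics used here; the real PatchingConditionsChain
# lives in the daal4py/sklearnex sources and also handles logging). All
# and_conditions must hold, or any or_condition can still enable offload.
class _ToyConditionsChain:
    def __init__(self, scope):
        self.scope = scope
        self._and_ok = True

    def and_conditions(self, conditions):
        # every (flag, reason) pair must be true for the AND block to pass
        for ok, _reason in conditions:
            self._and_ok = self._and_ok and ok
        return self._and_ok

    def or_conditions(self, conditions):
        # a single true OR condition rescues the offload decision
        return self._and_ok or any(ok for ok, _reason in conditions)


chain = _ToyConditionsChain("sklearn.cluster.KMeans.predict")
chain.and_conditions([(True, "weights ok"), (False, "no '__array__'")])
assert chain.or_conditions([(True, "X is CSR")])  # CSR input still offloads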
def _predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)

    X = _daal4py_check_test_data(self, X)

    if sample_weight is None and hasattr(X, '__array__') or \
            sp.isspmatrix_csr(X):
        logging.info("sklearn.cluster.KMeans."
                     "predict: " + get_patch_message("daal"))
        return _daal4py_k_means_predict(
            X, self.n_clusters, self.cluster_centers_)[0]
    logging.info("sklearn.cluster.KMeans."
                 "predict: " + get_patch_message("sklearn"))
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, sample_weight, x_squared_norms,
                           self.cluster_centers_)[0]
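
# Note on the dispatch condition above: Python parses `A and B or C` as
# `(A and B) or C`, so the daal branch is taken either for dense inputs
# without sample weights or for any CSR matrix. A quick sanity check of
# that precedence:
assert (False and True or True) is True     # the CSR term alone suffices
assert (True and True or False) is True     # dense, unweighted input suffices
assert (False and False or False) is False  # otherwise fall back to sklearn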
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    weight_sums = np.zeros(new_centers.shape[0], dtype=np.double)
    weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]
    sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double)

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums,
        buffer, 1, None, random_reassign=False)
    assert old_inertia > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers)
    assert new_inertia > 0.0
    assert new_inertia < old_inertia

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr,
        weight_sums_csr, buffer_csr, 1, None, random_reassign=False)
    assert old_inertia_csr > 0.0

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr)
    assert new_inertia_csr > 0.0
    assert new_inertia_csr < old_inertia_csr

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
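
# Standalone sketch of the invariant checked above: squared center shifts
# accumulated incrementally, one center update at a time, must match the
# squared Frobenius distance between the before/after center snapshots.
# Names and the fake per-center updates are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(0)
old = rng.rand(5, 3)
new = old.copy()

incremental = 0.0
for j in range(new.shape[0]):
    shift = 0.01 * rng.rand(3)          # stand-in for a minibatch update
    new[j] += shift
    incremental += (shift ** 2).sum()   # accumulate this center's shift

assert np.isclose(incremental, ((new - old) ** 2).sum())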
def _fuzzykmeans_single_lloyd(X, m, sample_weight, n_clusters, max_iter=300,
                              init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4, precompute_distances=True):
    """A single run of fuzzy k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    m : float
        Fuzziness exponent (m > 1) used in the membership and center
        updates; larger values give softer cluster assignments.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    n_samples, n_features = X.shape
    random_state = check_random_state(random_state)

    fuzzy_labels = random_state.rand(n_samples, n_clusters)
    fuzzy_labels /= fuzzy_labels.sum(axis=1)[:, np.newaxis]
    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_fuzzy_labels, best_labels, best_inertia, best_centers = \
        None, None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    centers = m_step(X, fuzzy_labels, m)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        fuzzy_labels, labels = e_step(X, centers, m)

        # computation of the means is also called the M-step of EM
        # if sp.issparse(X):
        #     centers = _k_means._centers_sparse(X, sample_weight, labels,
        #                                        n_clusters, distances)
        # else:
        #     centers = _k_means._centers_dense(X, sample_weight, labels,
        #                                       n_clusters, distances)
        centers = m_step(X, fuzzy_labels, m)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_fuzzy_labels = fuzzy_labels.copy()
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        best_fuzzy_labels, best_labels = e_step(X, best_centers, m)

    return best_fuzzy_labels, best_labels, best_inertia, best_centers, i + 1