def test_check_normalize_sample_weight():
    """Check how a missing sample weight is expanded against ``X``.

    With ``sample_weight=None`` the checked weights must have one entry per
    sample of ``X``, sum (almost) to the number of samples, and share the
    dtype of ``X``.

    ``X``, ``_num_samples`` and ``assert_almost_equal`` are taken from the
    enclosing module (not visible in this block).
    """
    from sklearn.cluster._kmeans import _check_normalize_sample_weight

    weights = _check_normalize_sample_weight(None, X)
    n_expected = _num_samples(X)

    assert n_expected == _num_samples(weights)
    assert_almost_equal(weights.sum(), n_expected)
    assert X.dtype == weights.dtype
def _fuzzykmeans_single_elkan(X, m, sample_weight, n_clusters, max_iter=300,
                              init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4, precompute_distances=True):
    """Run one fuzzy k-means pass seeded by Elkan's k-means.

    The centers are initialised, refined with ``k_means_elkan``, and then one
    ``e_step`` / ``m_step`` pair produces the fuzzy memberships and the final
    centers.

    Parameters
    ----------
    X : dense array, shape (n_samples, n_features)
        Observations to cluster. Sparse input is rejected.
    m : forwarded unchanged to ``m_step`` and ``e_step``
        Presumably the fuzziness exponent -- TODO confirm against those
        helpers.
    sample_weight : array-like or None
        Per-sample weights; normalised by ``_check_normalize_sample_weight``.
    n_clusters : int
        Number of clusters.
    max_iter : int, default 300
        Forwarded to ``k_means_elkan``.
    init : default 'k-means++'
        Forwarded to ``_init_centroids``.
    verbose : bool, default False
        Print a message after initialisation; also forwarded to
        ``k_means_elkan``.
    x_squared_norms : array or None
        Squared row norms of ``X``; computed with ``row_norms`` when None.
    random_state : int, RandomState instance or None
        Seed handled by ``check_random_state``.
    tol : float, default 1e-4
        Forwarded to ``k_means_elkan``.
    precompute_distances : bool, default True
        Accepted for signature compatibility; not read by this function.

    Returns
    -------
    fuzzy_labels, labels
        The two values returned by ``e_step(X, centers, m)``.
    inertia : float
        (Weighted) sum of squared differences between each sample and the
        center selected by ``labels``.
    centers
        Result of the final ``m_step``.
    n_iter
        Iteration count reported by ``k_means_elkan``.

    Raises
    ------
    TypeError
        If ``X`` is sparse.
    """
    if sp.issparse(X):
        raise TypeError("algorithm='elkan' not supported for sparse input X")

    # The unpacking also rejects anything that is not two-dimensional.
    n_samples, _ = X.shape
    rng = check_random_state(random_state)

    # Random memberships with every row rescaled to sum to one. They are
    # drawn before the centroid initialisation, so the random stream is
    # consumed in this order.
    memberships = rng.rand(n_samples, n_clusters)
    row_totals = memberships.sum(axis=1)
    memberships /= row_totals[:, np.newaxis]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    seeds = _init_centroids(X, n_clusters, init, random_state=rng,
                            x_squared_norms=x_squared_norms)
    # NOTE(review): the initial centers (not X) are handed to m_step here,
    # whereas the final call below passes X -- confirm this is intended.
    seeds = np.ascontiguousarray(m_step(seeds, memberships, m))
    if verbose:
        print('Initialization complete')

    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
    # The hard labels from Elkan are discarded: e_step recomputes them.
    centers, _elkan_labels, n_iter = k_means_elkan(
        X, checked_sample_weight, n_clusters, seeds,
        tol=tol, max_iter=max_iter, verbose=verbose)

    fuzzy_labels, labels = e_step(X, centers, m)
    centers = m_step(X, fuzzy_labels, m)

    squared_residuals = (X - centers[labels]) ** 2
    if sample_weight is None:
        inertia = np.sum(squared_residuals, dtype=np.float64)
    else:
        per_sample = np.sum(squared_residuals, axis=1, dtype=np.float64)
        inertia = np.sum(per_sample * checked_sample_weight,
                         dtype=np.float64)

    return fuzzy_labels, labels, inertia, centers, n_iter
def _labels_inertia(norm, X, sample_weight, centers,
                    precompute_distances=True, distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    norm : 'l1' or 'l2'
        ``'l2'`` delegates to ``_labels_inertia_skl``; any other value
        follows the ``'l1'`` code path.
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels. Sparse input is only
        accepted when ``norm == 'l2'``.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    centers : float array, shape (k, n_features)
        The cluster centers.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).
        Must be true for norm ``'l1'``.
    distances : float array, shape (n_samples,), optional
        Pre-allocated array to be filled in with each sample's distance to
        its closest center. An empty array is used when None.

    Returns
    -------
    labels : int array of shape(n)
        The resulting assignment
    inertia : float
        Inertia of the samples with respect to their closest cluster
        center, as computed by the delegated helper.

    Raises
    ------
    NotImplementedError
        For norm ``'l1'`` when *X* is sparse or *precompute_distances*
        is false.
    """
    if norm == 'l2':
        # Forward the caller's buffer so that the in-place update of
        # `distances` also happens for the 'l2' norm.
        # NOTE(review): x_squared_norms is passed as None; the helper's
        # non-precomputed and sparse paths hand it on to the assignment
        # routines -- confirm they accept None.
        return _labels_inertia_skl(
            X, sample_weight=sample_weight, centers=centers,
            precompute_distances=precompute_distances,
            x_squared_norms=None, distances=distances)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)
    if distances is None:
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)

    # distances will be changed in-place
    if issparse(X):
        raise NotImplementedError(  # pragma no cover
            "Sparse matrix is not implemented for norm 'l1'.")
    if precompute_distances:
        return _labels_inertia_precompute_dense(
            norm=norm, X=X, sample_weight=sample_weight,
            centers=centers, distances=distances)
    raise NotImplementedError(  # pragma no cover
        "precompute_distances is False, not implemented for norm 'l1'.")
def _kmeans_single_lloyd(norm, X, sample_weight, n_clusters, max_iter=300,
                         init='k-means++', verbose=False,
                         random_state=None, tol=1e-4,
                         precompute_distances=True):
    """
    A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    norm : 'l1' or 'l2'
        Forwarded to ``_init_centroids`` and ``_labels_inertia``.
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.
    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
        Must be strictly positive.
    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    verbose : boolean, optional
        Verbosity mode
    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    tol : float, optional
        Convergence threshold: iterations stop as soon as the sum of the
        absolute differences between the previous and the updated centers
        is lower than or equal to this value.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    Returns
    -------
    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    inertia : float
        The final value of the inertia criterion, as returned by
        ``_labels_inertia``.
    centroid : float ndarray with shape (k, n_features)
        Centroids kept from the iteration with the lowest inertia.
    n_iter : int
        Number of iterations run.

    Raises
    ------
    ValueError
        If *max_iter* is not strictly positive.
    """
    # Without at least one iteration there is neither a center shift nor an
    # iteration count to report, so fail early with an explicit message.
    if max_iter <= 0:
        raise ValueError(
            "Number of iterations should be a positive number, "
            "got %r instead" % (max_iter, ))

    random_state = check_random_state(random_state)
    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(norm, X, n_clusters, init,
                              random_state=random_state)
    if verbose:  # pragma no cover
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = numpy.zeros(shape=(X.shape[0], ), dtype=X.dtype)
    # Per-column ordering of X, computed once and handed to every M-step.
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            norm, X, sample_weight, centers,
            precompute_distances=precompute_distances,
            distances=distances)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        # best_centers holds the centers *after* this M-step while
        # best_labels / best_inertia were computed from the centers before
        # it; the E-step rerun after the loop reconciles the two.
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            norm, X, sample_weight, best_centers,
            precompute_distances=precompute_distances,
            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,
                        precompute_distances=True, distances=None):
    """E step of the K-means EM algorithm (euclidean case).

    Assign every sample to a center and compute the inertia.
    ``distances`` is handed to the assignment routine, which is expected to
    fill it in place.

    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.
    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.
    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point. Only read
        on the sparse path and on the dense path without precomputed
        distances.
    centers : float array, shape (k, n_features)
        The cluster centers.
    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory). Only
        considered for dense input.
    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center. An empty array is used when None.

    Returns
    -------
    labels : int array of shape(n)
        The resulting assignment
    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    n_samples = X.shape[0]
    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    # Labels start at -1 so that any entry left untouched by the
    # assignment routine is easy to spot.
    labels = numpy.full(n_samples, -1, numpy.int32)
    if distances is None:
        distances = numpy.zeros(shape=(0, ), dtype=X.dtype)

    sparse_input = issparse(X)

    # Dense input with precomputed distances is fully handled by the
    # dedicated helper; its result is returned as is.
    if not sparse_input and precompute_distances:
        return _labels_inertia_precompute_dense(
            norm='l2', X=X, sample_weight=sample_weight,
            centers=centers, distances=distances)

    # distances will be changed in-place
    assign = _assign_labels_csr if sparse_input else _assign_labels_array
    inertia = assign(X, sample_weight, x_squared_norms, centers, labels,
                     distances=distances)
    return labels, inertia