def test_x_squared_norms_init_centroids():
    """Test that x_squared_norms can be None in _init_centroids"""
    from sklearn.cluster.k_means_ import _init_centroids

    X_norms = np.sum(X ** 2, axis=1)
    precompute = _init_centroids(X, 3, "k-means++", random_state=0,
                                 x_squared_norms=X_norms)
    assert_array_equal(precompute,
                       _init_centroids(X, 3, "k-means++", random_state=0))
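# The tests in this collection rely on a module-level `X` fixture and sklearn's
# test helpers. A minimal, self-contained sketch of the same check (assuming
# scikit-learn < 0.24, where the private sklearn.cluster.k_means_ module and
# its _init_centroids helper are still available; the blob data is illustrative):
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import make_blobs
from sklearn.cluster.k_means_ import _init_centroids

X_demo, _ = make_blobs(n_samples=60, centers=3, random_state=42)

# With and without precomputed squared norms, the same seed should give the
# same k-means++ seeds.
with_norms = _init_centroids(X_demo, 3, "k-means++", random_state=0,
                             x_squared_norms=(X_demo ** 2).sum(axis=1))
without_norms = _init_centroids(X_demo, 3, "k-means++", random_state=0)
assert_array_almost_equal(with_norms, without_norms)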
def pick_landmarks(args, train, dim):
    # Choose landmarks, we use opt_seed here.
    train.make_stacked()
    rs = check_random_state(args.opt_seed)
    data_fit = train.stacked_features[:, :dim]
    if args.landmark_choice == 'all':
        # Sample landmarks from all pts
        if args.landmarks_select == 'kmeans++':
            landmarks = _init_centroids(data_fit, args.structure, 'k-means++',
                                        random_state=rs)
        elif args.landmarks_select == 'kmeans':
            kmeans = KMeans(n_clusters=args.structure, random_state=rs)
            landmarks = kmeans.fit(data_fit).cluster_centers_
    elif args.landmark_choice == 'bag':
        # Sample landmarks per bag
        size = len(train)
        l_size = args.n_landmark_bag
        landmarks = np.zeros((size * l_size, dim))
        for i in range(size):
            bag = train[i]
            bag_x = bag[:, :dim]
            if l_size >= 1:
                landmarks[i * l_size:(i + 1) * l_size] = _init_centroids(
                    bag_x, args.n_landmark_bag, 'k-means++', random_state=rs)
            else:
                raise ValueError('n_landmark_bag: Must be positive > 0')
    return landmarks
def test_x_squared_norms_init_centroids():
    # Test that x_squared_norms can be None in _init_centroids
    from sklearn.cluster.k_means_ import _init_centroids

    X_norms = np.sum(X**2, axis=1)
    precompute = _init_centroids(
        X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
    assert_array_almost_equal(
        precompute, _init_centroids(X, 3, "k-means++", random_state=0))
def test_x_squared_norms_init_centroids():
    """Test that x_squared_norms can be None in _init_centroids"""
    try:
        from sklearn.cluster.k_means_ import _init_centroids

        X_norms = np.sum(X**2, axis=1)
        precompute = _init_centroids(
            X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
        return [PY_array_equals(
            precompute, _init_centroids(X, 3, "k-means++", random_state=0))]
    except Exception:
        return 1
def init_uv(X, C, p):
    N, ndim = len(X), len(X[0])
    # np.random.seed()
    print(p, 'test seed', np.random.random((1, )))
    assert isinstance(p.method, str)
    if p.method == 'random':
        V = np.random.random((C, ndim))
    # elif p.method == 'orig':
    #     return origin_init(X, C, p.gamma, p.epsilon)
    elif p.method == 'kmpp':
        V = _init_centroids(X, C, 'k-means++')
    U = np.ones((N, C)) * .1 / (C - 1)
    for i in range(N):
        xi = np.repeat(X[i, :].reshape((1, ndim)), C, axis=0)
        U[i, np.argmin(l21_norm(xi - V, axis=1))] = .9
    # w_epsilon = p.w_epsilon
    # from basics.ours import update_V as ours_update_V
    # V = ours_update_V(V, U, X, w_epsilon)
    return U, V
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4):
    """helper to fit a single run of centroid."""
    # init random state if provided
    mdm = MDM(metric=metric)
    squared_norms = [numpy.linalg.norm(x, ord='fro')**2 for x in X]
    mdm.covmeans_ = _init_centroids(X, n_clusters, init,
                                    random_state=random_state,
                                    x_squared_norms=squared_norms)
    if y is not None:
        mdm.classes_ = numpy.unique(y)
    else:
        mdm.classes_ = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0

    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes_[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break

    inertia = sum([sum(dist[labels == mdm.classes_[i], i])
                   for i in range(len(mdm.classes_))])
    return labels, inertia, mdm
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4):
    # init random state if provided
    mdm = MDM(metric=metric)
    mdm.covmeans = _init_centroids(
        X, n_clusters, init, random_state=random_state)
    if y is not None:
        mdm.classes = numpy.unique(y)
    else:
        mdm.classes = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0

    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break

    inertia = sum([sum(dist[labels == mdm.classes[i], i])
                   for i in range(len(mdm.classes))])
    return labels, inertia, mdm
def km_init(X, K, C_init):
    """ Initial seeds """
    N, D = X.shape
    if isinstance(C_init, str):
        if C_init == 'kmeans_plus':
            M = _init_centroids(X, K, init='k-means++')
            l = km_le(X, M, None, None)
        elif C_init == 'rndmeans':
            m = X.min(0)
            mm = X.max(0)
            a = (mm - m) * np.random.random((K, D))
            M = a + m[None, :]
            l = km_le(X, M, None, None)
        elif C_init == 'rndsubset':
            M = X[np.random.choice(list(range(N)), K), :]
            # tmp = np.random.permutation(N)
            # M = X[tmp[0:K],:]
            l = km_le(X, M, None, None)
        elif C_init == 'kmeans':
            kmeans = KMeans(n_clusters=K).fit(X)
            l = kmeans.labels_
            M = kmeans.cluster_centers_
    else:
        M = C_init
        l = km_le(X, M, None, None)
    del C_init
    return M, l
def create_codebook(self, features, _class='label'):
    if self.debug:
        print('\t- creating visual codebook for {0} ...'.format(_class))
        print('\t- features.shape', features.shape)
        sys.stdout.flush()

    n_feats, n_cuboids, cuboid_depth = features.shape
    features = features.reshape(-1, cuboid_depth)

    if self.codebook_selection == self.cs_dict["kmeans"]:
        codebook = KMeans(init='k-means++', n_clusters=self.codebook_size,
                          n_init=50, tol=1e-10, max_iter=1000,
                          random_state=self.seed, n_jobs=self.n_jobs)
        codebook.fit(features)
        return codebook
    else:
        codebook = KMeans(init='random', n_clusters=self.codebook_size,
                          n_init=1, tol=1e-10, max_iter=1,
                          random_state=self.seed, n_jobs=self.n_jobs)
        codebook.cluster_centers_ = _init_centroids(features,
                                                    k=self.codebook_size,
                                                    init='random',
                                                    random_state=self.seed)
        return codebook
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4, n_jobs=1):
    """helper to fit a single run of centroid."""
    # init random state if provided
    mdm = MDM(metric=metric, n_jobs=n_jobs)
    squared_norms = [numpy.linalg.norm(x, ord='fro')**2 for x in X]
    mdm.covmeans_ = _init_centroids(X, n_clusters, init,
                                    random_state=random_state,
                                    x_squared_norms=squared_norms)
    if y is not None:
        mdm.classes_ = numpy.unique(y)
    else:
        mdm.classes_ = numpy.arange(n_clusters)

    labels = mdm.predict(X)
    k = 0

    while True:
        old_labels = labels.copy()
        mdm.fit(X, old_labels)
        dist = mdm._predict_distances(X)
        labels = mdm.classes_[dist.argmin(axis=1)]
        k += 1
        if (k > max_iter) | (numpy.mean(labels == old_labels) > (1 - tol)):
            break

    inertia = sum([sum(dist[labels == mdm.classes_[i], i])
                   for i in range(len(mdm.classes_))])
    return labels, inertia, mdm
def partial_fit(self, X):
    # Update k means estimate on a single iteration.
    X = check_array(X, accept_sparse="csr")
    n_samples, n_features = X.shape
    x_squared_norms = row_norms(X, squared=True)  # currently has redundancy

    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    # if n_samples == 0:
    #     return self

    self.random_state_ = getattr(self, "random_state_",
                                 check_random_state(self.random_state))

    # if (not hasattr(self, 'counts_')
    #         or not hasattr(self, 'cluster_centers_')):
    if (not hasattr(self, 'cluster_centers_')):
        # this is the first call partial_fit on this object:
        # initialize the cluster centers
        self.cluster_centers_ = k_means_._init_centroids(
            X, self.n_clusters, self.init,
            random_state=self.random_state_,
            x_squared_norms=x_squared_norms)
        print("Initialization complete")

        # self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        # random_reassign = False
        distances = None
        """
        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_)
        """
        return self
    else:
        """
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        random_reassign = self.random_state_.randint(
            10 * (1 + self.counts_.min())) == 0
        """
        distances = np.zeros(X.shape[0], dtype=np.float64)
        """
        _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                         self.counts_, np.zeros(0, np.double), 0,
                         random_reassign=random_reassign, distances=distances,
                         random_state=self.random_state_,
                         reassignment_ratio=self.reassignment_ratio,
                         verbose=self.verbose)
        """
        self.cluster_centers_, self.inertia_, squared_diff = _kmeans_step(
            X=X, x_squared_norms=x_squared_norms,
            centers=self.cluster_centers_, distances=distances,
            precompute_distances=self.precompute_distances,
            n_clusters=self.n_clusters)
        """
        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_)
        """
        return self, squared_diff
def _train(self, X, y, rs):
    new_X, new_y = [], []
    for x_sub, yi in self.class_iter(X, y):
        clusters = self.get_cluster_size(x_sub)
        # Choose random clusters
        centers = _init_centroids(x_sub, clusters, 'k-means++', rs)
        new_X.append(centers)
        new_y.extend([yi] * new_X[-1].shape[0])
    return np.vstack(new_X), new_y
def calc_sampling_distribution(self):
    x_squared_norms = row_norms(self.X, squared=True)
    centers = _init_centroids(self.X, self.n_clusters, self.init,
                              random_state=self.random_state,
                              x_squared_norms=x_squared_norms)
    sens = sensitivity.kmeans_sensitivity(self.X, self.w, centers,
                                          max(np.log(self.n_clusters), 1))
    self.p = sens / np.sum(sens)
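# Once `self.p` is available, a coreset is typically drawn by importance
# sampling and reweighting by the inverse sampling probability. A minimal
# standalone sketch of that step (the helper name, coreset size, and the
# uniform stand-in for the sensitivity distribution below are illustrative
# assumptions, not part of the snippet above):
import numpy as np

def sample_coreset(X, p, coreset_size, rng=None):
    # Importance-sample points with probability p and reweight by
    # 1 / (m * p) so weighted sums over the coreset stay unbiased.
    rng = np.random.default_rng(rng)
    idx = rng.choice(len(X), size=coreset_size, replace=True, p=p)
    weights = 1.0 / (coreset_size * p[idx])
    return X[idx], weights

# Illustrative usage with a uniform sensitivity stand-in.
X_demo = np.random.default_rng(0).normal(size=(1000, 2))
p_demo = np.full(len(X_demo), 1.0 / len(X_demo))
C_demo, w_demo = sample_coreset(X_demo, p_demo, coreset_size=100, rng=0)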
def init_step(dataset, model, device, pretrained, mode='kmeans',
              n_clusters=None):
    """Initialization of landmarks with k-means or k-means++ given dataset."""
    if n_clusters is None:
        n_clusters = len(np.unique(dataset.y))
    nexamples = len(dataset.x)
    X = torch.stack([dataset.x[i] for i in range(nexamples)])

    if mode == 'kmeans++':
        if not pretrained:
            # find centroids in original space
            landmarks = k_means_._init_centroids(X.cpu().numpy(), n_clusters,
                                                 'k-means++')
            landmarks = torch.tensor(landmarks, device=device)
            landmarks = landmarks.to(device)
            lndmk_encoded, _ = model(landmarks)
        else:
            X = X.to(device)
            encoded, _ = model(X)
            landmarks = k_means_._init_centroids(encoded.data.cpu().numpy(),
                                                 n_clusters, 'k-means++')
            lndmk_encoded = torch.tensor(landmarks, device=device)
    elif mode == 'kmeans':
        # run kmeans clustering
        if not pretrained:
            kmeans = KMeans(n_clusters, random_state=0).fit(X.cpu().numpy())
            landmarks = torch.tensor(kmeans.cluster_centers_, device=device)
            landmarks = landmarks.to(device)
            lndmk_encoded, _ = model(landmarks)
        else:
            X = X.to(device)
            encoded, _ = model(X)
            kmeans = KMeans(n_clusters, random_state=0).fit(
                encoded.data.cpu().numpy())
            lndmk_encoded = torch.tensor(kmeans.cluster_centers_,
                                         device=device)
    return lndmk_encoded
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4,
                  tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    new_labels, new_inertia, new_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape,
                                                                 d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T

    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(
            X=X_values, Y=centers_c, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)

        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[:][labels == it] - centers[:][it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, _ = np.linalg.eigh(Sigma)
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T

        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(
                X[:][labels == j] - centers[:][j], squared=True).sum()

        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break

    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(X, x_squared_norms,
                                                  new_centers,
                                                  precompute_distances=False,
                                                  distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
def fit(self, X, w=None):
    if w is None:
        w = np.ones(X.shape[0])
    elif X.shape[0] != w.shape[0]:
        raise ValueError(
            "The number of weights must match the number of data points.")

    x_squared_norms = row_norms(X, squared=True)
    self.centers = None

    for it in range(self.n_iter):
        best_centers, best_inertia = None, -1
        for init_it in range(self.n_init):
            # initialization could be extended to consider weights
            centers = _init_centroids(X, self.n_clusters, self.init,
                                      random_state=self.random_state,
                                      x_squared_norms=x_squared_norms)
            assignment, inertia = weighted_kmeans_.assignment_inertia(
                X, centers)
            if best_inertia == -1 or w.dot(inertia) < best_inertia:
                best_centers = centers
                best_inertia = w.dot(inertia)
        centers = best_centers

        inertia = np.full((X.shape[0]), np.inf)
        for it in range(self.max_iter):
            # E-step
            assignment, new_inertia = weighted_kmeans_.assignment_inertia(
                X, centers)
            # M-step
            centers = weighted_kmeans_.update_centers(
                X, w, centers, assignment)
            if w.dot(inertia - new_inertia) <= self.tol:
                break
            inertia = new_inertia

        if self.centers is None or w.dot(self.inertia - new_inertia) > 0:
            self.inertia = new_inertia
            self.centers = centers
def fit(self, X, y=None):
    # FIXME(gilad): sub-optimal. consider using _kmeans_single_elkan.
    random_state = check_random_state(self.random_state)
    X = self._check_fit_data(X)
    tol = k_means_._tolerance(X, self.tol)
    itr = 0
    init = k_means_._init_centroids(X, self.n_clusters, 'random',
                                    random_state)
    self.cluster_centers_ = center_updater(init, self.fixed_centers,
                                           self.n_fixed)
    self.inertia_ = np.infty
    self.inertia_prev_ = np.infty
    inertia_del = np.infty

    while itr < self.max_iter and inertia_del > tol:
        self.inertia_prev_ = self.inertia_
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, init=self.cluster_centers_,
                n_init=self.n_init, max_iter=1, verbose=self.verbose,
                precompute_distances=self.precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, algorithm=self.algorithm,
                return_n_iter=True)
        self.cluster_centers_ = center_updater(self.cluster_centers_,
                                               self.fixed_centers,
                                               self.n_fixed)
        if itr > 0:
            inertia_del = math.fabs(
                (self.inertia_ - self.inertia_prev_) / self.inertia_prev_)
        if self.verbose:
            self.log.info(
                'calculating for itr={}: inertia_del={}, tol={}'.format(
                    itr, inertia_del, tol))
        itr += 1

    if itr < self.max_iter:
        self.log.info(
            'convergence achieved for iteration {}. inertia={}. inertia_del={}'
            .format(itr, self.inertia_, inertia_del))
    else:
        self.log.info(
            'convergence not achieved. itr={}. inertia={}. inertia_del={}'.
            format(itr, self.inertia_, inertia_del))
    return self
def kmeanspp(X, k, seed):
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)

    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed,
                                x_squared_norms=x_squared_norms)

    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution
    nns = NearestNeighbors()
    nns.fit(X)
    centroid_candidatess = nns.radius_neighbors(X=centroids, radius=0,
                                                return_distance=False)

    # Account for "degenerated" solutions: several voxels at distance 0,
    # each becoming a centroid
    centroids = set()
    for centroid_candidates in centroid_candidatess:
        centroid_candidates = set(centroid_candidates) - centroids
        if len(set(centroid_candidates) - centroids) == 0:
            raise Exception('Cannot get an unambiguous set of centers;'
                            'theoretically this cannot happen, '
                            'so check for bugs')
        centroids.add(centroid_candidates.pop())

    return np.array(sorted(centroids))
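# Note that, unlike _init_centroids itself, the helper above returns *indices*
# into X rather than coordinates. A small usage sketch (assuming the kmeanspp
# function above and its imports are in scope; make_blobs is used purely for
# illustration):
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=4, random_state=0)

seed_idx = kmeanspp(X_demo, k=4, seed=0)   # indices of the seeded points
seed_points = X_demo[seed_idx]             # recover the actual coordinates
print(seed_idx, seed_points.shape)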
def init_uv(X, C, *, method):
    N, ndim = len(X), len(X[0])
    assert isinstance(method, str)
    if method == 'random':
        V = np.random.random((C, ndim))
    elif method == 'orig':
        return origin_init(X, C)
    else:
        V = _init_centroids(X, C, method)
    U = np.ones((N, C)) * .1 / (C - 1)
    for i in range(N):
        xi = np.repeat(X[i, :].reshape((1, ndim)), C, axis=0)
        U[i, np.argmin(l21_norm(xi - V, axis=1))] = .9
    return U, V
def km_init(X, K, C_init, l_init=None):
    """ Initial seeds """
    if isinstance(C_init, str):
        if C_init == 'kmeans_plus':
            M = _init_centroids(X, K, init='k-means++')
            l = km_le(X, M)
        elif C_init == 'kmeans':
            kmeans = KMeans(n_clusters=K).fit(X)
            l = kmeans.labels_
            M = kmeans.cluster_centers_
    else:
        M = C_init.copy()
        # l = km_le(X,M)
        l = l_init.copy()
    del C_init, l_init
    return M, l
def fit(self, X, Y=None):
    """Compute fuzzy c-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
    """
    if Y is None:
        self.centers = _init_centroids(X, self.n_clusters, init=self.init,
                                       random_state=None,
                                       x_squared_norms=row_norms(
                                           X, squared=True))
    else:
        n_labels = int(np.max(Y))
        self.centers = np.zeros([n_labels + 1, np.shape(X)[1]])
        for l in np.arange(n_labels + 1):
            self.centers[l, :] = np.mean(X[Y == l], axis=0)

    u, d = _init_memberships(X, self.centers, self.distance)

    cluster_centers, predicted_labels = \
        f_k_means(X, n_clusters=self.n_clusters, m=self.m,
                  tol_memberships=self.tol_memberships,
                  tol_centroids=self.tol_centroids,
                  max_iter=self.max_iter, init=self.centers,
                  constraint=self.constraint, distance=self.distance,
                  n_init=self.n_init)

    self.labels_ = predicted_labels
    self.cluster_centers_ = cluster_centers
    return self
def f_k_means(X, n_clusters, m, tol_memberships, tol_centroids, max_iter,
              init, constraint, distance, n_init):
    # if the initialization method is not 'k-means++',
    # an array of centroids is passed
    # and it is converted in float type
    if hasattr(init, '__array__'):
        n_clusters = init.shape[0]
        init = np.asarray(init, dtype=np.float64)

    # Initialize centers and memberships
    n_samples, n_features = X.shape
    centers = _init_centroids(X, n_clusters, init, random_state=True,
                              x_squared_norms=row_norms(X, squared=True))
    u, d = _init_memberships(X, centers, distance)
    labels = _labels_computation(u)

    # Choose the optimization method
    centers, labels, inertia, n_iter, u, fpc = \
        f_k_means_main_loop(X, n_clusters, m, u, centers, d,
                            tol_memberships, tol_centroids, max_iter,
                            constraint, distance)
    return centers, labels
def kmeans_lloyd(X, sample_weight, n_clusters, max_iter=300,
                 init='k-means++', verbose=False, x_squared_norms=None,
                 random_state=None, tol=1e-4, same_cluster_size=False):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
""" random_state = check_random_state(random_state) if same_cluster_size: assert len(X) % n_clusters == 0, "#samples is not divisible by #clusters" if verbose: print("\n==> Starting k-means clustering...\n") sample_weight = _check_sample_weight(X, sample_weight) x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment is also called the E-step of EM labels, inertia = \ _labels_inertia(X, sample_weight, x_squared_norms, centers, distances=distances, same_cluster_size=same_cluster_size) # computation of the means is also called the M-step of EM centers = _centers_dense( X, sample_weight, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia center_shift_total = squared_norm(centers_old - centers) if center_shift_total <= tol: if verbose: print("Converged at iteration %d: " "center shift %e within tolerance %e" % (i, center_shift_total, tol)) break if center_shift_total > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_inertia(X, sample_weight, x_squared_norms, best_centers, distances=distances, same_cluster_size=same_cluster_size) return best_labels, best_inertia, best_centers, i + 1
if len(sys.argv) >= 4:
    cluster_list = get_clusters_from_file(sys.argv[3])

# Generate testset
X, _ = make_blobs(n_samples=n_samples, centers=n_centers,
                  random_state=random_state)
v1 = X[:, 0]
v2 = X[:, 1]

# Scale to integers
v1 = scale(v1)
v2 = scale(v2)

X = np.array(list(zip(v1, v2)))

# Compute initial centers - using k-means++
centers = _init_centroids(X, n_centers, 'k-means++')

# Write file
with open("kmeans_testset.c", "w") as f:
    f.write("int testset_x[" + str(len(v1)) + "];\n")
    f.write("int testset_y[" + str(len(v1)) + "];\n")
    f.write("int testset_initial_centers_x[" + str(len(centers)) + "];\n")
    f.write("int testset_initial_centers_y[" + str(len(centers)) + "];\n")
    f.write("void init_dataset() {\n")

    # Points
    i = 0
    for x, y in zip(v1, v2):
        f.write("testset_x[" + str(i) + "] = " + str(x) + ";\n")
        f.write("testset_y[" + str(i) + "] = " + str(y) + ";\n")
        i += 1
def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
                   tol=1e-4, precompute_distances=True, sample_weight=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol: float, optional
        The relative increment in the results before declaring convergence.

    verbose: boolean, optional
        Verbosity mode

    x_squared_norms: array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia: float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
""" if sample_weight == None: sample_weight = np.ones(X.shape[0]) random_state = check_random_state(random_state) best_labels, best_inertia, best_centers = None, None, None # init centers = k_means_._init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(X.shape[0],), dtype=np.float64) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment is also called the E-step of EM labels, inertia = \ _labels_inertia(X, x_squared_norms, centers, precompute_distances=precompute_distances, distances=distances) sample_weight = np.asarray([1.0] * len(labels)) # computation of the means is also called the M-step of EM if sp.issparse(X): centers = _k_means._centers_sparse(X, sample_weight, labels, n_clusters, distances) else: centers = _k_means._centers_dense(X, sample_weight, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia shift = squared_norm(centers_old - centers) if shift <= tol: if verbose: print("Converged at iteration %d" % i) break if shift > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_inertia(X, x_squared_norms, best_centers, precompute_distances=precompute_distances, distances=distances) return best_labels, best_inertia, best_centers, i + 1
def kmeans(X, n_clusters, delta=.001, maxiter=10, metric="cityblock", p=2,
           verbose=1, x_squared_norms=None):
    """ centres, Xtocentre, distances = kmeans( X, initial centres ... )
    in:
        X N x dim  may be sparse
        centres k x dim: initial centres, e.g. random.sample( X, k )
        delta: relative error, iterate until the average distance to centres
            is within delta of the previous average distance
        maxiter
        metric: any of the 20-odd in scipy.spatial.distance
            "chebyshev" = max, "cityblock" = L1, "minkowski" with p=
            or a function( Xvec, centrevec ), e.g. Lqmetric below
        p: for minkowski metric -- local mod cdist for 0 < p < 1 too
        verbose: 0 silent, 2 prints running distances
    out:
        centres, k x dim
        Xtocentre: each X -> its nearest centre, ints N -> k
        distances, N
    see also: kmeanssample below, Klasy Kmeans below.
    """
    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)
    if not issparse(X):
        X = np.asanyarray(X)  # ?
    centres = _init_centroids(X, n_clusters, 'k-means++', random_state=None,
                              x_squared_norms=x_squared_norms)
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError(
            "kmeans: X %s and centres %s must have the same number of columns"
            % (X.shape, centres.shape))
    if verbose:
        print("kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s"
              % (X.shape, centres.shape, delta, maxiter, metric))

    allx = np.arange(N)
    prevdist = 0
    for jiter in range(1, maxiter + 1):
        D = cdist_sparse(X, centres, metric=metric, p=p)  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> nearest centre
        distances = D[allx, xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print("kmeans: av |X - nearest centre| = %.4g" % avdist)
        if (1 - delta) * prevdist <= avdist <= prevdist \
                or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # (1 pass in C)
            c = np.where(xtoc == jc)[0]
            if len(c) > 0:
                centres[jc] = X[c].mean(axis=0)

    if verbose:
        print("kmeans: %d iterations  cluster sizes:" % jiter,
              np.bincount(xtoc))
    if verbose >= 2:
        r50 = np.zeros(k)
        r90 = np.zeros(k)
        for j in range(k):
            dist = distances[xtoc == j]
            if len(dist) > 0:
                r50[j], r90[j] = np.percentile(dist, (50, 90))
        print("kmeans: cluster 50 % radius", r50.astype(int))
        print("kmeans: cluster 90 % radius", r90.astype(int))
        # scale L1 / dim, L2 / sqrt(dim) ?
    return centres, xtoc, distances
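# A brief usage sketch of the metric-flexible variant above (it assumes the
# kmeans function and its cdist_sparse helper are importable from the same
# module; the random data and the cityblock metric are illustrative):
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(500, 8))

# L1 (cityblock) assignment with k-means++ seeding from _init_centroids.
centres, x_to_centre, dists = kmeans(X_demo, n_clusters=5, delta=1e-3,
                                     maxiter=20, metric="cityblock",
                                     verbose=0)
print(centres.shape, np.bincount(x_to_centre))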
def subspace_kmeans_single(X, sample_weight, n_clusters, init='k-means++',
                           max_iter=300, tol=1e-4, tol_eig=-1e-10,
                           verbose=False, x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===
    # Dimensionality of original space
    d = X.shape[1]
    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d**2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')
    # Set initial m as d/2
    m = d // 2
    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)
    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T
    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===
        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          n_clusters, distances)

        # === Beginning of original implementation of M-step of EM ===
        # use a separate loop variable so the outer iteration counter `i`
        # is not overwritten
        S = np.zeros((d, d))
        for c in range(n_clusters):
            X_c = X[:][labels == c] - centers[:][c]
            S += np.dot(X_c.T, X_c)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)[::1]
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for c in range(n_clusters):
            inertia += row_norms(X[:][labels == c] - centers[:][c],
                                 squared=True).sum()
        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def kmeans_constrained_single(X, n_clusters, size_min=None, size_max=None,
                              max_iter=300, init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4):
    """A single run of k-means constrained, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
""" sample_weight = np.ones(X.shape[0]) random_state = check_random_state(random_state) n_samples = X.shape[0] best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(n_samples, ), dtype=X.dtype) # Determine min and max sizes if non given if size_min is None: size_min = 0 if size_max is None: size_max = n_samples # Number of data points # Check size min and max if not ((size_min >= 0) and (size_min <= n_samples) and (size_max >= 0) and (size_max <= n_samples)): raise ValueError( "size_min and size_max must be a positive number smaller " "than the number of data points or `None`") if size_max < size_min: raise ValueError("size_max must be larger than size_min") if size_min * n_clusters > n_samples: raise ValueError( "The product of size_min and n_clusters cannot exceed the number of samples (X)" ) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment is also called the E-step of EM labels, inertia = \ _labels_constrained(X, centers, size_min, size_max, distances=distances) # computation of the means is also called the M-step of EM if sp.issparse(X): centers = _centers_sparse(X, sample_weight, labels, n_clusters, distances) else: centers = _centers_dense(X, sample_weight, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia center_shift_total = squared_norm(centers_old - centers) if center_shift_total <= tol: if verbose: print("Converged at iteration %d: " "center shift %e within tolerance %e" % (i, center_shift_total, tol)) break if center_shift_total > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_constrained(X, centers, size_min, size_max, distances=distances) return best_labels, best_inertia, best_centers, i + 1
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None, random_state=None,
                                   tol=1e-4, precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def partial_fit(self, D):
    """
    Apply one iteration of VR_MBKM

    Input: self, dataset
    Output: self
    Updated:
        - self.curr_iter
        - self.curr_inner_iter
        - self.tot_inner_iter
        - self.cluster_centers_
    """
    ## perform checks on dataset
    D = check_array(D, accept_sparse='csr')

    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    if self.curr_inner_iter == 0:
        # note: the original snippet used `==` here, a no-op comparison;
        # an assignment is almost certainly what was intended
        self.inner_loop = 0

    if self.curr_iter == 0 or self.inner_loop == 0 or self.update_freq == 0:
        ## OUTER LOOP
        # use the entire dataset
        X = D
        x_squared_norms = row_norms(X, squared=True)
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))

        if self.curr_iter == 0:
            ## initialize centers
            if hasattr(self.init, '__array__'):
                self.cluster_centers_ = self.init
            else:
                self.cluster_centers_ = k_means_._init_centroids(
                    X, self.n_clusters, self.init,
                    random_state=self.random_state_,
                    x_squared_norms=x_squared_norms,
                    init_size=self.init_size)
            _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                               self.cluster_centers_)
            # print "Cost of current initial centers on the mini-batch is %r " % cost
            ## initialize counts
            self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)

        ## this ensures the benchmark centers are either the seeds
        ## or obtained from the last iterate of inner loop
        self.benchmark_centers = self.cluster_centers_.copy()

        ## run Lloyd's update with entire data
        distances = np.zeros(X.shape[0], dtype=np.float64)
        self.benchmark_updates, _, self.squared_diff = _kmeans_step(
            X=X, x_squared_norms=x_squared_norms,
            centers=self.benchmark_centers.copy(),
            distances=distances,
            precompute_distances=self.precompute_distances,
            n_clusters=self.n_clusters)
        self.cluster_centers_ = self.benchmark_updates.copy()
        self.curr_outer_iter += 1
        self.inner_loop = 1
    else:
        ## INNER LOOP:
        # use a mini-batch of data
        sample_idx = random.sample(range(D.shape[0]), self.mbsize)
        X = D[sample_idx, :]
        # x_squared_norms = row_norms(X, squared=True)
        self.set_eta()

        ## run VRMB_step with entire data
        distances = np.zeros(X.shape[0], dtype=np.float64)
        self.cluster_centers_, self.squared_diff, _ = VR_MB_step(
            X, None, self.cluster_centers_.copy(),
            self.benchmark_centers.copy(), self.benchmark_updates.copy(),
            self.counts_, self.curr_iter, np.zeros(0, np.double), 0,
            distances, random_reassign=False,
            random_state=self.random_state_,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose, learn_rate=self.set_eta())

        # increment inner loop counts
        self.curr_inner_iter = (self.curr_inner_iter + 1) % self.update_freq

    # increment global loop count
    self.curr_iter += 1
def partial_fit(self, X, y=None, sample_weight=None):
    """Update k means estimate on a single mini-batch X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Coordinates of the data points to cluster. It must be noted that
        X will be copied if it is not C-contiguous.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    self
    """
    X = check_array(X, accept_sparse="csr", order="C",
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)

    if n_samples == 0:
        return self

    # unit-normalize for spherical k-means
    X = normalize(X)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    x_squared_norms = row_norms(X, squared=True)
    self.random_state_ = getattr(self, "random_state_",
                                 check_random_state(self.random_state))
    if (not hasattr(self, 'counts_')
            or not hasattr(self, 'cluster_centers_')):
        # this is the first call partial_fit on this object:
        # initialize the cluster centers
        self.cluster_centers_ = _init_centroids(
            X, self.n_clusters, self.init,
            random_state=self.random_state_,
            x_squared_norms=x_squared_norms, init_size=self.init_size)

        self.counts_ = np.zeros(self.n_clusters,
                                dtype=sample_weight.dtype)
        random_reassign = False
        distances = None
    else:
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        random_reassign = self.random_state_.randint(
            10 * (1 + self.counts_.min())) == 0
        distances = np.zeros(X.shape[0], dtype=X.dtype)

    self.cluster_centers_ = normalize(self.cluster_centers_)

    _mini_batch_spherical_step(X, sample_weight, x_squared_norms,
                               self.cluster_centers_, self.counts_,
                               np.zeros(0, dtype=X.dtype), 0,
                               random_reassign=random_reassign,
                               distances=distances,
                               random_state=self.random_state_,
                               reassignment_ratio=self.reassignment_ratio,
                               verbose=self.verbose)

    self.cluster_centers_ = normalize(self.cluster_centers_)

    if self.compute_labels:
        self.labels_, self.inertia_ = _labels_inertia(
            X, sample_weight, x_squared_norms, self.cluster_centers_)

    return self
def fit(self, X, y=None):
    """Compute the centroids on X by chunking it into mini-batches.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
    """
    random_state = check_random_state(self.random_state)
    X = check_array(X, accept_sparse="csr", order='C',
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if n_samples < self.n_clusters:
        raise ValueError("Number of samples smaller than number "
                         "of clusters.")

    n_init = self.n_init
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in MiniBatchKMeans instead of '
                'n_init=%d' % self.n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    x_squared_norms = k_means_.row_norms(X, squared=True)

    if self.tol > 0.0:
        tol = k_means_._tolerance(X, self.tol)

        # using tol-based early stopping needs the allocation of a
        # dedicated buffer which can be expensive for high dim data:
        # hence we allocate it outside of the main loop
        old_center_buffer = np.zeros(n_features, dtype=X.dtype)
    else:
        tol = 0.0
        # no need for the center buffer if tol-based early stopping is
        # disabled
        old_center_buffer = np.zeros(0, dtype=X.dtype)

    distances = np.zeros(self.batch_size, dtype=X.dtype)
    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    n_iter = int(self.max_iter * n_batches)

    init_size = self.init_size
    if init_size is None:
        init_size = 3 * self.batch_size
    if init_size > n_samples:
        init_size = n_samples
    self.init_size_ = init_size

    validation_indices = random_state.randint(0, n_samples, init_size)
    X_valid = X[validation_indices]
    x_squared_norms_valid = x_squared_norms[validation_indices]

    # perform several inits with random sub-sets
    best_inertia = None
    for init_idx in range(n_init):
        if self.verbose:
            print("Init %d/%d with method: %s"
                  % (init_idx + 1, n_init, self.init))
        counts = np.zeros(self.n_clusters, dtype=np.int32)

        # TODO: once the `k_means` function works with sparse input we
        # should refactor the following init to use it instead.
        # Initialize the centers using only a fraction of the data as we
        # expect n_samples to be very large when using MiniBatchKMeans
        cluster_centers = k_means_._init_centroids(
            X, self.n_clusters, self.init,
            random_state=random_state,
            x_squared_norms=x_squared_norms,
            init_size=init_size)

        # Compute the label assignment on the init dataset
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X_valid, x_squared_norms[validation_indices],
            cluster_centers, counts, old_center_buffer, False,
            distances=None, verbose=self.verbose)

        # Keep only the best cluster centers across independent inits on
        # the common validation set
        _, inertia = k_means_._labels_inertia(X_valid,
                                              x_squared_norms_valid,
                                              cluster_centers)
        if self.verbose:
            print("Inertia for init %d/%d: %f"
                  % (init_idx + 1, n_init, inertia))
        if best_inertia is None or inertia < best_inertia:
            self.cluster_centers_ = cluster_centers
            self.counts_ = counts
            best_inertia = inertia

    # Empty context to be used inplace by the convergence check routine
    convergence_context = {}

    # Perform the iterative optimization until the final convergence
    # criterion
    for iteration_idx in range(n_iter):
        # Sample a minibatch from the full dataset
        minibatch_indices = random_state.randint(0, n_samples,
                                                 self.batch_size)

        # Perform the actual update step on the minibatch data
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X[minibatch_indices],
            x_squared_norms[minibatch_indices],
            self.cluster_centers_, self.counts_,
            old_center_buffer, tol > 0.0, distances=distances,
            # Here we randomly choose whether to perform
            # random reassignment: the choice is done as a function
            # of the iteration index, and the minimum number of
            # counts, in order to force this reassignment to happen
            # every once in a while
            random_reassign=((iteration_idx + 1)
                             % (10 + self.counts_.min()) == 0),
            random_state=random_state,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose)

        # Monitor convergence and do early stopping if necessary
        if k_means_._mini_batch_convergence(
                self, iteration_idx, n_iter, tol, n_samples,
                centers_squared_diff, batch_inertia, convergence_context,
                verbose=self.verbose):
            break

    self.n_iter_ = iteration_idx + 1

    if self.compute_labels:
        self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

    return self
def _init_unit_centers(X, n_clusters, random_state, init):
    """Initializes unit norm centers.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)

    n_clusters : int, optional, default: 8
        The number of clusters to form as well as the number of
        centroids to generate.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    init: (string) one of
        k-means++ : uses sklearn k-means++ initialization algorithm
        spherical-k-means : use centroids from one pass of spherical k-means
        random : random unit norm vectors
        random-orthonormal : random orthonormal vectors
        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers.
    """
    n_examples, n_features = np.shape(X)
    if isinstance(init, np.ndarray):
        n_init_clusters, n_init_features = init.shape
        assert n_init_clusters == n_clusters
        assert n_init_features == n_features

        # ensure unit normed centers
        centers = init
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'spherical-k-means':
        labels, inertia, centers, iters = \
            spherical_kmeans._spherical_kmeans_single_lloyd(
                X, n_clusters, x_squared_norms=np.ones((n_examples, )),
                init='k-means++')

        return centers

    elif init == 'random':
        centers = np.random.randn(n_clusters, n_features)
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'k-means++':
        centers = _init_centroids(X, n_clusters, 'k-means++',
                                  random_state=random_state,
                                  x_squared_norms=np.ones((n_examples, )))
        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers

    elif init == 'random-orthonormal':
        centers = np.random.randn(n_clusters, n_features)
        q, r = np.linalg.qr(centers.T, mode='reduced')

        return q.T

    elif init == 'random-class':
        centers = np.zeros((n_clusters, n_features))
        for cc in range(n_clusters):
            while np.linalg.norm(centers[cc, :]) == 0:
                labels = np.random.randint(0, n_clusters, n_examples)
                centers[cc, :] = X[labels == cc, :].sum(axis=0)

        for cc in range(n_clusters):
            centers[cc, :] = centers[cc, :] / np.linalg.norm(centers[cc, :])

        return centers
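# A short usage sketch for the unit-norm initializer above (assuming
# _init_unit_centers and the _init_centroids import it relies on are in scope;
# the data below is illustrative and L2-normalized so every initialization
# mode operates on the unit sphere):
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.utils import check_random_state

X_demo = normalize(np.random.RandomState(0).normal(size=(300, 16)))
rs = check_random_state(0)

centers = _init_unit_centers(X_demo, n_clusters=5, random_state=rs,
                             init='k-means++')
# Each center is unit-norm by construction.
print(np.linalg.norm(centers, axis=1))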
def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                       random_state):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)
    best_labels, best_inertia, best_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0], ), dtype=X.dtype)
    centers = _init_centroids(X, self.n_clusters, init='k-means++',
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)

    d = X.shape[1]  # dimensionality of original space
    m = d // 2  # dimensionality of clustered space
    SD = np.dot(X.T, X)  # scatter matrix of the dataset in the original space

    # orthonormal matrix of a rigid transformation
    V, _ = np.linalg.qr(random_state.random_sample(d**2).reshape(d, d),
                        mode='complete')

    for i in range(self.max_iter):
        centers_old = centers.copy()

        # get the clusters' labels
        labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

        # compute new centers and sum the clusters' scatter matrices
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          self.n_clusters, distances)
        S = self.update_step_(X, centers, labels)

        # sorted eigenvalues and eigenvectors of SIGMA=S-SD
        V, m = self.eigen_decomposition_(S - SD)
        if m == 0:
            raise ValueError('Might be a single cluster (m = 0).')

        # inertia - sum of squared distances of samples to their closest
        # cluster center
        inertia = sum([
            row_norms(X[labels == j] - centers[j], squared=True).sum()
            for j in range(self.n_clusters)
        ])
        # print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            # print("Converged at iteration %d: center shift %e within
            # tolerance %e" % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X, sample_weight, x_squared_norms, best_centers,
            precompute_distances=False, distances=distances)

    return best_centers, best_labels, best_inertia