def fit(self, X):
    x_squared_norms = row_norms(X, squared=True)
    rng = np.random.RandomState(self.random_state)

    if self.init == "kmeans++":
        # Private function of sklearn.cluster.k_means_, to get the initial centers.
        init_centers = _k_init(X, self.n_clusters, x_squared_norms, rng)
    elif self.init == "random":
        # Sample initial centers uniformly at random (indices in [0, n_samples)).
        random_samples = rng.randint(0, X.shape[0], size=self.n_clusters)
        init_centers = X[random_samples, :]
    else:
        raise ValueError("init should be either kmeans++ or random")

    # Assign initial labels. Skip the ||x||**2 term, which is constant per sample.
    init_distances = np.sum(init_centers**2, axis=1) - 2 * np.dot(X, init_centers.T)
    init_labels = np.argmin(init_distances, axis=1)
    self.labels_ = init_labels
    self.centers_ = init_centers

    # Count the number of samples in each cluster.
    self.n_samples_ = np.zeros(self.n_clusters)
    for i in range(self.n_clusters):
        self.n_samples_[i] = np.sum(self.labels_ == i)

    for i, (sample, label) in enumerate(zip(X, self.labels_)):
        curr_label = label
        max_cost = np.inf
        while max_cost > 0:
            distances = (x_squared_norms[i] - 2 * np.dot(sample, self.centers_.T)
                         + np.sum(self.centers_**2, axis=1))
            curr_distance = distances[curr_label]
            other_distance = np.delete(distances, curr_label)
            curr_n_samples = self.n_samples_[curr_label]
            other_n_samples = np.delete(self.n_samples_, curr_label)
            cost = (curr_n_samples / (curr_n_samples - 1) * curr_distance) - \
                   (other_n_samples / (other_n_samples + 1) * other_distance)
            max_cost_ind = np.argmax(cost)
            max_cost = cost[max_cost_ind]
            if max_cost > 0:
                # We deleted the label index from other_n_samples, so map the
                # index back past the deleted entry.
                if max_cost_ind >= curr_label:
                    max_cost_ind += 1
                # Reassign the clusters and update both centers incrementally.
                self.labels_[i] = max_cost_ind
                self.centers_[curr_label] = (curr_n_samples * self.centers_[curr_label] - sample) / (curr_n_samples - 1)
                moved_n_samples = self.n_samples_[max_cost_ind]
                self.centers_[max_cost_ind] = (moved_n_samples * self.centers_[max_cost_ind] + sample) / (moved_n_samples + 1)
                self.n_samples_[curr_label] -= 1
                self.n_samples_[max_cost_ind] += 1
                curr_label = max_cost_ind

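# A minimal sanity check, not part of the original code, for the incremental
# center updates used in the reassignment loop above: removing a sample x from
# a cluster of n points with mean c gives (n*c - x)/(n - 1), and adding it to a
# cluster of m points with mean c' gives (m*c' + x)/(m + 1). The names
# pts_a, pts_b, and x below are illustrative only.
if __name__ == "__main__":
    import numpy as np

    pts_a = np.array([[0.0, 0.0], [2.0, 0.0], [4.0, 2.0]])   # cluster A, n = 3
    pts_b = np.array([[10.0, 10.0], [12.0, 10.0]])           # cluster B, m = 2
    x = pts_a[2]                                             # sample to move

    c_a, c_b = pts_a.mean(axis=0), pts_b.mean(axis=0)
    n, m = len(pts_a), len(pts_b)

    c_a_new = (n * c_a - x) / (n - 1)   # incremental removal from A
    c_b_new = (m * c_b + x) / (m + 1)   # incremental addition to B

    # The incremental formulas agree with recomputing the means from scratch.
    assert np.allclose(c_a_new, pts_a[:2].mean(axis=0))
    assert np.allclose(c_b_new, np.vstack([pts_b, x]).mean(axis=0))
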
def _fit_one_init(self, X, x_squared_norms, rs):
    n_ts, _, d = X.shape
    sz = min([ts_size(ts) for ts in X])
    self.cluster_centers_ = _k_init(
        X[:, :sz, :].reshape((n_ts, -1)),
        self.n_clusters, x_squared_norms, rs).reshape((-1, sz, d))
    old_inertia = numpy.inf

    for it in range(self.max_iter):
        self._assign(X)
        if self.verbose:
            print("%.3f" % self.inertia_, end=" --> ")
        self._update_centroids(X)
        if numpy.abs(old_inertia - self.inertia_) < self.tol:
            break
        old_inertia = self.inertia_
    if self.verbose:
        print("")
    return self

def _fit_one_init(self, X, x_squared_norms, rs):
    n_ts, sz, d = time_series_dataset_shape(X)
    if check_equal_size(X):
        X_ = to_equal_sized_dataset(X)
    else:
        X_ = TimeSeriesResampler(sz=sz).fit_transform(X)
    self.cluster_centers_ = _k_init(
        X_.reshape((n_ts, -1)),
        self.n_clusters, x_squared_norms, rs).reshape((-1, sz, d))
    old_inertia = numpy.inf

    for it in range(self.max_iter):
        self._assign(X)
        if self.verbose:
            print("%.3f" % self.inertia_, end=" --> ")
        self._update_centroids(X)
        if numpy.abs(old_inertia - self.inertia_) < self.tol:
            break
        old_inertia = self.inertia_
    if self.verbose:
        print("")
    return self

def kmeans(method, X, C, p, km_tol=1e-2, gd_tol=1e-3, initial_gd_step_size=0.05,
           num_reduction=3, batch_size=512):
    """
    only supports gradient_descent and minibatch_gradient_descent for now
    """
    squared_norm = (X**2).sum(1)
    centers = _k_init(X, C, squared_norm, np.random.RandomState())
    dist = np.zeros([len(X), C])
    for i, c in enumerate(centers):
        dist[:, i] = compute_distance(c, X, p)
    assign = dist.argmin(1)

    stop = False
    km_ct = 0
    prev_mse = 1000
    reduce_count = 0
    gd_step_size = initial_gd_step_size
    cumu_difference = []
    while not stop:
        # update centers
        total_mse = 0
        new = []
        num_non_empty = 0
        for I in range(C):
            mask = assign == I
            if mask.sum() == 0:
                continue  # skip empty cluster
            num_non_empty += 1
            if mask.sum() == 1:
                # cluster of only one point: keep it as its own center
                newc = X[mask]
                new.append([I, newc])
                continue
            x0 = X[mask]
            c = centers[I]
            if method == "gd":
                newc, mse, ct, diff = gradient_descent(
                    c, x0, p, step_size=gd_step_size, max_step=2000, eps=gd_tol)
            elif method == "sgd":
                newc, mse, ct, diff = minibatch_gradient_descent(
                    c, x0, p, batch_size=batch_size, step_size=gd_step_size,
                    max_step=2000, eps=gd_tol)
            elif method == "mean":
                newc = x0.mean(0)
                mse = compute_distance(newc, x0, p).mean()
                ct = 0
                diff = 0
            new.append([I, newc])
            total_mse += mse
        for i, c in new:
            centers[i] = c

        # compute new distances and assignment
        dist = np.zeros([len(X), C])
        for i, c in enumerate(centers):
            dist[:, i] = compute_distance(c, X, p)
        assign = dist.argmin(1)

        # record the change in mean error
        total_mse = total_mse / num_non_empty
        abs_diff = prev_mse - total_mse
        cumu_difference.append(abs_diff)
        prev_mse = total_mse

        # stop criterion: recent updates are small enough
        diff_mean = np.abs(np.mean(cumu_difference[-8:]))
        if diff_mean < km_tol:
            if reduce_count >= num_reduction:
                stop = True
            # reduce the step size and tolerances to refine the solution
            gd_step_size = gd_step_size / 5
            gd_tol = gd_tol / 5
            km_tol = km_tol / 5
            reduce_count += 1
        km_ct += 1

    # final mean intra-cluster distance, averaged over clusters
    mse = 0
    for i in range(C):
        mask = assign == i
        mse += dist[mask, i].mean()
    mse = mse / C
    return centers, mse, assign, ct

def kmeans(data, K, p, normalized, eps=1e-4, optim_method="rprop", step_size=0.001,
           batch_size=None, mean_init=True, max_km_iteraton=100, rs=None, gpu=False):
    """
    if batch_size is not None, do Minibatch KMeans.
    (Not recommended, does not provide speedup)
    """
    model = Find_Center(dim=data.shape[-1], p=p, normalized=normalized)
    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))

    diff = 10000
    prev = 1000
    ct = 0
    centers = torch.FloatTensor(centers)
    data = torch.FloatTensor(data)
    if gpu:
        centers = centers.cuda()
        data = data.cuda()
        model.cuda()

    best_mse = [1e6, 0]
    while diff / prev > eps and ct < max_km_iteraton:
        ct += 1
        if batch_size is not None:
            idx = np.random.choice(len(data), batch_size, replace=False)
            data_batch = data[idx]
        else:
            data_batch = data

        # compute assignment
        all_dist = []
        for k in range(K):
            dist = torch_dp(centers[k], data_batch, p, normalized)
            all_dist.append(dist)
        value, assign = torch.stack(all_dist, 0).min(axis=0)

        # update centers
        average_mse = 0  # intra-cluster distance, similar to mean squared error for Euclidean distance
        track_niter = []
        for k in range(K):
            mask = assign == k
            if mask.sum() == 0:  # skip empty cluster
                continue
            if mask.sum() == 1:
                centers[k] = data_batch[mask][0]
                continue
            d = data_batch[mask]  # data assigned to cluster k
            if mean_init:
                init_c = d.mean(axis=0)  # use the cluster mean to initialize
            else:
                init_c = centers[k]
            if optim_method == "mean":
                new_c = d.mean(axis=0)
                se = torch_dp(new_c, d, p, normalized)
                niter, diff = 0, 0
            else:
                new_c, se, niter, diff = gradient_descent_iteration(
                    init_c, d, p, model, optim_method=optim_method, eps=eps,
                    step_size=step_size, max_step=4000)
            centers[k] = new_c
            track_niter.append(niter)
            if torch.isnan(new_c).any():
                print("Nan!!", k, new_c)
            average_mse += se.sum()

        average_mse = average_mse / data_batch.shape[0]
        diff = torch.abs(average_mse - prev)
        prev = average_mse

        # Early stop if mse stops decreasing for 10 iterations
        if average_mse < best_mse[0]:
            best_mse = [average_mse, ct]
        if ct > best_mse[1] + 10:
            print("k-means early stop!")
            break

    # if minibatch, compute the assignment over the full data set
    if batch_size is not None:
        all_dist = []
        for k in range(K):
            dist = torch_dp(centers[k], data, p, normalized)
            all_dist.append(dist)
        value, assign = torch.stack(all_dist, 0).min(axis=0)
        average_mse = value.mean()

    centers = centers.cpu().detach().numpy()
    average_mse = average_mse.item()
    assign = assign.cpu().detach().numpy()
    return centers, average_mse, assign

def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
                    init_size=None, weights=None, sphered=False):
    """Compute the initial centroids

    Parameters
    ----------
    X: array, shape (n_samples, n_features)

    k: int
        number of centroids

    init: {'k-means++', 'random' or ndarray or callable} optional
        Method for initialization

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    x_squared_norms: array, shape (n_samples,), optional
        Squared euclidean norm of each data point. Pass it if you have it
        at hands already to avoid it being recomputed here. Default: None

    init_size : int, optional
        Number of samples to randomly sample for speeding up the
        initialization (sometimes at the expense of accuracy): the only
        algorithm is initialized by running a batch KMeans on a random
        subset of the data. This needs to be larger than k.

    Returns
    -------
    centers: array, shape (k, n_features)
    """
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if init_size is not None and init_size < n_samples:
        if init_size < k:
            warnings.warn("init_size=%d should be larger than k=%d. "
                          "Setting it to 3*k" % (init_size, k),
                          RuntimeWarning, stacklevel=2)
            init_size = 3 * k
        init_indices = random_state.random_integers(0, n_samples - 1, init_size)
        X = X[init_indices]
        weights = weights[init_indices] if weights is not None else None
        x_squared_norms = x_squared_norms[init_indices]
        n_samples = X.shape[0]
    elif n_samples < k:
        raise ValueError("n_samples=%d should be larger than k=%d"
                         % (n_samples, k))

    if init == 'k-means++':
        assert weights is None and not sphered, \
            "k-means++ initialization is not supported for weighted or sphered data."
        centers = _k_init(X, k, random_state=random_state,
                          x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    elif hasattr(init, '__array__'):
        centers = init
    elif callable(init):
        centers = init(X, k, random_state=random_state, weights=weights)
    else:
        raise ValueError("the init parameter for the k-means should "
                         "be 'k-means++' or 'random' or an ndarray, "
                         "'%s' (type '%s') was passed." % (init, type(init)))

    if sp.issparse(centers):
        centers = centers.toarray()

    if len(centers) != k:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i'
                         % (centers.shape, k))

    return centers

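# Hedged usage sketch, not part of the original module: it assumes the module's
# own imports are in scope (numpy as np, scipy.sparse as sp, warnings, and
# check_random_state, _k_init, row_norms from sklearn). The names X_demo and
# centers_demo are illustrative only.
if __name__ == "__main__":
    X_demo = np.random.RandomState(0).randn(100, 5)
    centers_demo = _init_centroids(
        X_demo, k=4, init='k-means++', random_state=0,
        x_squared_norms=row_norms(X_demo, squared=True))
    # k-means++ seeding returns one center per requested cluster.
    assert centers_demo.shape == (4, 5)
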
def _init_w(self, V, X):
    """
    Initialize the topics W.
    If self.init='k-means++', we use the init method of sklearn.cluster.KMeans.
    If self.init='random', topics are initialized with a Gamma distribution.
    If self.init='k-means', topics are initialized with a KMeans on the n-grams
    counts.
    """
    if self.init == 'k-means++':
        if LooseVersion(sklearn_version) < LooseVersion('0.24'):
            W = _k_init(
                V, self.n_components,
                x_squared_norms=row_norms(V, squared=True),
                random_state=self.random_state,
                n_local_trials=None) + .1
        else:
            W, _ = kmeans_plusplus(
                V, self.n_components,
                x_squared_norms=row_norms(V, squared=True),
                random_state=self.random_state,
                n_local_trials=None)
            W = W + .1  # To avoid restricting topics to few n-grams only
    elif self.init == 'random':
        W = self.random_state.gamma(
            shape=self.gamma_shape_prior, scale=self.gamma_scale_prior,
            size=(self.n_components, self.n_vocab))
    elif self.init == 'k-means':
        prototypes = get_kmeans_prototypes(
            X, self.n_components, random_state=self.random_state)
        W = self.ngrams_count_.transform(prototypes).A + .1
        if self.add_words:
            W2 = self.word_count_.transform(prototypes).A + .1
            W = np.hstack((W, W2))
        # if k-means doesn't find the exact number of prototypes
        if W.shape[0] < self.n_components:
            if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                W2 = _k_init(
                    V, self.n_components - W.shape[0],
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None) + .1
            else:
                W2, _ = kmeans_plusplus(
                    V, self.n_components - W.shape[0],
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None)
                W2 = W2 + .1
            W = np.concatenate((W, W2), axis=0)
    else:
        raise AttributeError(
            'Initialization method %s does not exist.' % self.init)
    W /= W.sum(axis=1, keepdims=True)
    A = np.ones((self.n_components, self.n_vocab)) * 1e-10
    B = A.copy()
    return W, A, B

def solve(self, X):
    while True:
        if isinstance(self.init, np.ndarray):
            assert X.shape[1] == self.init.shape[1]
            assert self.init.shape[0] == self.k
            assert X.shape[0] > self.init.shape[0]
            init_means = self.init
        elif self.init == 'random':
            init_means = X[self.rng.choice(X.shape[0], size=self.k, replace=False)]
        elif self.init == 'k-means++':
            squared_norms = row_norms(X, squared=True)
            init_means = _k_init(X, n_clusters=self.k,
                                 x_squared_norms=squared_norms,
                                 random_state=self.rng)
        else:
            raise ValueError('Got unrecognized init parameter: {}'.format(self.init))

        self.labels = assign_to_closest(X, init_means)
        self.weights = np.array([
            np.sum(self.labels == i) / X.shape[0] for i in xrange(self.k)
        ])

        if any(self.weights < self.del_treshold) or any(
                self.weights * X.shape[0] < X.shape[1] + 2):
            if self.allow_lowering_k:
                self.k -= 1
                if self.verbose:
                    logger.info("Failed initialization, decreasing k to {}".format(self.k))
            else:
                self.seed += 1
                self.rng = np.random.RandomState(self.seed)
        else:
            break

    self.means = np.array([
        np.mean(X[np.where(self.labels == i)[0]], axis=0) for i in xrange(self.k)
    ])
    self.covs = np.array([
        np.cov(X[np.where(self.labels == i)[0]].T, bias=True) for i in xrange(self.k)
    ])
    self.removed_clusters = self.k * [False]

    removed_now = False
    it = 0
    energies = []
    while it <= self.max_iter:
        change = False
        update_iter = False
        if removed_now:
            removed_now = False
            update_iter = True

        for idx, x in enumerate(X):
            for candidate_cl in xrange(self.k):
                current_cl = self.labels[idx]
                # skip removed clusters and x's current cluster
                if self.removed_clusters[candidate_cl] or candidate_cl == current_cl:
                    continue

                current_cost = self.cec_cost(current_cl) + self.cec_cost(candidate_cl)
                old_weights = self.weights.copy()
                old_means = self.means.copy()
                old_covs = self.covs.copy()

                # calculate energy for the candidate cluster
                self.calculate_new_params(x, candidate_cl, add=True)
                cost_added = self.cec_cost(candidate_cl)

                # calculate energy for the current cluster
                if self.removed_clusters[current_cl]:
                    current_cost = np.inf
                    cost_removed = 0
                else:
                    self.calculate_new_params(x, current_cl, add=False)
                    cost_removed = self.cec_cost(current_cl)

                # check if changing x's cluster would result in lower energy
                if (cost_removed + cost_added) < current_cost:
                    # assign x to the new cluster
                    self.labels[idx] = candidate_cl
                    change = True
                    # delete small clusters
                    if not update_iter and not self.removed_clusters[current_cl]:
                        if (self.weights[current_cl] < self.del_treshold or
                                np.sum(self.labels == current_cl) < X.shape[1] + 2):
                            if self.verbose:
                                logger.info(
                                    "\t Deleting small cluster {}, running updating iteration"
                                    .format(current_cl))
                            self.removed_clusters[current_cl] = True
                            removed_now = True
                            self.max_iter += 1
                else:
                    # revert the tentative parameter updates
                    self.weights = old_weights
                    self.means = old_means
                    self.covs = old_covs

        if not removed_now:
            energy = np.sum(np.array([
                self.cec_cost(i) for i in range(self.k)
                if not self.removed_clusters[i]
            ]))
            it += 1
            if it == self.max_iter:
                if self.verbose:
                    logger.warning(
                        "\t Maximum number of iterations reached, final energy: {}"
                        .format(energy))
                break
            if self.verbose:
                logger.info("\t Iter {} Energy {}".format(it, energy))
            if not change:
                if self.verbose:
                    logger.info("\t No switch in clusters, done")
                break
            energies.append(energy)
            if len(energies) > 3 and np.std(energies[-3:]) < self.tol:
                if self.verbose:
                    logger.info("\t Energy change less than tolerance, done")
                break

    alive_clusters = np.invert(self.removed_clusters)
    weights = self.weights[alive_clusters]
    means = self.means[alive_clusters]
    covs = self.covs[alive_clusters]
    return energy, self.labels, weights, means, covs

def time_kmeansplusplus(self, *args):
    _k_init(self.X, self.n_clusters, self.x_squared_norms,
            random_state=np.random.RandomState(0))

def kmeans(data, K, p, method="gd", eps=1e-4, step_size=0.1, km_max_step=3000,
           gd_max_step=5000, rs=None):
    if method == "gd":
        find_center = gradient_descent
    elif method == "nr":
        find_center = newton_raphson

    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))

    diff = 100
    prev = -10
    ct = 0
    while diff > eps:
        ct += 1
        if ct > km_max_step:
            print("Kmeans reach max steps", average_mse)
            break

        # compute assignment
        all_dist = []
        for k in range(K):
            dist = distance(centers[k], data, p)
            all_dist.append(dist)
        assign = np.stack(all_dist, axis=0).argmin(axis=0)

        # update centers
        average_mse = 0  # intra-cluster distance, similar to mean squared error for Euclidean distance
        track_niter = []
        for k in range(K):
            mask = assign == k
            if mask.sum() == 0:  # skip empty cluster
                continue
            d = data[mask]  # data assigned to cluster k
            init_c = d.mean(axis=0)  # use the cluster mean to initialize
            new_c, se, niter, diff = find_center(init_c, d, p, eps=eps,
                                                 step_size=step_size,
                                                 max_step=gd_max_step)
            centers[k] = new_c
            track_niter.append(niter)
            average_mse += se

        average_mse = average_mse / data.shape[0]
        diff = np.abs(average_mse - prev)
        prev = average_mse
    return centers, average_mse, assign, ct

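# Hedged usage sketch, not part of the original code: it assumes the helpers the
# function relies on (distance, gradient_descent, newton_raphson, _k_init and
# numpy as np) are importable from the surrounding module. The data below is
# synthetic and the variable names are illustrative only.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    # two well-separated Gaussian blobs in 2-D
    data = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 5.0])
    centers, average_mse, assign, n_iter = kmeans(data, K=2, p=2, method="gd",
                                                  eps=1e-4, rs=0)
    print(centers.shape, average_mse, assign.shape, n_iter)
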
def kmeans_mp(m, data, K, p, method="gd", eps=1e-4, step_size=0.1, km_max_step=2000,
              gd_max_step=5000, rs=None):
    """
    multiprocess kmeans
    m: number of processes to create
    """
    import multiprocessing as mp
    ctx = mp.get_context('fork')
    global MP_X
    MP_X = data
    pool = mp.Pool(processes=m)

    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))

    diff = 100
    prev = -10
    ct = 0
    kwargs = {"eps": eps, "step_size": step_size, "max_step": gd_max_step}

    # initialize distances for the first run
    dist = []
    for k in range(K):
        d = distance(centers[k], data, p)
        dist.append(d)
    assign = np.stack(dist, axis=0).argmin(axis=0)

    while diff > eps:
        ct += 1
        average_mse = 0.0

        # for each cluster, build the input to the _mp_gd_helper function
        inp = []
        for k in range(K):
            mask = assign == k
            init = data[mask].mean(axis=0)
            d = [method, p, k, init, mask, kwargs]
            inp.append(d)

        # list of outputs from _mp_gd_helper
        rslt = pool.starmap(_mp_gd_helper, inp)

        dist = []
        niters = []
        mse = 0.0
        for cid, new_c, Xdist, info in rslt:
            centers[cid] = new_c
            dist.append(Xdist)
            niters.append(info[0])
            mse += info[1]

        assign = np.stack(dist, axis=0).argmin(axis=0)
        # average intra-cluster distance, similar to mean squared error for Euclidean distance
        average_mse = mse / data.shape[0]
        diff = np.abs(average_mse - prev)
        prev = average_mse
        break  # NOTE: as written, this performs a single k-means pass

    pool.close()
    return centers, average_mse, assign

x_squared_norms = row_norms(scaled_x_train, squared=True)

if not sp.issparse(scaled_x_train):
    scaled_x_train_mean = scaled_x_train.mean(axis=0)
    scaled_x_train -= scaled_x_train_mean
if not sp.issparse(scaled_x_test):
    scaled_x_test_mean = scaled_x_test.mean(axis=0)
    scaled_x_test -= scaled_x_test_mean
if not sp.issparse(scaled_data_original):
    scaled_data_original_mean = scaled_data_original.mean(axis=0)
    scaled_data_original -= scaled_data_original_mean

# Initializing the centers using the k-means++ implementation of sklearn
centers = _k_init(scaled_x_train, K, random_state=random_state,
                  x_squared_norms=x_squared_norms)


def find_centers(X, n_clusters, centers):
    '''
    Function to find centers using Lloyd's algorithm.
    Parameters to be passed:
    1. data for which centers are to be found,
    2. number of centers &
    3. initial centers
    '''
    centers = centers
    K = np.arange(n_clusters)
    i = 0
    while True:
        print("Iteration: ", i)
        i = i + 1

def peakmem_kmeansplusplus(self):
    rng = np.random.RandomState(0)
    _k_init(self.X, self.n_clusters, self.x_squared_norms, random_state=rng)

def k_means_gpu_sparsity(weight_vector, n_clusters, ratio=0.5, verbosity=0,
                         seed=int(time.time()), gpu_id=0):
    if ratio == 0:
        return k_means_gpu(weight_vector=weight_vector, n_clusters=n_clusters,
                           verbosity=verbosity, seed=seed, gpu_id=gpu_id)
    if ratio == 1:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        else:
            # replace every row with the global mean
            weight_vector_1_mean = np.mean(weight_vector, axis=0)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]), dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean
            return weight_vector_compress
    else:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        elif weight_vector.shape[1] == 1:
            return k_means_sparsity(weight_vector, n_clusters, ratio, seed=seed)
        else:
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)
            # rows closest to the global mean form one shared "center" cluster
            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)
            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples) if i not in center_cluster_index
            ])
            weight_vector_train = weight_vector[remaining_cluster_index, :]
            init_centers = k_means_._k_init(
                X=weight_vector_train, n_clusters=n_clusters - 1,
                x_squared_norms=row_norms(weight_vector_train, squared=True),
                random_state=RandomState(seed))
            centers, labels = kmeans_cuda(
                samples=weight_vector_train, clusters=n_clusters - 1,
                init=init_centers, yinyang_t=0, seed=seed, device=gpu_id,
                verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]), dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean
            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            return weight_vector_compress
