def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        for csr_index_dtype in [np.int32, np.int64]:
            Xcsr = sparse.csr_matrix(X, dtype=dtype)
            # csr_matrix will use int32 indices by default,
            # up-casting those to int64 when necessary
            if csr_index_dtype is np.int64:
                Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
                Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
            assert Xcsr.indices.dtype == csr_index_dtype
            assert Xcsr.indptr.dtype == csr_index_dtype

            assert_array_almost_equal(sq_norm,
                                      row_norms(Xcsr, squared=True),
                                      precision)
            assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                      precision)
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    sq_norm = (X ** 2).sum(axis=1)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X))

    Xcsr = sparse.csr_matrix(X, dtype=np.float32)
    assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr))
def euclidean_distances(X, Y=None):
    if Y is None:
        Y = X
    YY = row_norms(Y, squared=True)[np.newaxis, :]
    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
        XX = YY.T
    else:
        XX = row_norms(X, squared=True)[:, np.newaxis]
    distances = np.dot(X, Y.T)
    distances *= -2
    distances += XX
    distances += YY
    np.maximum(distances, 0, out=distances)
    return distances
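The expansion above returns squared distances. A quick sanity check of this helper against SciPy's direct computation (a sketch; assumes scipy is installed and the euclidean_distances defined above is in scope):

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
A, B = rng.randn(20, 5), rng.randn(30, 5)
# the dot-product expansion loses a little precision, hence the tolerance
assert np.allclose(np.sqrt(euclidean_distances(A, B)), cdist(A, B),
                   atol=1e-8)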
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for fit_intercept in (True, False):
        step_size_sqr = 1.0 / (max_squared_sum + alpha + int(fit_intercept))
        step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                               int(fit_intercept))
        step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha,
                                            "squared", fit_intercept)
        step_size_log_ = get_auto_step_size(max_squared_sum_, alpha, "log",
                                            fit_intercept)
        assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
        assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
def fit(self, X, y):
    """Fit factorization machine to training data.

    Parameters
    ----------
    X : array-like or sparse, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : Estimator
        Returns self.
    """
    if self.degree > 3:
        raise ValueError("FMs with degree >3 not yet supported.")

    X, y = self._check_X_y(X, y)
    X = self._augment(X)
    n_features = X.shape[1]  # augmented
    X_col_norms = row_norms(X.T, squared=True)
    dataset = get_dataset(X, order="fortran")
    rng = check_random_state(self.random_state)
    loss_obj = self._get_loss(self.loss)

    if not (self.warm_start and hasattr(self, 'w_')):
        self.w_ = np.zeros(n_features, dtype=np.double)

    if self.fit_lower == 'explicit':
        n_orders = self.degree - 1
    else:
        n_orders = 1

    if not (self.warm_start and hasattr(self, 'P_')):
        self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)

    if not (self.warm_start and hasattr(self, 'lams_')):
        if self.init_lambdas == 'ones':
            self.lams_ = np.ones(self.n_components)
        elif self.init_lambdas == 'random_signs':
            self.lams_ = np.sign(rng.randn(self.n_components))
        else:
            raise ValueError("Lambdas must be initialized as ones "
                             "(init_lambdas='ones') or as random "
                             "+/- 1 (init_lambdas='random_signs').")

    y_pred = self._get_output(X)

    converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms, y,
                              y_pred, self.lams_, self.degree, self.alpha,
                              self.beta, self.fit_linear,
                              self.fit_lower == 'explicit', loss_obj,
                              self.max_iter, self.tol, self.verbose)
    if not converged:
        warnings.warn("Objective did not converge. Increase max_iter.")

    return self
def compute_distances(self, x1, x2=None):
    """
    The method

    - extracts normalized continuous attributes and then uses `row_norms`
      and `safe_sparse_dot` to compute the distance as
      x^2 - 2xy + y^2 (the trick from sklearn);
    - calls a function in Cython that adds the contributions of discrete
      columns
    """
    if self.normalize:
        x1 = x1 - self.means
        x1 /= np.sqrt(2 * self.vars)

    # adapted from sklearn.metric.euclidean_distances
    xx = row_norms(x1.T, squared=True)[:, np.newaxis]
    distances = safe_sparse_dot(x1.T, x1, dense_output=True)
    distances *= -2
    distances += xx
    distances += xx.T
    with np.errstate(invalid="ignore"):  # Nans are fixed below
        np.maximum(distances, 0, out=distances)
    distances.flat[::distances.shape[0] + 1] = 0.0

    fixer = _distance.fix_euclidean_cols_normalized if self.normalize \
        else _distance.fix_euclidean_cols
    fixer(distances, x1, self.means, self.vars)
    return np.sqrt(distances)
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = -np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
def compute_distances(self, x1, x2=None):
    """
    The method

    - extracts normalized continuous attributes and then uses `row_norms`
      and `safe_sparse_dot` to compute the distance as
      x^2 - 2xy + y^2 (the trick from sklearn);
    - calls a function in Cython that recomputes the distances between
      pairs of rows that yielded nan
    - calls a function in Cython that adds the contributions of discrete
      columns
    """
    if self.continuous.any():
        data1, data2 = self.continuous_columns(
            x1, x2, self.means, np.sqrt(2 * self.vars))

        # adapted from sklearn.metric.euclidean_distances
        xx = row_norms(data1, squared=True)[:, np.newaxis]
        if x2 is not None:
            yy = row_norms(data2, squared=True)[np.newaxis, :]
        else:
            yy = xx.T
        distances = safe_sparse_dot(data1, data2.T, dense_output=True)
        distances *= -2
        distances += xx
        distances += yy
        with np.errstate(invalid="ignore"):  # Nans are fixed below
            np.maximum(distances, 0, out=distances)
        if x2 is None:
            distances.flat[::distances.shape[0] + 1] = 0.0

        fixer = _distance.fix_euclidean_rows_normalized if self.normalize \
            else _distance.fix_euclidean_rows
        fixer(distances, data1, data2, self.means, self.vars,
              self.dist_missing2_cont, x2 is not None)
    else:
        distances = np.zeros((x1.shape[0],
                              (x2 if x2 is not None else x1).shape[0]))

    if np.any(self.discrete):
        data1, data2 = self.discrete_columns(x1, x2)
        _distance.euclidean_rows_discrete(
            distances, data1, data2, self.dist_missing_disc,
            self.dist_missing2_disc, x2 is not None)

    if x2 is None:
        _distance.lower_to_symmetric(distances)
    return np.sqrt(distances)
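The "trick from sklearn" that both docstrings refer to is the identity ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2. A minimal, self-contained numpy check of that identity using the same sklearn helpers (a standalone sketch, not part of the class above):

import numpy as np
from sklearn.utils.extmath import row_norms, safe_sparse_dot

rng = np.random.RandomState(0)
a, b = rng.randn(4, 3), rng.randn(5, 3)
# expansion-based squared distances
d2 = (row_norms(a, squared=True)[:, None]
      - 2 * safe_sparse_dot(a, b.T)
      + row_norms(b, squared=True)[None, :])
# direct computation via broadcasting
direct = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
assert np.allclose(d2, direct)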
def get_kpp_init(X, n_clusters, random_state=None):
    random_state = check_random_state(random_state)
    x_squared_norms = row_norms(X, squared=True)
    centers = sklearn.cluster.k_means_._k_init(
        X, n_clusters, random_state=random_state,
        x_squared_norms=x_squared_norms)  # n_clusters x D
    W = np.transpose(centers)  # D x D^(1)
    W_tf = tf.constant(W)
    return centers, W, W_tf
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10,
                  init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('K-Means_Spark').setMaster(
        'local[%d]' % worker_nums)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    # x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    # x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)

    bs = X.shape[0] // worker_nums
    data_temp = []
    for i in range(worker_nums - 1):
        data_temp.append(X[i * bs:(i + 1) * bs])
    data_temp.append(X[(worker_nums - 1) * bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()

    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(
            lambda x: euclidean_distances(centers, x, squared=True)
        ).collect()
        temp_all_distances = all_distances[0]
        for j in range(1, worker_nums):
            temp_all_distances = np.hstack((temp_all_distances,
                                            all_distances[j]))
        all_distances = temp_all_distances

        # all_distances = data.map(lambda x: euclidean_distances(
        #     centers, x, squared=True)).collect()
        # # reshape, from (1, n_samples, k) to (k, n_samples)
        # all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers,
                                          all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)
    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                  precision)
def get_auto_step_size(X, alpha, loss, gamma=None, sample_weight=None):
    """Compute automatic step size for SAG solver.

    The step size is the inverse of the Lipschitz constant of the gradient
    of the objective

        minimize_w 1 / n_samples * \sum_i loss(w^T x_i, y_i)
                   + alpha * 0.5 * ||w||^2_2

    Parameters
    ----------
    X : ndarray
        Array of samples x_i.

    alpha : float
        Constant that multiplies the l2 penalty term.

    loss : string, in {"log", "squared", "modified_huber", "smooth_hinge",
        "squared_hinge"}
        The loss function used in SAG solver.

    Returns
    -------
    step_size : float
        Step size used in SAG/SAGA solver.
    """
    if sample_weight is None:
        weighted_norms = row_norms(X, squared=True)
    else:
        weighted_norms = sample_weight * row_norms(X, squared=True)
    L = np.max(weighted_norms)
    n_samples = X.shape[0]

    if loss == 'log':
        # Lipschitz constant of the gradient for log loss
        lipschitz_constant = 0.25 * L + alpha
    elif loss == 'squared':
        lipschitz_constant = L + alpha
    elif loss == 'modified_huber':
        lipschitz_constant = 2 * L + alpha
    elif loss == 'smooth_hinge':
        lipschitz_constant = L + gamma + alpha
    elif loss == 'squared_hinge':
        lipschitz_constant = 2 * L + alpha
    else:
        raise ValueError("`auto` stepsize is only available for `squared`, "
                         "`log`, `modified_huber`, `smooth_hinge` or "
                         "`squared_hinge` losses (got `%s` loss). Please "
                         "specify a stepsize." % loss)
    return 1.0 / lipschitz_constant
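A small usage sketch for this variant (note it takes the raw sample array X, unlike the sklearn version that takes max_squared_sum): for squared loss the returned step size is 1 / (max_i ||x_i||^2 + alpha).

import numpy as np
from sklearn.utils.extmath import row_norms

X = np.array([[1., 2.], [3., 4.]])
eta = get_auto_step_size(X, alpha=0.1, loss="squared")
# the largest squared row norm here is 3^2 + 4^2 = 25
assert np.isclose(eta, 1.0 / (row_norms(X, squared=True).max() + 0.1))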
def fit(self, X):
    x_squared_norms = row_norms(X, squared=True)
    rng = np.random.RandomState(self.random_state)

    if self.init == "kmeans++":
        # Private function of sklearn.cluster.k_means_, to get the
        # initial centers.
        init_centers = _k_init(X, self.n_clusters, x_squared_norms, rng)
    elif self.init == "random":
        random_samples = rng.randint(0, X.shape[0], size=self.n_clusters)
        init_centers = X[random_samples, :]
    else:
        raise ValueError("init should be either kmeans++ or random")

    # Assign initial labels. The x**2 term is constant per sample, so it
    # can be skipped for the argmin.
    init_distances = (np.sum(init_centers**2, axis=1) -
                      2 * np.dot(X, init_centers.T))
    init_labels = np.argmin(init_distances, axis=1)
    self.labels_ = init_labels
    self.centers_ = init_centers
    self.n_samples_ = np.zeros(self.n_clusters)

    # Count the number of samples in each cluster.
    for i in range(self.n_clusters):
        self.n_samples_[i] = np.sum(self.labels_ == i)

    for i, (sample, label) in enumerate(zip(X, self.labels_)):
        curr_label = label
        max_cost = np.inf

        while max_cost > 0:
            distances = (x_squared_norms[i] -
                         2 * np.dot(sample, self.centers_.T) +
                         np.sum(self.centers_**2, axis=1))
            curr_distance = distances[curr_label]
            other_distance = np.delete(distances, curr_label)
            curr_n_samples = self.n_samples_[curr_label]
            other_n_samples = np.delete(self.n_samples_, curr_label)

            cost = ((curr_n_samples / (curr_n_samples - 1) * curr_distance) -
                    (other_n_samples / (other_n_samples + 1) *
                     other_distance))
            max_cost_ind = np.argmax(cost)
            max_cost = cost[max_cost_ind]

            if max_cost > 0:
                # We deleted the label index from other_n_samples
                if max_cost_ind > curr_label:
                    max_cost_ind += 1

                # Reassign the clusters
                self.labels_[i] = max_cost_ind
                self.centers_[curr_label] = (
                    (curr_n_samples * self.centers_[curr_label] - sample) /
                    (curr_n_samples - 1))
                moved_n_samples = self.n_samples_[max_cost_ind]
                self.centers_[max_cost_ind] = (
                    (moved_n_samples * self.centers_[max_cost_ind] + sample) /
                    (moved_n_samples + 1))
                self.n_samples_[curr_label] -= 1
                self.n_samples_[max_cost_ind] += 1
                curr_label = max_cost_ind
def prepare_data(x):
    if self.discrete.any():
        data = Cosine.discrete_to_indicators(x, self.discrete)
    else:
        data = x.copy()
    for col, mean in enumerate(self.means):
        column = data[:, col]
        column[np.isnan(column)] = mean
    if self.axis == 0:
        data = data.T
    data /= row_norms(data)[:, np.newaxis]
    return data
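The final division by row_norms is what turns subsequent dot products into cosine similarities: every row ends up with unit L2 norm. A tiny standalone check of that step (a sketch, not part of the class above):

import numpy as np
from sklearn.utils.extmath import row_norms

data = np.random.RandomState(0).rand(5, 3) + 0.1  # keep norms nonzero
data /= row_norms(data)[:, np.newaxis]
assert np.allclose(row_norms(data), 1.0)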
def kmeans_subsample(X, n_clusters, random_state=None, n_local_trials=10):
    random_state = check_random_state(random_state)

    n_samples, n_features = X.shape
    x_squared_norms = row_norms(X, squared=True)

    centers = np.empty((n_clusters, n_features))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(
        centers[0].reshape(1, -1), X, Y_norm_squared=x_squared_norms,
        squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms,
            squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
def kmeanspp(X, k, seed):
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)
    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed,
                                x_squared_norms=x_squared_norms)
    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution
    nns = NearestNeighbors()
    nns.fit(X)
    centroid_candidatess = nns.radius_neighbors(X=centroids, radius=0,
                                                return_distance=False)
    # Account for "degenerated" solutions: several voxels at distance 0,
    # each becoming a centroid
    centroids = set()
    for centroid_candidates in centroid_candidatess:
        centroid_candidates = set(centroid_candidates) - centroids
        if len(centroid_candidates) == 0:
            raise Exception('Cannot get an unambiguous set of centers; '
                            'theoretically this cannot happen, so check '
                            'for bugs')
        centroids.add(centroid_candidates.pop())
    return np.array(sorted(centroids))
def _init_centroids(X, k, init, random_state, x_squared_norms=None):
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    if n_samples < k:
        raise ValueError("n_samples=%d should be larger than k=%d" %
                         (n_samples, k))

    if init == 'k-means++':
        centers = _k_init(X, k, random_state=random_state,
                          x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    else:
        raise ValueError("init should be 'k-means++' or 'random', got %r"
                         % (init,))

    return centers
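Hypothetical usage of the helper above: the 'random' branch needs nothing beyond numpy, while 'k-means++' additionally requires sklearn's private _k_init to be importable in the same module.

import numpy as np

X = np.random.RandomState(0).randn(50, 4)
centers = _init_centroids(X, k=3, init='random', random_state=0)
assert centers.shape == (3, 4)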
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    n_samples = X.shape[0]
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            if saga:
                L_sqr = (max_squared_sum + alpha + int(fit_intercept))
                L_log = (max_squared_sum + 4.0 * alpha +
                         int(fit_intercept)) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)
            else:
                step_size_sqr = 1.0 / (max_squared_sum +
                                       alpha + int(fit_intercept))
                step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                                       int(fit_intercept))

            step_size_sqr_ = get_auto_step_size(
                max_squared_sum_, alpha, "squared", fit_intercept,
                n_samples=n_samples, is_saga=saga)
            step_size_log_ = get_auto_step_size(
                max_squared_sum_, alpha, "log", fit_intercept,
                n_samples=n_samples, is_saga=saga)

            assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
            assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    # check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
def run_step(self, run_number, step_size, howlong):
    df_slot = self.get_input_slot('df')
    df_slot.update(run_number, buffer_created=True, buffer_updated=True)
    if df_slot.has_deleted():
        self.reset()
        df_slot.reset()
        df_slot.update(run_number)
    input_df = df_slot.data()
    columns = self.get_columns(input_df)
    if input_df is None or len(input_df) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)
    indices = df_slot.next_created(step_size)
    steps = indices_len(indices)
    step_size -= steps
    steps_run = steps
    if steps != 0:
        indices = fix_loc(indices)
        self._buffer.append(input_df.loc[indices])
        self._df = self._buffer.df()
        self._df.loc[indices, self.UPDATE_COLUMN] = run_number
    if step_size > 0 and df_slot.has_updated():
        indices = df_slot.next_updated(step_size, as_slice=False)
        steps = indices_len(indices)
        if steps != 0:
            steps_run += steps
            indices = fix_loc(indices)  # no need, but stick to the stereotype
            updated = self.filter_columns(input_df, indices)
            df = self.filter_columns(self._df, indices)
            norms = row_norms(updated - df)
            selected = (norms > (self._delta * self.get_scale()))
            indices = indices[selected]
            if selected.any():
                logger.debug('updating at %d', run_number)
                self._df.loc[indices, self._columns] = updated.loc[
                    indices, self._columns]
                self._df.loc[indices, self.UPDATE_COLUMN] = run_number
            else:
                logger.debug('Not updating at %d', run_number)
    return self._return_run_step(df_slot.next_state(), steps_run=steps_run)
def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++',
                   random_state=None, tol=1e-4):
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    x_squared_norms = row_norms(X, squared=True)
    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)
    # distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers)

    return best_centers, best_labels, best_inertia
from sklearn.utils import check_random_state
from sklearn.cluster import KMeans as skKMeans

# %%
data = pd.read_csv('s1.csv', sep=',')

# %%
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data))

# %%
# K-means++ for the initial clusters
N = 15
random_state = 42
fit_data = np.asarray(data_scaled)
x_squared_norms = row_norms(fit_data, squared=True)
random = check_random_state(random_state)
initial_clusters = k_init(fit_data, N, x_squared_norms, random)

max_norm = KMeans(N, initial_clusters, order=np.inf)
max_norm.fit(fit_data)
manhattan = KMeans(N, initial_clusters, order=1)
manhattan.fit(fit_data)
euclid = KMeans(N, initial_clusters, order=2)
euclid.fit(fit_data)

# %%
fig, axs = plt.subplots(2, 3)
data_scaled['max'] = max_norm.labels
center = max_norm.centroids
axs[0, 0].scatter(data_scaled[0], data_scaled[1],
data = data_original[idx]
labels = labels_original[idx]
x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                    test_size=0.20)

# Scaling of features
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.transform(x_test)
scaled_data_original = scaler.transform(data_original)

random_state = 0
K = 5896
gamma = 177.82

random_state = check_random_state(random_state)
x_squared_norms = row_norms(scaled_x_train, squared=True)

if not sp.issparse(scaled_x_train):
    scaled_x_train_mean = scaled_x_train.mean(axis=0)
    scaled_x_train -= scaled_x_train_mean

if not sp.issparse(scaled_x_test):
    scaled_x_test_mean = scaled_x_test.mean(axis=0)
    scaled_x_test -= scaled_x_test_mean

if not sp.issparse(scaled_data_original):
    scaled_data_original_mean = scaled_data_original.mean(axis=0)
    scaled_data_original -= scaled_data_original_mean

# Initializing the centers using the k-means++ implementation of sklearn
centers = _k_init(scaled_x_train, K, random_state=random_state,
                  x_squared_norms=x_squared_norms)
from sklearn import metrics
from sklearn.cluster import KMeans
import numpy as np
from time import time
from sklearn.utils.extmath import row_norms, squared_norm
from sklearn.metrics.pairwise import euclidean_distances

k = 10
csv = np.genfromtxt('census_50k.csv', delimiter=",")
sh = csv.shape
mu = np.ones((k, sh[1]))
x_squared_norms = row_norms(csv, squared=True)

t0 = time()
for i in range(100):
    all_distances = euclidean_distances(mu, csv, x_squared_norms,
                                        squared=True)
    mu = mu + 1
t = time() - t0
print(t)
def convert_sklearn_kmeans(scope, operator, container):
    """
    Computation graph of distances to all centroids for a batch of examples.
    Note that a centroid is just the center of a cluster. We use ``[]`` to
    denote the dimension of a variable; for example, ``X[3, 2]`` means that
    *X* is a *3-by-2* tensor. In addition, for a matrix *X*, $X'$ denotes
    its transpose.

    Symbols:

    * *l*: # of examples.
    * *n*: # of features per input example.
    * *X*: input examples, l-by-n tensor.
    * *C*: centroids, k-by-n tensor.
    * $C^2$: 2-norm of all centroid vectors, its shape is ``[k]``.
    * *Y*: 2-norm of difference between examples and centroids,
      *l-by-k* tensor. The value at i-th row and k-th column, ``Y[i,k]``,
      is the distance from example *i* to centroid *k*.
    * *L*: the id of the nearest centroid for each input example, its
      shape is ``[l]``.

    ::

         .------------------------------------------------------------.
         |                                                            |
         |                                                            v
        X [l, n] --> ReduceSumSquare -> X^2 [l]   Gemm (alpha=-2, transB=1) <- C [k, n]
                                         |              |
                                         |              v
                                         `------> Add <-- -2XC' [l, k]
                                                   |
                                                   v
                 C^2 [k] --------> Add <----- Z [l, k]
                                    |
                                    v
                L [l] <-- ArgMin <-- Y2 [l, k] --> Sqrt --> Y [l, k]

    *scikit-learn* code:

    ::

        X = data
        Y = model.cluster_centers_
        XX = row_norms(X, squared=True)
        YY = row_norms(Y, squared=True)
        distances = safe_sparse_dot(X, Y.T, dense_output=True)
        distances *= -2
        distances += XX[:, numpy.newaxis]
        distances += YY[numpy.newaxis, :]
        numpy.sqrt(distances, out=distances)
    """
    op = operator.raw_operator
    variable = operator.inputs[0]
    N = variable.type.shape[0]

    # centroids
    shapeC = list(op.cluster_centers_.shape)
    nameC = scope.get_unique_variable_name('centroid')
    container.add_initializer(nameC, onnx_proto.TensorProto.FLOAT,
                              shapeC, op.cluster_centers_.flatten())

    # Compute X^2
    nameX2 = scope.get_unique_variable_name('X2')
    nameX = operator.inputs[0].full_name
    container.add_node('ReduceSumSquare', [nameX], [nameX2], axes=[1],
                       keepdims=1,
                       name=scope.get_unique_operator_name('ReduceSumSquare'))

    # Compute -2XC'
    zero_name = scope.get_unique_variable_name('zero')
    zeros = np.zeros((N, ))
    container.add_initializer(zero_name, onnx_proto.TensorProto.FLOAT,
                              list(zeros.shape), zeros)
    nameXC2 = scope.get_unique_variable_name('XC2')
    apply_gemm(scope, [nameX, nameC, zero_name], [nameXC2], container,
               alpha=-2., transB=1)

    # Compute Z = X^2 - 2XC'
    nameZ = scope.get_unique_variable_name("Z")
    apply_add(scope, [nameXC2, nameX2], [nameZ], container)

    # centroids ^2
    nameC2 = scope.get_unique_variable_name('C2')
    c2 = row_norms(op.cluster_centers_, squared=True)
    container.add_initializer(nameC2, onnx_proto.TensorProto.FLOAT,
                              [1, shapeC[0]], c2.flatten())

    # Compute Y2 = Z + C^2
    nameY2 = scope.get_unique_variable_name('Y2')
    apply_add(scope, [nameZ, nameC2], [nameY2], container)

    # Compute Y = sqrt(Y2)
    nameY = operator.outputs[1].full_name
    apply_sqrt(scope, [nameY2], [nameY], container)

    # Compute the most-matched cluster index, L
    nameL = operator.outputs[0].full_name
    container.add_node('ArgMin', [nameY2], [nameL],
                       name=scope.get_unique_operator_name('ArgMin'),
                       axis=1, keepdims=0)
def row_norms(X, squared=False):
    if isinstance(X, np.ndarray):
        return skm.row_norms(X, squared=squared)
    return X.map_blocks(
        skm.row_norms, chunks=(X.chunks[0],), drop_axis=1, squared=squared
    )
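A possible usage sketch of this dispatching wrapper, assuming `skm` aliases `sklearn.utils.extmath` and dask is installed; the blocked dask path should agree with the plain numpy path:

import numpy as np
import dask.array as da

X = np.random.RandomState(0).randn(100, 10)
dX = da.from_array(X, chunks=(25, 10))
# each block yields the norms of its own rows, so the results concatenate
assert np.allclose(row_norms(dX).compute(), row_norms(X))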
def k_means_gpu_sparsity(weight_vector, n_clusters, ratio=0.5, verbosity=0,
                         seed=int(time.time()), gpu_id=0):
    if ratio == 0:
        return k_means_gpu(weight_vector=weight_vector,
                           n_clusters=n_clusters, verbosity=verbosity,
                           seed=seed, gpu_id=gpu_id)
    if ratio == 1:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        else:
            weight_vector_1_mean = np.mean(weight_vector, axis=0)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean
            return weight_vector_compress
    else:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        elif weight_vector.shape[1] == 1:
            return k_means_sparsity(weight_vector, n_clusters, ratio,
                                    seed=seed)
        else:
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)
            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)
            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples)
                if i not in center_cluster_index
            ])
            weight_vector_train = weight_vector[remaining_cluster_index, :]
            init_centers = k_means_._k_init(
                X=weight_vector_train, n_clusters=n_clusters - 1,
                x_squared_norms=row_norms(weight_vector_train, squared=True),
                random_state=RandomState(seed))
            centers, labels = kmeans_cuda(samples=weight_vector_train,
                                          clusters=n_clusters - 1,
                                          init=init_centers, yinyang_t=0,
                                          seed=seed, device=gpu_id,
                                          verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean
            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            return weight_vector_compress
def _initialize_nrkmeans_parameters(X, n_clusters, V, m, P, centers,
                                    max_iter, random_state):
    """
    Initialize the input parameters of NrKmeans. This means that all input
    values which are None must be defined. All input parameters which are
    not None are checked to ensure a correct execution is possible.

    :param X: input data
    :param n_clusters: list containing number of clusters for each subspace
    :param V: orthogonal rotation matrix
    :param m: list containing number of dimensionalities for each subspace
    :param P: list containing projections for each subspace
    :param centers: list containing the cluster centers for each subspace
    :param max_iter: maximum number of iterations for the algorithm
    :param random_state: use a fixed random state to get a repeatable
        solution
    :return: checked V, m, P, centers, random_state, number of subspaces,
        labels, scatter_matrices
    """
    data_dimensionality = X.shape[1]
    random_state = check_random_state(random_state)

    # Check if n_clusters is a list
    if not isinstance(n_clusters, list):
        raise ValueError(
            "Number of clusters must be specified for each subspace and "
            "therefore be a list.\nYour input:\n" + str(n_clusters))
    # Check if n_clusters contains negative values
    if len([x for x in n_clusters if x < 1]) > 0:
        raise ValueError(
            "Number of clusters must not contain negative values or 0."
            "\nYour input:\n" + str(n_clusters))
    # Check if n_clusters contains more than one noise space
    nr_noise_spaces = len([x for x in n_clusters if x == 1])
    if nr_noise_spaces > 1:
        raise ValueError(
            "Only one subspace can be the noise space (number of clusters "
            "= 1).\nYour input:\n" + str(n_clusters))
    # Check if the noise space is the last member in n_clusters
    if nr_noise_spaces != 0 and n_clusters[-1] != 1:
        raise ValueError(
            "Noise space (number of clusters = 1) must be the last entry "
            "in n_clusters.\nYour input:\n" + str(n_clusters))
    # Get number of subspaces
    subspaces = len(n_clusters)

    # Check if V is orthogonal
    if V is None:
        V = ortho_group.rvs(dim=data_dimensionality,
                            random_state=random_state)
    if not _is_matrix_orthogonal(V):
        raise Exception("Your input matrix V is not orthogonal.\nV:\n" +
                        str(V))

    # Calculate dimensionalities m
    if m is None and P is None:
        m = [int(data_dimensionality / subspaces)] * subspaces
        if data_dimensionality % subspaces != 0:
            choices = random_state.choice(range(subspaces),
                                          data_dimensionality - sum(m))
            for choice in choices:
                m[choice] += 1
    # If m is None but P is defined, use P's dimensionality
    elif m is None:
        m = [len(x) for x in P]
    if not isinstance(m, list) or len(m) != subspaces:
        raise ValueError(
            "A dimensionality list m must be specified for each subspace."
            "\nYour input:\n" + str(m))

    # Calculate projections P
    if P is None:
        possible_projections = list(range(data_dimensionality))
        P = []
        for dimensionality in m:
            choices = random_state.choice(possible_projections,
                                          dimensionality, replace=False)
            P.append(choices)
            possible_projections = list(
                set(possible_projections) - set(choices))
    if not isinstance(P, list) or len(P) != subspaces:
        raise ValueError(
            "Projection lists must be specified for each subspace."
            "\nYour input:\n" + str(P))
    else:
        # Check if the length of entries in P matches values of m
        used_dimensionalities = []
        for i, dimensionality in enumerate(m):
            used_dimensionalities.extend(P[i])
            if not len(P[i]) == dimensionality:
                raise ValueError(
                    "Values for dimensionality m and length of projection "
                    "list P do not match.\nDimensionality m:\n" +
                    str(dimensionality) + "\nDimensionality P:\n" +
                    str(P[i]))
        # Check if every dimension is considered in P
        if sorted(used_dimensionalities) != list(range(data_dimensionality)):
            raise ValueError(
                "Projections P must include all dimensionalities."
                "\nYour used dimensionalities:\n" +
                str(used_dimensionalities))

    # Define initial cluster centers with kmeans++ for each subspace
    if centers is None:
        centers = []
        for i in range(subspaces):
            k = n_clusters[i]
            if k > 1:
                P_subspace = P[i]
                cropped_X = np.matmul(X, V[:, P_subspace])
                centers_cropped = kpp(cropped_X, k,
                                      row_norms(cropped_X, squared=True),
                                      random_state)
                labels, _ = pairwise_distances_argmin_min(
                    X=cropped_X, Y=centers_cropped, metric='euclidean',
                    metric_kwargs={'squared': True})
                centers_sub = np.zeros((k, X.shape[1]))
                # Update cluster parameters
                for center_id, _ in enumerate(centers_sub):
                    # Get points in this cluster
                    points_in_cluster = np.where(labels == center_id)[0]
                    # Update center
                    centers_sub[center_id] = np.average(
                        X[points_in_cluster], axis=0)
                centers.append(centers_sub)
            else:
                centers.append(np.expand_dims(np.average(X, axis=0), 0))
    if not isinstance(centers, list) or len(centers) != subspaces:
        raise ValueError(
            "Cluster centers must be specified for each subspace."
            "\nYour input:\n" + str(centers))
    else:
        # Check if number of centers for subspaces matches value in
        # n_clusters
        for i, subspace_centers in enumerate(centers):
            if not n_clusters[i] == len(subspace_centers):
                raise ValueError(
                    "Values for number of clusters n_clusters and number "
                    "of centers do not match.\nNumber of clusters:\n" +
                    str(n_clusters[i]) + "\nNumber of centers:\n" +
                    str(len(subspace_centers)))

    # Check max_iter
    if max_iter is None or type(max_iter) is not int or max_iter <= 0:
        raise ValueError(
            "Max_iter must be an integer larger than 0. Your Max_iter:\n" +
            str(max_iter))

    # Initial labels and scatter matrices
    labels = [None] * subspaces
    scatter_matrices = [None] * subspaces
    return V, m, P, centers, random_state, subspaces, labels, \
        scatter_matrices
def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
                   regularization=None, copy_cov=True, init=None,
                   max_iter=1000):
    """Generic sparse coding.

    Each column of the result is the solution to a Lasso problem.

    Parameters
    ----------
    X: array of shape (n_samples, n_features)
        Data matrix.

    dictionary: array of shape (n_components, n_features)
        The dictionary matrix against which to solve the sparse coding of
        the data. Some of the algorithms assume normalized rows.

    gram: None | array, shape=(n_components, n_components)
        Precomputed Gram matrix, dictionary * dictionary'
        gram can be None if method is 'threshold'.

    cov: array, shape=(n_components, n_samples)
        Precomputed covariance, dictionary * X'

    algorithm: {'lasso_admm', 'lasso_lars', 'lasso_cd', 'lars', 'omp',
        'threshold'}
        lars: uses the least angle regression method
        (linear_model.lars_path)
        lasso_lars: uses Lars to compute the Lasso solution
        lasso_cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). lasso_lars will be faster
        if the estimated components are sparse.
        omp: uses orthogonal matching pursuit to estimate the sparse
        solution
        threshold: squashes to zero all coefficients less than
        regularization from the projection dictionary * data'

    regularization : int | float
        The regularization parameter. It corresponds to alpha when
        algorithm is 'lasso_lars', 'lasso_cd' or 'threshold'.
        Otherwise it corresponds to n_nonzero_coefs.

    init: array of shape (n_samples, n_components)
        Initialization value of the sparse code. Only used if
        `algorithm='lasso_cd'`.

    max_iter: int, 1000 by default
        Maximum number of iterations to perform if `algorithm='lasso_cd'`.

    copy_cov: boolean, optional
        Whether to copy the precomputed covariance matrix; if False, it may
        be overwritten.

    Returns
    -------
    code: array of shape (n_samples, n_components)
        The sparse codes

    See also
    --------
    sklearn.linear_model.lars_path
    sklearn.linear_model.orthogonal_mp
    sklearn.linear_model.Lasso
    SparseCoder
    """
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_samples, n_features = X.shape
    if cov is None and algorithm != 'lasso_cd':
        # overwriting cov is safe
        copy_cov = False
        cov = np.dot(dictionary, X.T)

    if algorithm == 'lasso_admm':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')
            code, dictionary = lasso_admm(X.T, dictionary.T, gamma=alpha,
                                          gram=gram, cov=cov,
                                          max_iter=max_iter)
            new_code = code.T
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_lars':
        alpha = float(regularization) / n_features  # account for scaling
        try:
            err_mgt = np.seterr(all='ignore')
            lasso_lars = LassoLars(alpha=alpha, fit_intercept=False,
                                   verbose=False, normalize=False,
                                   precompute=gram, fit_path=False)
            lasso_lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lasso_lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'lasso_cd':
        alpha = float(regularization) / n_features  # account for scaling
        clf = Lasso(alpha=alpha, fit_intercept=False, precompute=gram,
                    max_iter=max_iter, warm_start=True)
        clf.coef_ = init
        clf.fit(dictionary.T, X.T)
        new_code = clf.coef_

    elif algorithm == 'lars':
        try:
            err_mgt = np.seterr(all='ignore')
            lars = Lars(fit_intercept=False, verbose=False, normalize=False,
                        precompute=gram, n_nonzero_coefs=int(regularization),
                        fit_path=False)
            lars.fit(dictionary.T, X.T, Xy=cov)
            new_code = lars.coef_
        finally:
            np.seterr(**err_mgt)

    elif algorithm == 'threshold':
        new_code = ((np.sign(cov) *
                     np.maximum(np.abs(cov) - regularization, 0)).T)

    elif algorithm == 'omp':
        new_code = orthogonal_mp_gram(gram, cov, regularization, None,
                                      row_norms(X, squared=True),
                                      copy_Xy=copy_cov).T
    else:
        raise ValueError('Sparse coding method must be "lasso_admm", '
                         '"lasso_lars", "lasso_cd", "lars", "threshold" '
                         'or "omp", got %s.' % algorithm)
    return new_code
def k_means_gpu_sparsity(weight_vector, n_clusters, ratio=0.5, verbosity=0,
                         seed=int(time.time()), gpu_id=0):
    # print(n_clusters)
    if ratio == 0:
        return k_means_gpu(weight_vector=weight_vector,
                           n_clusters=n_clusters, verbosity=verbosity,
                           seed=seed, gpu_id=gpu_id)
    if ratio == 1:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        else:
            # mean_sample = np.mean(weight_vector, axis=0)
            weight_vector_1_mean = np.mean(weight_vector, axis=0)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean
            return weight_vector_compress
    else:
        if n_clusters == 1:
            mean_sample = np.mean(weight_vector, axis=0)
            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))
            return weight_vector
        elif weight_vector.shape[0] == n_clusters:
            return weight_vector
        elif weight_vector.shape[1] == 1:
            return k_means_sparsity(weight_vector, n_clusters, ratio,
                                    seed=seed)
        else:
            # print('n_clusters', n_clusters)
            # print('weight_vector.shape', weight_vector.shape)
            # print('kmeans++ init start')
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)
            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            # weight_vector_1 = weight_vector[min_index, :]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)
            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples)
                if i not in center_cluster_index
            ])
            weight_vector_train = weight_vector[remaining_cluster_index, :]
            # weight_vector_train = [element for i, element in
            #                        enumerate(weight_vector)
            #                        if i not in min_index]
            # weight_vector = np.tile(mean_sample,
            #                         (weight_vector.shape[0], 1))
            init_centers = sklearn.cluster.k_means_._k_init(
                X=weight_vector_train, n_clusters=n_clusters - 1,
                x_squared_norms=row_norms(weight_vector_train, squared=True),
                random_state=RandomState(seed))
            # print('kmeans++ init finished')
            # print('init_centers.shape', init_centers.shape)
            centers, labels = kmeans_cuda(samples=weight_vector_train,
                                          clusters=n_clusters - 1,
                                          init=init_centers, yinyang_t=0,
                                          seed=seed, device=gpu_id,
                                          verbosity=verbosity)
            # print(np.unique(labels, axis=0).shape[0] + 1)
            # centers, labels = kmeans_cuda(samples=weight_vector,
            #     clusters=n_clusters, init="k-means++", yinyang_t=0,
            #     seed=seed, device=gpu_id, verbosity=verbosity)
            # centers, labels = kmeans_cuda(samples=weight_vector,
            #     clusters=n_clusters, init="random", yinyang_t=0,
            #     seed=seed, device=gpu_id, verbosity=verbosity)
            # centers, labels = kmeans_cuda(samples=weight_vector,
            #     clusters=n_clusters, init="afk-mc2", yinyang_t=0,
            #     seed=seed, device=gpu_id, verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean
            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            # weight_compress = np.reshape(weight_vector_compress,
            #     (filters_num, filters_channel, filters_size, filters_size))
            # print(np.unique(weight_vector_compress, axis=0).shape[0])
            # print(n_clusters, '\n')
            # assert np.unique(weight_vector_compress, axis=0).shape[0] == \
            #     n_clusters, "cluster number mismatch"
            return weight_vector_compress
def fit(self, X):
    rng = np.random.RandomState(self.random_state)
    new_cluster_centers = np.zeros((self.n_clusters, X.shape[1]))
    n_samples_arrays = np.arange(X.shape[0])
    if self.return_cost_per_iteration:
        self.cost_array_ = np.zeros(self.max_iter)

    if self.n_clusters > 20:
        raise ValueError("Group clustering not supported yet")

    if self.init == "random":
        old_cluster_centers_ = X[rng.randint(0, X.shape[0],
                                             self.n_clusters), :]
    else:
        raise ValueError("wait till we support other initializations.")

    # Run K-Means for the first time.
    # Don't do cluster.KMeans().fit(X) because of input_validation etc.
    dot_product = 2 * np.dot(X, old_cluster_centers_.T)
    cluster_norms = row_norms(old_cluster_centers_,
                              squared=True).reshape(1, -1)
    self.distances_ = (row_norms(X, squared=True).reshape(-1, 1) -
                       dot_product + cluster_norms)

    # Remove the closest and the second closest cluster.
    upper_and_lower_bounds = np.argpartition(self.distances_, 1, axis=1)
    self.labels_ = upper_and_lower_bounds[:, 0]
    self.almost_labels_ = upper_and_lower_bounds[:, 1]
    self.upper_and_lower_bounds_ = self.distances_[
        n_samples_arrays.reshape(-1, 1), upper_and_lower_bounds]

    # Update cluster centers
    for i in range(self.n_clusters):
        new_cluster_centers[i] = np.mean(X[self.labels_ == i], axis=0)
    self.cluster_centers_ = new_cluster_centers

    for n_iter in range(self.max_iter):
        if self.return_cost_per_iteration:
            self.cost_array_[n_iter] = _calculate_cost(
                X, self.labels_, self.cluster_centers_)

        # Calculate how much each center has drifted.
        drift = ((old_cluster_centers_ -
                  self.cluster_centers_)**2).sum(axis=1)
        if np.sum(drift) < self.tol:
            break
        old_cluster_centers_ = np.copy(self.cluster_centers_)

        # Add the drift to the upper bounds and subtract the drift from
        # the lower bounds.
        for i in range(self.n_clusters):
            mask = self.labels_ == i
            self.upper_and_lower_bounds_[:, 0][mask] += drift[i]
            self.upper_and_lower_bounds_[:, 1][mask] -= drift[i]

        # If the previously second_largest_bound is now lesser than the
        # largest bound, set the upper bound to the distance between the
        # largest_bound. This is based on
        # d(old_center, new_center) + d(old_center, X) > d(X, new_center)
        mask_changed_bounds = (self.upper_and_lower_bounds_[:, 1] <
                               self.upper_and_lower_bounds_[:, 0])

        # XXX: Vectorize?
        for i in range(self.n_clusters):
            cluster = self.cluster_centers_[i]
            new_mask = np.logical_and(mask_changed_bounds,
                                      self.labels_ == i)
            distances = np.sum((X[new_mask] - cluster)**2, axis=1)
            self.upper_and_lower_bounds_[:, 0][new_mask] = distances

        # Now we can be sure that the second closest center is actually
        # the closest. Reassign the labels.
        mask_changed_bounds = (self.upper_and_lower_bounds_[:, 1] <
                               self.upper_and_lower_bounds_[:, 0])
        tmp = self.labels_[mask_changed_bounds]
        self.labels_[mask_changed_bounds] = \
            self.almost_labels_[mask_changed_bounds]
        self.almost_labels_[mask_changed_bounds] = tmp
        self.upper_and_lower_bounds_[:, 1][mask_changed_bounds] = \
            self.upper_and_lower_bounds_[:, 0][mask_changed_bounds]

        # XXX: Vectorize?
        for i in range(self.n_clusters):
            cluster = self.cluster_centers_[i]
            new_mask = np.logical_and(mask_changed_bounds,
                                      self.labels_ == i)
            distances = np.sum((X[new_mask] - cluster)**2, axis=1)
            self.upper_and_lower_bounds_[:, 0][new_mask] = distances

        # TODO: Optimize this step.
        for i in range(self.n_clusters):
            mask = self.labels_ == i
            self.cluster_centers_[i] = np.mean(X[mask], axis=0)

    self.n_iter_ = n_iter
def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
                      max_iter=300, verbose=False, tol=1e-4,
                      random_state=None, copy_x=True, n_jobs=1,
                      algorithm="auto", return_n_iter=False):
    """Modified from sklearn.cluster.k_means_.k_means.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.inf
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one
        # set of the best results (as opposed to one set per run per
        # thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = \
                _spherical_kmeans_single_lloyd(
                    X, n_clusters, max_iter=max_iter, init=init,
                    verbose=verbose, tol=tol,
                    x_squared_norms=x_squared_norms,
                    random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(
                X, n_clusters, max_iter=max_iter, init=init,
                verbose=verbose, tol=tol,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed)
            for seed in seeds)

        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
def mbkmean(self, options, n_clusters, n_init, batch_size, n_iter,
            n_samples, labels_true, k_means, X):
    # to do with online MBK_mean
    # Compute clustering with MiniBatchKMeans
    mbk = cluster.MiniBatchKMeans(init=self.init, n_clusters=n_clusters,
                                  batch_size=batch_size, n_init=10,
                                  max_no_improvement=n_iter, verbose=0)

    # INIT THREADS
    try:
        if options[2] == '-pp' or options[3] == '-pp':
            thread_1 = afficheur('starting threads', labels_true, mbk,
                                 k_means, X, n_clusters)
            thread_1.start()
    except IndexError:
        pass

    try:
        if options[2] == '-s':
            # init state
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            random_state = check_random_state(None)
            init_size = 3 * batch_size
            if init_size > n_samples:
                init_size = n_samples
            validation_indices = random_state.randint(0, n_samples,
                                                      init_size)
            X_valid = X[validation_indices]
            x_squared_norms = row_norms(X, squared=True)
            x_squared_norms_valid = x_squared_norms[validation_indices]
            counts = np.zeros(n_clusters, dtype=np.int32)
            best_inertia = None
            cluster_centers = None
            for init_idx in range(n_init):
                cluster_centers = cluster._init_centroids(
                    X, n_clusters, self.init, random_state=random_state,
                    x_squared_norms=x_squared_norms, init_size=init_size)
                batch_inertia, centers_squared_diff = \
                    cluster._mini_batch_step(
                        X_valid, x_squared_norms[validation_indices],
                        cluster_centers, counts, old_center_buffer, False,
                        distances=None, verbose=False)
                _, inertia = cluster._labels_inertia(
                    X_valid, x_squared_norms_valid, cluster_centers)
                if best_inertia is None or inertia < best_inertia:
                    mbk.cluster_centers_ = cluster_centers
                    mbk.counts_ = counts
                    best_inertia = inertia
            print('best inertia %d' % best_inertia)

            while True:
                thread_1 = afficheur('starting threads', labels_true, mbk,
                                     k_means, X, n_clusters)
                thread_1.start()
                t0 = time.time()
                for iteration_idx in range(n_iter):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])
                    thread_1.update(mbk)
                t_mini_batch = time.time() - t0
                thread_1.stop()
                thread_1.join()
                n_iter = self.input_num("Next number of iterations: ")
                if n_iter == "stop":
                    return mbk, t_mini_batch
                if not isinstance(n_iter, int):
                    print('error: an integer is required, got type %s'
                          % type(n_iter))
                    break
    except IndexError:
        pass

    try:
        if options[2] == '-pp':
            random_state = check_random_state(None)
            t0 = time.time()
            # Sample a minibatch from the full dataset
            for iteration_idx in range(n_iter - 1):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
                thread_1.update(mbk)
            t_mini_batch = time.time() - t0
            thread_1.stop()
            thread_1.join()
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-p':
            random_state = check_random_state(None)
            t0 = time.time()
            for iteration_idx in range(n_iter):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-n':
            t0 = time.time()
            mbk = mbk.fit(X)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] is None:
            random_state = check_random_state(None)
            # Sample a minibatch from the full dataset
            t0 = time.time()
            for iteration_idx in range(n_iter - 1):
                minibatch_indices = random_state.randint(
                    0, n_samples, self.batch_size)
                mbk = mbk.partial_fit(X, minibatch_indices=minibatch_indices)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-o':
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            n_iter = int(max_iter * n_batches)
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            try:
                # print('self.max_iter %d , n_batches %d'
                #       % (n_iter, n_batches))
                if options[3] == '-pp':
                    # init state
                    random_state = check_random_state(None)
                    init_size = 3 * batch_size
                    if init_size > n_samples:
                        init_size = n_samples
                    validation_indices = random_state.randint(
                        0, n_samples, init_size)
                    X_valid = X[validation_indices]
                    x_squared_norms = row_norms(X, squared=True)
                    x_squared_norms_valid = x_squared_norms[
                        validation_indices]
                    counts = np.zeros(n_clusters, dtype=np.int32)
                    best_inertia = None
                    cluster_centers = None
                    # Random init with minimum inertia
                    for init_idx in range(n_init):
                        cluster_centers = cluster._init_centroids(
                            X, n_clusters, self.init,
                            random_state=random_state,
                            x_squared_norms=x_squared_norms,
                            init_size=init_size)
                        batch_inertia, centers_squared_diff = \
                            cluster._mini_batch_step(
                                X_valid,
                                x_squared_norms[validation_indices],
                                cluster_centers, counts, old_center_buffer,
                                False, distances=None, verbose=False)
                        _, inertia = cluster._labels_inertia(
                            X_valid, x_squared_norms_valid, cluster_centers)
                        if best_inertia is None or inertia < best_inertia:
                            mbk.cluster_centers_ = cluster_centers
                            mbk.counts_ = counts
                            best_inertia = inertia
                    print('best inertia %d' % best_inertia)

                    convergence_context = {}
                    mbk.batch_inertia = batch_inertia
                    mbk.centers_squared_diff = centers_squared_diff
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        tol = self._tolerance(X, tol)
                        thread_1.update(mbk)
                        # Monitor convergence and do early stopping if
                        # necessary
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=mbk.verbose):
                            t_mini_batch = time.time() - t0
                            thread_1.stop()
                            thread_1.join()
                            return mbk, t_mini_batch
                elif options[3] == '-p':
                    random_state = check_random_state(None)
                    convergence_context = {}
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        tol = self._tolerance(X, tol)
                        # Monitor convergence and do early stopping if
                        # necessary
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=False):
                            t_mini_batch = time.time() - t0
                            return mbk, t_mini_batch
            except IndexError:
                pass
    except IndexError:
        pass
def __init__(self, K, X, weights):
    self.K = K
    self.x_squared_norms = row_norms(X, squared=True)
    self.X = X
    self.weights = weights
def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, n_jobs=1, return_n_iter=False): """K-Means clustering with minimum and maximum cluster size constraints. Read more in the :ref:`User Guide <k_means>`. Parameters ---------- X : array-like or sparse matrix, shape (n_samples, n_features) The observations to cluster. size_min : int, optional, default: None Constrain the label assignment so that each cluster has a minimum size of size_min. If None, no constrains will be applied size_max : int, optional, default: None Constrain the label assignment so that each cluster has a maximum size of size_max. If None, no constrains will be applied n_clusters : int The number of clusters to form as well as the number of centroids to generate. init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. 'random': generate k centroids from a Gaussian with mean and variance estimated from the data. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. If a callable is passed, it should take arguments X, k and and a random state and return an initialization. n_init : int, optional, default: 10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. max_iter : int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. verbose : boolean, optional Verbosity mode. tol : float, optional The relative increment in the results before declaring convergence. random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. copy_x : boolean, optional When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True, then the original data is not modified. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. return_n_iter : bool, optional Whether or not to return the number of iterations. Returns ------- centroid : float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. label : integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. inertia : float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). best_n_iter : int Number of iterations corresponding to the best results. Returned only if `return_n_iter` is set to True. """ if n_init <= 0: raise ValueError("Invalid number of initializations." 
" n_init=%d must be bigger than zero." % n_init) random_state = check_random_state(random_state) if max_iter <= 0: raise ValueError('Number of iterations should be a positive number,' ' got %d instead' % max_iter) X = as_float_array(X, copy=copy_x) tol = _tolerance(X, tol) # Validate init array if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype.type, copy=True) _validate_center_shape(X, n_clusters, init) if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # subtract of mean of x for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None if n_jobs == 1: # For a single thread, less memory is needed if we just store one set # of the best results (as opposed to one set per run per thread). for it in range(n_init): # run a k-means once labels, inertia, centers, n_iter_ = kmeans_constrained_single( X, n_clusters, size_min=size_min, size_max=size_max, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, random_state=random_state) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(kmeans_constrained_single)( X, n_clusters, size_min=size_min, size_max=size_max, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if not sp.issparse(X): if not copy_x: X += X_mean best_centers += X_mean if return_n_iter: return best_centers, best_labels, best_inertia, best_n_iter else: return best_centers, best_labels, best_inertia
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, n_jobs=1, return_n_iter=False, sample_weight=None): """K-means clustering algorithm. Read more in the :ref:`User Guide <k_means>`. Parameters ---------- X : array-like or sparse matrix, shape (n_samples, n_features) The observations to cluster. n_clusters : int The number of clusters to form as well as the number of centroids to generate. max_iter : int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. n_init : int, optional, default: 10 Number of times the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, defaults to 'k-means++': 'k-means++' : selects initial cluster centers for k-means clustering in a smart way to speed up convergence. See section Notes in k_init for more details. 'random': generate k centroids from a Gaussian with mean and variance estimated from the data. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. If a callable is passed, it should take arguments X, k and a random state and return an initialization. precompute_distances : {'auto', True, False} Precompute distances (faster but takes more memory). 'auto' : do not precompute distances if n_samples * n_clusters > 12 million. This corresponds to about 100MB overhead per job using double precision. True : always precompute distances False : never precompute distances tol : float, optional The relative increment in the results before declaring convergence. verbose : boolean, optional Verbosity mode. random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. copy_x : boolean, optional When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True, then the original data is not modified. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. return_n_iter : bool, optional Whether or not to return the number of iterations. Returns ------- centroid : float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. label : integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. inertia : float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). best_n_iter : int Number of iterations corresponding to the best results. Returned only if `return_n_iter` is set to True. """ if n_init <= 0: raise ValueError("Invalid number of initializations." " n_init=%d must be bigger than zero."
% n_init) random_state = check_random_state(random_state) if max_iter <= 0: raise ValueError('Number of iterations should be a positive number,' ' got %d instead' % max_iter) best_inertia = np.infty X = as_float_array(X, copy=copy_x) tol = _tolerance(X, tol) # If the distances are precomputed every job will create a matrix of shape # (n_clusters, n_samples). To stop KMeans from eating up memory we only # activate this if the created matrix is guaranteed to be under 100MB. 12 # million entries consume a little under 100MB if they are of type double. if precompute_distances == 'auto': n_samples = X.shape[0] precompute_distances = (n_clusters * n_samples) < 12e6 elif isinstance(precompute_distances, bool): pass else: raise ValueError("precompute_distances should be 'auto' or True/False" ", but a value of %r was passed" % precompute_distances) # subtract the mean of X for more accurate distance computations if not sp.issparse(X) or hasattr(init, '__array__'): X_mean = X.mean(axis=0) if not sp.issparse(X): # The copy was already done above X -= X_mean if hasattr(init, '__array__'): init = check_array(init, dtype=np.float64, copy=True) _validate_center_shape(X, n_clusters, init) init -= X_mean if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None if n_jobs == 1: # For a single thread, less memory is needed if we just store one set # of the best results (as opposed to one set per run per thread). for it in range(n_init): # run a k-means once labels, inertia, centers, n_iter_ = _kmeans_single( X, n_clusters, max_iter=max_iter, init=init, verbose=verbose, precompute_distances=precompute_distances, tol=tol, x_squared_norms=x_squared_norms, random_state=random_state, sample_weight=sample_weight) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(_kmeans_single)( X, n_clusters, max_iter=max_iter, init=init, verbose=verbose, tol=tol, precompute_distances=precompute_distances, x_squared_norms=x_squared_norms, sample_weight=sample_weight, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if not sp.issparse(X): if not copy_x: X += X_mean best_centers += X_mean if return_n_iter: return best_centers, best_labels, best_inertia, best_n_iter else: return best_centers, best_labels, best_inertia
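Both k-means drivers above precompute x_squared_norms because the labeling step expands the squared Euclidean distance as ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 and reuses the cached ||x||^2 term across iterations. A minimal sketch of that identity:

import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.randn(6, 4)   # samples
C = rng.randn(3, 4)   # current centers

x_sq = row_norms(X, squared=True)   # cached once per dataset
c_sq = row_norms(C, squared=True)   # recomputed after each centers update

# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, vectorized over all pairs
d2 = x_sq[:, np.newaxis] - 2.0 * X.dot(C.T) + c_sq[np.newaxis, :]

brute = ((X[:, np.newaxis, :] - C[np.newaxis, :, :]) ** 2).sum(axis=-1)
assert np.allclose(d2, brute)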
def setup(self): self.X = _china_dataset() self.n_clusters = 64 self.x_squared_norms = row_norms(self.X, squared=True)
def _estimate_log_gaussian_prob(X, means, precisions_chol, covariance_type): """Estimate the log Gaussian probability. Parameters ---------- X : array-like, shape (n_samples, n_features) means : array-like, shape (n_components, n_features) precisions_chol : array-like Cholesky decompositions of the precision matrices. 'full' : shape of (n_components, n_features, n_features) 'tied' : shape of (n_features, n_features) 'diag' : shape of (n_components, n_features) 'spherical' : shape of (n_components,) covariance_type : {'full', 'tied', 'diag', 'spherical'} Returns ------- log_prob : array, shape (n_samples, n_components) """ n_samples, n_features = X.shape n_components, _ = means.shape # det(precision_chol) is half of det(precision) log_det = _compute_log_det_cholesky( precisions_chol, covariance_type, n_features) if covariance_type == 'full': log_prob = np.empty((n_samples, n_components)) for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)): # GPU version of: y = np.dot(X, prec_chol) - np.dot(mu, prec_chol) y = torch.mm(torch.Tensor(X).cuda(), torch.Tensor(prec_chol).cuda()) - \ torch.mv(torch.Tensor(prec_chol).t().cuda(), torch.Tensor(mu).cuda()) # GPU version of: log_prob[:, k] = np.sum(np.square(y), axis=1) log_prob[:, k] = torch.sum(y**2, dim=1).cpu().numpy() elif covariance_type == 'tied': log_prob = np.empty((n_samples, n_components)) for k, mu in enumerate(means): y = np.dot(X, precisions_chol) - np.dot(mu, precisions_chol) log_prob[:, k] = np.sum(np.square(y), axis=1) elif covariance_type == 'diag': precisions = precisions_chol ** 2 log_prob = (np.sum((means ** 2 * precisions), 1) - 2. * np.dot(X, (means * precisions).T) + np.dot(X ** 2, precisions.T)) elif covariance_type == 'spherical': precisions = precisions_chol ** 2 log_prob = (np.sum(means ** 2, 1) * precisions - 2 * np.dot(X, means.T * precisions) + np.outer(row_norms(X, squared=True), precisions)) return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
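A small cross-check of the 'full'-covariance formula used above against scipy.stats; mu, cov, and the factorization route are stand-ins rather than values from the snippet, and the precision factor is built here via a direct inverse (sklearn derives an upper factor from cholesky(cov) instead, but the algebra is the same):

import numpy as np
from scipy.stats import multivariate_normal

rng = np.random.RandomState(0)
n_features = 3
X = rng.randn(5, n_features)
mu = rng.randn(n_features)
cov = np.eye(n_features) * 0.5 + 0.1          # any SPD covariance

# Lower-triangular Cholesky factor of the precision matrix, so that
# precision = prec_chol @ prec_chol.T.
prec_chol = np.linalg.cholesky(np.linalg.inv(cov))

# Same two dot products as in the snippet's per-component loop.
y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
log_det = np.sum(np.log(np.diag(prec_chol)))
log_prob = -0.5 * (n_features * np.log(2 * np.pi)
                   + np.sum(np.square(y), axis=1)) + log_det

assert np.allclose(log_prob, multivariate_normal(mu, cov).logpdf(X))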
def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) centers_old = centers + rng.normal(size=centers.shape) centers_old_csr = centers_old.copy() centers_new = np.zeros_like(centers_old) centers_new_csr = np.zeros_like(centers_old_csr) weight_sums = np.zeros(centers_old.shape[0], dtype=X.dtype) weight_sums_csr = np.zeros(centers_old.shape[0], dtype=X.dtype) x_squared_norms = (X**2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) sample_weight = np.ones(X.shape[0], dtype=X.dtype) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] sample_weight_mb = sample_weight[:10] # step 1: compute the dense minibatch update old_inertia = _mini_batch_step( X_mb, x_mb_squared_norms, sample_weight_mb, centers_old, centers_new, weight_sums, np.random.RandomState(0), random_reassign=False, ) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia( X_mb, sample_weight_mb, x_mb_squared_norms, centers_new ) assert new_inertia > 0.0 assert new_inertia < old_inertia # step 2: compute the sparse minibatch update old_inertia_csr = _mini_batch_step( X_mb_csr, x_mb_squared_norms_csr, sample_weight_mb, centers_old_csr, centers_new_csr, weight_sums_csr, np.random.RandomState(0), random_reassign=False, ) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, centers_new_csr ) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_allclose(centers_new, centers_new_csr) assert_allclose(old_inertia, old_inertia_csr) assert_allclose(new_inertia, new_inertia_csr)
def _daal4py_check(self, X, y_, check_input): # convert to 2d format X = make2d(X) y = make2d(y_) # convert from list type if isinstance(X, list): X = np.asarray(X, np.float64) if isinstance(y, list): y = np.asarray(y, np.float64) _fptype = getFPType(X) # check alpha if self.alpha == 0: warnings.warn( "With alpha=0, this algorithm does not converge " "well. You are advised to use the LinearRegression " "estimator", stacklevel=2) # check precompute if isinstance(self.precompute, np.ndarray): if check_input: check_array(self.precompute, dtype=_fptype) self.precompute = make2d(self.precompute) # only for compliance with sklearn if self.fit_intercept: X_offset = np.average(X, axis=0, weights=None) if self.normalize: X_scale = row_norms(X) if np.isscalar(X_scale): if X_scale == 0.0: X_scale = 1.0 elif isinstance(X_scale, np.ndarray): X_scale[X_scale == 0.0] = 1.0 else: X_scale = np.ones(X.shape[1], dtype=_fptype) else: X_offset = np.zeros(X.shape[1], dtype=_fptype) X_scale = np.ones(X.shape[1], dtype=_fptype) if (self.fit_intercept and not np.allclose(X_offset, np.zeros(X.shape[1])) or self.normalize and not np.allclose(X_scale, np.ones(X.shape[1]))): warnings.warn( "Gram matrix was provided but X was centered" " to fit intercept, " "or X was normalized : recomputing Gram matrix.", UserWarning) else: if self.precompute not in [False, True, 'auto']: raise ValueError("precompute should be one of True, False, " "'auto' or array-like. Got %r" % self.precompute) # check X and y if check_input: X, y = check_X_y(X, y, dtype=[np.float64, np.float32], multi_output=True, y_numeric=True) else: # only for compliance with sklearn; this check is not required by DAAL if not X.flags['F_CONTIGUOUS']: raise ValueError("ndarray is not Fortran contiguous") # check selection if self.selection not in ['random', 'cyclic']: raise ValueError("selection should be either random or cyclic.") return X, y
def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) old_centers = centers + rng.normal(size=centers.shape) new_centers = old_centers.copy() new_centers_csr = old_centers.copy() counts = np.zeros(new_centers.shape[0], dtype=np.int32) counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32) x_squared_norms = (X**2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) buffer = np.zeros(centers.shape[1], dtype=np.double) buffer_csr = np.zeros(centers.shape[1], dtype=np.double) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step(X_mb, x_mb_squared_norms, new_centers, counts, buffer, 1, None, random_reassign=False) assert_greater(old_inertia, 0.0) # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms, new_centers) assert_greater(new_inertia, 0.0) assert_less(new_inertia, old_inertia) # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers - old_centers)**2) assert_almost_equal(incremental_diff, effective_diff) # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr, buffer_csr, 1, None, random_reassign=False) assert_greater(old_inertia_csr, 0.0) # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr, x_mb_squared_norms_csr, new_centers_csr) assert_greater(new_inertia_csr, 0.0) assert_less(new_inertia_csr, old_inertia_csr) # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers_csr - old_centers)**2) assert_almost_equal(incremental_diff_csr, effective_diff) # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_array_almost_equal(new_centers, new_centers_csr) assert_almost_equal(incremental_diff, incremental_diff_csr) assert_almost_equal(old_inertia, old_inertia_csr) assert_almost_equal(new_inertia, new_inertia_csr)
def get_data(dataname, verbose=True): assert dataname in data_bank, 'Dataset name not recognized!' # meta information about the data meta_dict = { 'isSparse': False, 'n_true_classes': None, 'xsn_train': None, 'xsn_test': None } train_lb = None if dataname == 'rcv1': print('Fetching RCV1 data from sklearn') train = fetch_rcv1(subset='test') train = train.data test = fetch_rcv1(subset='train') test = test.data meta_dict['n_true_classes'] = 103 elif dataname == 'mnist': print('Fetching MNIST data from sklearn') mnist = fetch_mldata('MNIST original') data_ind = list(range(mnist.data.shape[0])) random.shuffle(data_ind) train_ind = data_ind[:60000] test_ind = data_ind[-10000:] train = mnist.data[train_ind, :] test = mnist.data[test_ind, :] meta_dict['n_true_classes'] = 10 elif dataname == 'gauss': """ Synthetic Gaussian """ print('Generating Gaussian blobs datasets') #centers = [[2, 2], [-2, -2], [2, -2]] #centers = np.asarray(centers) n_s = 7000 gauss, _ = make_blobs(n_samples=n_s, n_features=10, centers=50, center_box=(-30, 30), cluster_std=10.0) data_ind = list(range(gauss.shape[0])) random.shuffle(data_ind) train_ind = data_ind[:6 * n_s // 7] test_ind = data_ind[-n_s // 7:] train = gauss[train_ind, :] test = gauss[test_ind, :] meta_dict['n_true_classes'] = 50 elif dataname == 'covtype': """ Forest covertype """ print('Fetching forest covertype datasets') cov = fetch_covtype() train = cov.data[:500000] train_lb = cov.target[:500000] test = cov.data[500000:] test_lb = cov.target[500000:] meta_dict['n_true_classes'] = 7 elif dataname == 'cifar10_raw': """ This will make over 4 million data points for training, each with dim 8*8*3 """ # take user input to know which batch to preprocess usr_input = raw_input('Which batch to preprocess: ') # get data home data_home = '/Users/tangch/scikit_learn_data/cifar10' # make this portable in the future os.chdir(data_home) prefix = 'data_batch_' # combine training batches if usr_input != 'test': curr_batch = prefix + str(usr_input) fname = curr_batch dataname = dataname + 'batch_' + str(usr_input) else: fname = os.path.join(data_home, 'test_batch') dataname = dataname + 'test_batch' with open(fname, 'rb') as f: batch = pickle.load(f) train = batch['data'] train_lb = batch['labels'] print('processing batch %s' % fname) print('%d by %d training data with %d labels' % (train.shape[0], train.shape[1], len(train_lb))) os.chdir( '/Users/tangch/Documents/Python_projects/myprojects/mbkm_2016/mbkm' ) # Reduce the dimension of data by random sampling train = train.reshape(train.shape[0], 32, 32, 3) width = 8 height = width # subsampling patches u.a.r. train, train_lb = convsubsample(train, 1, width, height, labels=train_lb) # shuffle training data and labels with one permutation so they stay aligned print('shuffling dataset randomly') shuffle_ind = np.random.permutation(train.shape[0]) train = np.asarray(train)[shuffle_ind] train_lb = np.asarray(train_lb)[shuffle_ind] test = None # get test data/labels #fname = data_home + 'test_batch' #with open(fname,'rb') as f: # dict = pickle.load(f) #test = dict['data'] #test_lb = dict['labels'] # subsampling patches u.a.r.
#test, test_lb = convsubsample(test, 1, width,height, labels = test_lb) meta_dict['n_true_classes'] = 10 print('finished preprocessing current batch') elif dataname == 'cifar10_norm': """ Only works if we have cifar10_raw """ usr_input = raw_input('Which batch to preprocess: ') ### load cifar10_raw fname = 'cifar10_raw' if usr_input != 'test': fname = fname + 'batch_' + str(usr_input) dataname = dataname + 'batch_' + str(usr_input) else: fname = fname + 'test_batch' dataname = dataname + 'test_batch' try: with open(fname, 'rb') as f: train, test, meta = pickle.load(f) except Exception as e: print('Cannot open file ' + fname) ### Normalization train = normalize(train, norm='l2') test = normalize(test, norm='l2') ### meta info extraction meta_dict['n_true_classes'] = meta['n_true_classes'] train_lb = meta['train_lb'] test_lb = meta['test_lb'] elif dataname == 'cifar10_white_norm': """ Only works if we have cifar10_norm """ ### load cifar10_norm fname = 'cifar10_norm' try: with open(fname, 'rb') as f: train_old, test_old, meta = pickle.load(f) except Exception as e: print('Cannot find file ' + fname) train_test = np.vstack((train_old, test_old)) ### Whitening pca = RandomizedPCA(whiten=True) # use approximate PCA to save computation train_test = pca.fit_transform(train_test) train = train_test[:train_old.shape[0]] test = train_test[train_old.shape[0]:] ### Extract meta info meta_dict['n_true_classes'] = meta['n_true_classes'] train_lb = meta['train_lb'] test_lb = meta['test_lb'] else: print('nothing') meta_dict['dataname'] = dataname # add true labels if they exist if train_lb is not None: meta_dict['train_lb'] = train_lb #meta_dict['test_lb'] = test_lb # Check if data is sparse if sp.issparse(train): meta_dict['isSparse'] = True print('The %s data is sparse' % dataname) print('%d training data' % train.shape[0]) print('data dimension is %d' % train.shape[1]) if len(train.shape) == 3: print('data has %d channels' % train.shape[2]) if test is None: print('No test data') else: print('%d test data' % test.shape[0]) print('The number of true classes is %d' % meta_dict['n_true_classes']) # copy to a float array so later in-place operations do not mutate the input train = as_float_array(train, copy=True) if test is not None: test = as_float_array(test, copy=True) # precompute squared norms for faster computation x_squared_norms_tr = row_norms(train, squared=True) meta_dict['xsn_train'] = x_squared_norms_tr if test is not None: x_squared_norms_tt = row_norms(test, squared=True) meta_dict['xsn_test'] = x_squared_norms_tt return train, test, meta_dict
def constraint_kmeans( X, labels, sample_weight, centers, inertia, iter, max_iter, # pylint: disable=W0622 strategy='gain', verbose=0, state=None, learning_rate=1., history=False, fLOG=None): """ Completes the constraint :epkg:`k-means`. @param X features @param labels initialized labels (unused) @param sample_weight sample weight @param centers initialized centers @param inertia initialized inertia (unused) @param iter number of iterations already done @param max_iter maximum number of iterations @param strategy strategy used to sort observations before mapping them to clusters @param verbose verbose @param state random state @param learning_rate used by strategy `'weights'` @param history return list of centers across iterations @param fLOG logging function (needs to be specified otherwise verbose has no effects) @return tuple (best_labels, best_centers, best_inertia, None, iter, all_centers) """ if labels.dtype != numpy.int32: raise TypeError("Labels must be an array of int not '{0}'".format( labels.dtype)) if strategy == 'weights': return _constraint_kmeans_weights(X, labels, sample_weight, centers, inertia, iter, max_iter, verbose=verbose, state=state, learning_rate=learning_rate, history=history, fLOG=fLOG) else: if isinstance(X, DataFrame): X = X.values x_squared_norms = row_norms(X, squared=True) counters = numpy.empty((centers.shape[0], ), dtype=numpy.int32) limit = X.shape[0] // centers.shape[0] leftover = X.shape[0] - limit * centers.shape[0] leftclose = numpy.empty((centers.shape[0], ), dtype=numpy.int32) n_clusters = centers.shape[0] distances_close = numpy.empty((X.shape[0], ), dtype=X.dtype) best_inertia = None best_iter = None all_centers = [] # association _constraint_association(leftover, counters, labels, leftclose, distances_close, centers, X, x_squared_norms, limit, strategy, state=state) if sample_weight is None: sw = numpy.ones((X.shape[0], )) else: sw = sample_weight if scipy.sparse.issparse(X): _centers_fct = _centers_sparse else: _centers_fct = _centers_dense while iter < max_iter: # compute new clusters centers = _centers_fct(X, sw, labels, n_clusters, distances_close) if history: all_centers.append(centers) # association _constraint_association(leftover, counters, labels, leftclose, distances_close, centers, X, x_squared_norms, limit, strategy, state=state) # inertia _, inertia = _labels_inertia_skl(X=X, sample_weight=sw, x_squared_norms=x_squared_norms, centers=centers, distances=distances_close) iter += 1 if verbose and fLOG: fLOG("CKMeans %d/%d inertia=%f" % (iter, max_iter, inertia)) # best option so far? if best_inertia is None or inertia < best_inertia: best_inertia = inertia best_centers = centers.copy() best_labels = labels.copy() best_iter = iter # early stop if (best_inertia is not None and inertia >= best_inertia and iter > best_iter + 5): break return (best_labels, best_centers, best_inertia, None, iter, all_centers)
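The main loop above keeps the best inertia seen so far and breaks once it has not improved for more than five iterations. A self-contained sketch of that stopping rule (run_until_stalled is a made-up helper for illustration, not part of the module):

def run_until_stalled(inertias, patience=5):
    """Return the iteration at which the early-stop rule above would fire."""
    best_inertia, best_iter = None, None
    for it, inertia in enumerate(inertias, start=1):
        if best_inertia is None or inertia < best_inertia:
            best_inertia, best_iter = inertia, it
        elif it > best_iter + patience:
            return it
    return len(inertias)

# Improves until iteration 3, then plateaus: stops at iteration 9.
assert run_until_stalled([10, 8, 7, 7, 7, 7, 7, 7, 7, 7]) == 9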
def spherical_k_means(X, n_clusters, init='k-means++', n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, n_jobs=1, algorithm="auto", return_n_iter=False): """Modified from sklearn.cluster.k_means_.k_means. """ if n_init <= 0: raise ValueError("Invalid number of initializations." " n_init=%d must be bigger than zero." % n_init) random_state = check_random_state(random_state) if max_iter <= 0: raise ValueError('Number of iterations should be a positive number,' ' got %d instead' % max_iter) best_inertia = np.infty X = as_float_array(X, copy=copy_x) tol = _tolerance(X, tol) if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype.type, copy=True) _validate_center_shape(X, n_clusters, init) if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) if n_jobs == 1: # For a single thread, less memory is needed if we just store one set # of the best results (as opposed to one set per run per thread). for it in range(n_init): # run a k-means once labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd( X, n_clusters, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, random_state=random_state) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(_spherical_kmeans_single_lloyd)(X, n_clusters, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if return_n_iter: return best_centers, best_labels, best_inertia, best_n_iter else: return best_centers, best_labels, best_inertia
def calc_sampling_distribution(self): x_squared_norms = row_norms(self.X, squared=True) centers = _init_centroids(self.X, self.n_clusters, self.init, random_state=self.random_state, x_squared_norms=x_squared_norms) sens = sensitivity.kmeans_sensitivity(self.X, self.w, centers, max(np.log(self.n_clusters), 1)) self.p = sens / np.sum(sens)
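calc_sampling_distribution above normalizes per-point k-means sensitivities into a distribution p. A hedged sketch of how such a p is typically consumed to draw a weighted coreset; kmeans_sensitivity comes from that project, so a stand-in score is used here:

import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.randn(100, 2)

# Stand-in sensitivity score (the snippet above uses
# sensitivity.kmeans_sensitivity with k-means++ centers instead):
# points far from the data mean get higher sampling probability.
sens = row_norms(X - X.mean(axis=0), squared=True) + 1e-12
p = sens / np.sum(sens)

# Draw a coreset of m points; importance weights undo the sampling bias.
m = 20
idx = rng.choice(X.shape[0], size=m, replace=True, p=p)
coreset, weights = X[idx], 1.0 / (m * p[idx])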
def func(dat_matrix): x_squared_norms = row_norms(dat_matrix, squared=True) inertias = _labels_inertia(dat_matrix, x_squared_norms, km.cluster_centers_)[1] return inertias
def _k_init(X, n_clusters, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters ---------- X : array or sparse matrix, shape (n_samples, n_features) The data to pick seeds for. To avoid memory copy, the input data should be double precision (dtype=np.float64). n_clusters : integer The number of seeds to choose random_state : numpy.RandomState The generator used to initialize the centers. n_local_trials : integer, optional The number of seeding trials for each center (except the first), of which the one reducing inertia the most is greedily chosen. Set to None to make the number of trials depend logarithmically on the number of seeds (2+log(k)); this is the default. Notes ----- Selects initial cluster centers for k-means clustering in a smart way to speed up convergence. See: Arthur, D. and Vassilvitskii, S. "k-means++: the advantages of careful seeding". ACM-SIAM symposium on Discrete algorithms. 2007 Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, which is the implementation used in the aforementioned paper. """ n_samples, n_features = X.shape centers = np.empty((n_clusters, n_features), dtype=X.dtype) # Modified from the sklearn version: x_squared_norms is computed here # instead of being passed in as an argument. x_squared_norms = row_norms(X, squared=True) # Set the number of local seeding trials if none is given if n_local_trials is None: # This is what Arthur/Vassilvitskii tried, but did not report # specific results for other than mentioning in the conclusion # that it helped. n_local_trials = 2 + int(np.log(n_clusters)) # Pick first center randomly center_id = random_state.randint(n_samples) if sp.issparse(X): centers[0] = X[center_id].toarray() else: centers[0] = X[center_id] # Initialize list of closest distances and calculate current potential closest_dist_sq = euclidean_distances(centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, squared=True) current_pot = closest_dist_sq.sum() # Pick the remaining n_clusters-1 points for c in range(1, n_clusters): # Choose center candidates by sampling with probability proportional # to the squared distance to the closest existing center rand_vals = random_state.random_sample(n_local_trials) * current_pot candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals) # Compute distances to center candidates distance_to_candidates = euclidean_distances( X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) # Decide which candidate is the best best_candidate = None best_pot = None best_dist_sq = None for trial in range(n_local_trials): # Compute potential when including center candidate new_dist_sq = np.minimum(closest_dist_sq, distance_to_candidates[trial]) new_pot = new_dist_sq.sum() # Store result if it is the best local trial so far if (best_candidate is None) or (new_pot < best_pot): best_candidate = candidate_ids[trial] best_pot = new_pot best_dist_sq = new_dist_sq # Permanently add best center candidate found in local tries if sp.issparse(X): centers[c] = X[best_candidate].toarray() else: centers[c] = X[best_candidate] current_pot = best_pot closest_dist_sq = best_dist_sq return centers
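The core of _k_init is D^2 sampling: candidates are drawn with probability proportional to the squared distance to the nearest chosen center, implemented above with a cumulative sum plus searchsorted. The step in isolation, with made-up distances:

import numpy as np

rng = np.random.RandomState(0)
closest_dist_sq = np.array([0.0, 4.0, 1.0, 9.0, 16.0])   # D^2 per sample
current_pot = closest_dist_sq.sum()

# Inverse-CDF sampling: samples with larger D^2 are drawn more often,
# and a point that is already a center (D^2 == 0) is almost never drawn.
rand_vals = rng.random_sample(3) * current_pot
candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

assert ((candidate_ids >= 0) & (candidate_ids < len(closest_dist_sq))).all()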
def convert_sklearn_kmeans(scope, operator, container): """ Computation graph of distances to all centroids for a batch of examples. Note that a centroid is just the center of a cluster. We use ``[]`` to denote the dimension of a variable; for example, ``X[3, 2]`` means that *X* is a *3-by-2* tensor. In addition, for a matrix *X*, $X'$ denotes its transpose. Symbols: * *l*: # of examples. * *n*: # of features per input example. * *X*: input examples, l-by-n tensor. * *C*: centroids, k-by-n tensor. * :math:`C^2`: squared 2-norms of all centroid vectors, its shape is ``[k]``. * *Y*: 2-norms of the differences between examples and centroids, an *l-by-k* tensor. The value at the i-th row and k-th column, ``Y[i,k]``, is the distance from example *i* to centroid *k*. * *L*: the id of the nearest centroid for each input example, its shape is ``[l]``. :: .------------------------------------------------------. | | | v X [l, n] --> ReduceSumSquare -> X^2 [l] Gemm (alpha=-2, transB=1) | | |- C [k, n] | | | v `------> Add <-- -2XC' [l, k] | v C^2 [k] --------> Add <----- Z [l, k] | v L [l] <-- ArgMin <-- Y2 [l, k] --> Sqrt --> Y [l, k] *scikit-learn* code: :: X = data Y = model.cluster_centers_ XX = row_norms(X, squared=True) YY = row_norms(Y, squared=True) distances = safe_sparse_dot(X, Y.T, dense_output=True) distances *= -2 distances += XX[:, numpy.newaxis] distances += YY[numpy.newaxis, :] numpy.sqrt(distances, out=distances) """ X = operator.inputs[0] out = operator.outputs op = operator.raw_operator opv = container.target_opset C = op.cluster_centers_ input_name = X dtype = guess_numpy_type(X.type) if dtype != np.float64: dtype = np.float32 if type(X.type) == Int64TensorType: x_cast = OnnxCast(X, to=onnx_proto.TensorProto.FLOAT, op_version=opv) input_name = x_cast C2 = row_norms(C, squared=True).astype(dtype) C = C.astype(dtype) rs = OnnxReduceSumSquare(input_name, axes=[1], keepdims=1, op_version=opv) N = X.type.shape[0] if isinstance(N, int): zeros = np.zeros((N, ), dtype=dtype) else: zeros = OnnxMul(rs, np.array([0], dtype=dtype), op_version=opv) z = OnnxAdd(rs, OnnxGemm(input_name, C, zeros, alpha=-2., transB=1, op_version=opv), op_version=opv) y2 = OnnxAdd(C2, z, op_version=opv) ll = OnnxArgMin(y2, axis=1, keepdims=0, output_names=out[:1], op_version=opv) y2s = OnnxSqrt(y2, output_names=out[1:], op_version=opv) ll.add_to(scope, container) y2s.add_to(scope, container)
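The docstring's graph can be checked numerically: ReduceSumSquare supplies X^2, Gemm(alpha=-2, transB=1) supplies -2XC', and the two Adds complete the squared distance. A small NumPy sketch of the same arithmetic, verified against sklearn's pairwise distances:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
X = rng.randn(7, 3)   # l-by-n examples
C = rng.randn(4, 3)   # k-by-n centroids

# Same three terms as the ONNX graph: X^2 [l], -2XC' [l, k], C^2 [k].
Z = row_norms(X, squared=True)[:, None] - 2.0 * X.dot(C.T)
Y2 = Z + row_norms(C, squared=True)[None, :]

assert np.allclose(np.sqrt(np.maximum(Y2, 0)), pairwise_distances(X, C))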
def normalize(self, X, norm='l2', axis=1, copy=True): """Normalize a dataset along any axis Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to normalize, element by element. scipy.sparse matrices should be in CSR format to avoid an unnecessary copy. norm : 'l1' or 'l2', optional ('l2' by default) The norm to use to normalize each non-zero sample (or each non-zero feature if axis is 0). axis : 0 or 1, optional (1 by default) axis used to normalize the data along. If 1, independently normalize each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default is True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). See also -------- :class:`sklearn.preprocessing.Normalizer` to perform normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ if norm not in ('l1', 'l2'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: sparse_format = 'csc' elif axis == 1: sparse_format = 'csr' else: raise ValueError("'%d' is not a supported axis" % axis) X = check_arrays(X, sparse_format=sparse_format, copy=copy)[0] warn_if_not_float(X, 'The normalize function') if axis == 0: X = X.T if sparse.issparse(X): if norm == 'l1': inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) else: if norm == 'l1': norms = np.abs(X).sum(axis=1) norms[norms == 0.0] = 1.0 elif norm == 'l2': norms = row_norms(X) norms[norms == 0.0] = 1.0 X /= norms[:, np.newaxis] if axis == 0: X = X.T return X
def normalize(X, norm='l2', axis=1, copy=True): """Scale input vectors individually to unit norm (vector length). Parameters ---------- X : array or scipy.sparse matrix with shape [n_samples, n_features] The data to normalize, element by element. scipy.sparse matrices should be in CSR format to avoid an unnecessary copy. norm : 'l1' or 'l2', optional ('l2' by default) The norm to use to normalize each non-zero sample (or each non-zero feature if axis is 0). axis : 0 or 1, optional (1 by default) axis used to normalize the data along. If 1, independently normalize each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). See also -------- :class:`sklearn.preprocessing.Normalizer` to perform normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`) """ if norm not in ('l1', 'l2'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: sparse_format = 'csc' elif axis == 1: sparse_format = 'csr' else: raise ValueError("'%d' is not a supported axis" % axis) X = check_array(X, sparse_format, copy=copy) warn_if_not_float(X, 'The normalize function') if axis == 0: X = X.T if sparse.issparse(X): X = check_array(X, accept_sparse=sparse_format, dtype=np.float64) if norm == 'l1': inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) else: if norm == 'l1': norms = np.abs(X).sum(axis=1) norms[norms == 0.0] = 1.0 elif norm == 'l2': norms = row_norms(X) norms[norms == 0.0] = 1.0 X /= norms[:, np.newaxis] if axis == 0: X = X.T return X
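In the dense l2 case, both normalize variants above reduce to dividing each row by its norm, with all-zero rows left unchanged; a minimal sketch:

import numpy as np
from sklearn.utils.extmath import row_norms

X = np.array([[3.0, 4.0], [0.0, 0.0], [1.0, 1.0]])

norms = row_norms(X)
norms[norms == 0.0] = 1.0          # keep all-zero rows untouched
X_normed = X / norms[:, np.newaxis]

assert np.allclose(row_norms(X_normed), [1.0, 0.0, 1.0])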
def fit(self, X, y, sample_weight=None): if not isinstance(self.C, numbers.Number) or self.C < 0: raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iterations must be positive;" " got (max_iter=%r)" % self.max_iter) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol=%r)" % self.tol) solver = _check_solver(self.solver, self.penalty, self.dual) if solver in ['newton-cg']: _dtype = [np.float64, np.float32] else: _dtype = np.float64 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", accept_large_sparse=solver != 'liblinear') self.X_fit_ = X X_k = self._get_kernel(X) check_classification_targets(y) self.classes_ = np.unique(y) n_samples, n_features = X_k.shape multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_)) if solver == 'liblinear': if effective_n_jobs(self.n_jobs) != 1: warnings.warn("'n_jobs' > 1 does not have any effect when" " 'solver' is set to 'liblinear'. Got 'n_jobs'" " = {}.".format(effective_n_jobs(self.n_jobs))) self.coef_, self.intercept_, n_iter_ = _fit_liblinear( X_k, y, self.C, self.fit_intercept, self.intercept_scaling, self.class_weight, self.penalty, self.dual, self.verbose, self.max_iter, self.tol, self.random_state, sample_weight=sample_weight) self.n_iter_ = np.array([n_iter_]) return self if solver in ['sag', 'saga']: max_squared_sum = row_norms(X_k, squared=True).max() else: max_squared_sum = None n_classes = len(self.classes_) classes_ = self.classes_ if n_classes < 2: raise ValueError("This solver needs samples of at least 2 classes" " in the data, but the data contains only one" " class: %r" % classes_[0]) if len(self.classes_) == 2: n_classes = 1 classes_ = classes_[1:] if self.warm_start: warm_start_coef = getattr(self, 'coef_', None) else: warm_start_coef = None if warm_start_coef is not None and self.fit_intercept: warm_start_coef = np.append(warm_start_coef, self.intercept_[:, np.newaxis], axis=1) self.coef_ = list() self.intercept_ = np.zeros(n_classes) # Hack so that we iterate only once for the multinomial case. if multi_class == 'multinomial': classes_ = [None] warm_start_coef = [warm_start_coef] if warm_start_coef is None: warm_start_coef = [None] * n_classes path_func = delayed(logistic_regression_path) # The SAG solver releases the GIL so it's more efficient to use # threads for this solver. if solver in ['sag', 'saga']: prefer = 'threads' else: prefer = 'processes' fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, **_joblib_parallel_args(prefer=prefer))( path_func(X_k, y, pos_class=class_, Cs=[self.C], fit_intercept=self.fit_intercept, tol=self.tol, verbose=self.verbose, solver=solver, multi_class=multi_class, max_iter=self.max_iter, class_weight=self.class_weight, check_input=False, random_state=self.random_state, coef=warm_start_coef_, penalty=self.penalty, max_squared_sum=max_squared_sum, sample_weight=sample_weight) for class_, warm_start_coef_ in zip(classes_, warm_start_coef)) fold_coefs_, _, n_iter_ = zip(*fold_coefs_) self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0] if multi_class == 'multinomial': self.coef_ = fold_coefs_[0][0] else: self.coef_ = np.asarray(fold_coefs_) self.coef_ = self.coef_.reshape(n_classes, n_features + int(self.fit_intercept)) if self.fit_intercept: self.intercept_ = self.coef_[:, -1] self.coef_ = self.coef_[:, :-1] return self
def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. Parameters ---------- X : array-like or sparse matrix, shape=(n_samples, n_features) Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like, shape (n_samples,), optional The weights for each observation in X. If None, all observations are assigned equal weight (default: None). Returns ------- self Fitted estimator. """ random_state = check_random_state(self.random_state) n_init = self.n_init if n_init <= 0: raise ValueError("Invalid number of initializations." " n_init=%d must be bigger than zero." % n_init) if self.max_iter <= 0: raise ValueError( 'Number of iterations should be a positive number,' ' got %d instead' % self.max_iter) # avoid forcing order when copy_x=False order = "C" if self.copy_x else None X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], order=order, copy=self.copy_x) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), self.n_clusters)) tol = _tolerance(X, self.tol) # If the distances are precomputed every job will create a matrix of # shape (n_clusters, n_samples). To stop KMeans from eating up memory # we only activate this if the created matrix is guaranteed to be # under 100MB. 12 million entries consume a little under 100MB if they # are of type double. precompute_distances = self.precompute_distances if precompute_distances == 'auto': n_samples = X.shape[0] precompute_distances = (self.n_clusters * n_samples) < 12e6 elif isinstance(precompute_distances, bool): pass else: raise ValueError( "precompute_distances should be 'auto' or True/False" ", but a value of %r was passed" % precompute_distances) # Validate init array init = self.init if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype.type, copy=True) _validate_center_shape(X, self.n_clusters, init) if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # subtract the mean of X for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None algorithm = self.algorithm if self.n_clusters == 1: # elkan doesn't make sense for a single cluster, full will produce # the right result. algorithm = "full" if algorithm == "auto": algorithm = "full" if sp.issparse(X) else 'elkan' if algorithm == "full": kmeans_single = _fuzzykmeans_single_lloyd elif algorithm == "elkan": kmeans_single = _fuzzykmeans_single_elkan else: raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" " %s" % str(algorithm)) seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) if effective_n_jobs(self.n_jobs) == 1: # For a single thread, less memory is needed if we just store one # set of the best results (as opposed to one set per run per # thread).
for seed in seeds: # run a k-means once fuzzy_labels, labels, inertia, centers, n_iter_ = kmeans_single( X, self.m, sample_weight, self.n_clusters, max_iter=self.max_iter, init=init, verbose=self.verbose, precompute_distances=precompute_distances, tol=tol, x_squared_norms=x_squared_norms, random_state=seed) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_fuzzy_labels = fuzzy_labels.copy() best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs results = Parallel(n_jobs=self.n_jobs, verbose=0)( delayed(kmeans_single)( X, self.m, sample_weight, self.n_clusters, max_iter=self.max_iter, init=init, verbose=self.verbose, tol=tol, precompute_distances=precompute_distances, x_squared_norms=x_squared_norms, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia fuzzy_labels, labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_fuzzy_labels = fuzzy_labels[best] best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if not sp.issparse(X): if not self.copy_x: X += X_mean best_centers += X_mean distinct_clusters = len(set(best_labels)) if distinct_clusters < self.n_clusters: warnings.warn( "Number of distinct clusters ({}) found smaller than " "n_clusters ({}). Possibly due to duplicate points " "in X.".format(distinct_clusters, self.n_clusters), ConvergenceWarning, stacklevel=2) self.cluster_centers_ = best_centers self.fuzzy_labels_ = best_fuzzy_labels self.labels_ = best_labels self.inertia_ = best_inertia self.n_iter_ = best_n_iter return self
assert indices.shape[0] == n_clusters assert (indices >= 0).all() assert (indices < data.shape[0]).all() # Check for the correct number of seeds and that they are bound by the data assert centers.shape[0] == n_clusters assert (centers.max(axis=0) <= data.max(axis=0)).all() assert (centers.min(axis=0) >= data.min(axis=0)).all() # Check that indices correspond to reported centers # Use X for comparison rather than data, test still works against centers # calculated with sparse data. assert_allclose(X[indices].astype(dtype), centers) @pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None]) def test_kmeans_plusplus_norms(x_squared_norms): # Check that defining x_squared_norms returns the same as default=None. centers, indices = kmeans_plusplus(X, n_clusters, x_squared_norms=x_squared_norms) assert_allclose(X[indices], centers) def test_kmeans_plusplus_dataorder(): # Check that memory layout does not affect the result centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=0) X_fortran = np.asfortranarray(X)
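For context, the public helper exercised by this test fragment is sklearn.cluster.kmeans_plusplus, added in scikit-learn 0.24; a short usage sketch:

import numpy as np
from sklearn.cluster import kmeans_plusplus
from sklearn.utils.extmath import row_norms

X = np.random.RandomState(0).randn(50, 2)

# x_squared_norms is optional; passing it only skips a recomputation.
centers, indices = kmeans_plusplus(
    X, n_clusters=3,
    x_squared_norms=row_norms(X, squared=True),
    random_state=0)

assert centers.shape == (3, 2) and indices.shape == (3,)
assert np.allclose(X[indices], centers)  # centers are actual data points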
def _estimate_log_gaussian_prob(x, means, precisions_chol, covariance_type): """Estimate the log Gaussian probability. Parameters ---------- x : array-like or csr_matrix, shape (n_samples, n_features) means : array-like, shape (n_components, n_features) precisions_chol : array-like, Cholesky decompositions of the precision matrices. 'full' : shape of (n_components, n_features, n_features) 'tied' : shape of (n_features, n_features) 'diag' : shape of (n_components, n_features) 'spherical' : shape of (n_components,) covariance_type : {'full', 'tied', 'diag', 'spherical'} Returns ------- log_prob : array, shape (n_samples, n_components) """ n_samples, n_features = x.shape n_components, _ = means.shape # det(precision_chol) is half of det(precision) log_det = _compute_log_det_cholesky(precisions_chol, covariance_type, n_features) if covariance_type == 'full': log_prob = np.empty((n_samples, n_components)) for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)): if issparse(x): y = x.dot(prec_chol) - np.dot(mu, prec_chol) else: y = np.matmul(x, prec_chol) - np.dot(mu, prec_chol) log_prob[:, k] = np.sum(np.square(y), axis=1) elif covariance_type == 'tied': log_prob = np.empty((n_samples, n_components)) for k, mu in enumerate(means): if issparse(x): y = x.dot(precisions_chol) - np.dot(mu, precisions_chol) else: y = np.dot(x, precisions_chol) - np.dot(mu, precisions_chol) log_prob[:, k] = np.sum(np.square(y), axis=1) elif covariance_type == 'diag': precisions = precisions_chol**2 if issparse(x): log_prob = (np.sum((means**2 * precisions), 1) - 2. * (x * (means * precisions).T) + x.multiply(x).dot(precisions.T)) else: log_prob = (np.sum((means**2 * precisions), 1) - 2. * np.dot(x, (means * precisions).T) + np.dot(x**2, precisions.T)) elif covariance_type == 'spherical': precisions = precisions_chol**2 if issparse(x): log_prob = (np.sum(means**2, 1) * precisions - 2 * (x * (means.T * precisions)) + np.outer(row_norms(x, squared=True), precisions)) else: log_prob = (np.sum(means**2, 1) * precisions - 2 * np.dot(x, means.T * precisions) + np.outer(row_norms(x, squared=True), precisions)) else: # pragma: no cover raise ValueError() return -.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det
def _init_w(self, V, X): """ Initialize the topics W. If self.init='k-means++', we use the init method of sklearn.cluster.KMeans. If self.init='random', topics are initialized with a Gamma distribution. If self.init='k-means', topics are initialized with a KMeans on the n-grams counts. """ if self.init == 'k-means++': if LooseVersion(sklearn_version) < LooseVersion('0.24'): W = _k_init(V, self.n_components, x_squared_norms=row_norms(V, squared=True), random_state=self.random_state, n_local_trials=None) + .1 else: W, _ = kmeans_plusplus(V, self.n_components, x_squared_norms=row_norms(V, squared=True), random_state=self.random_state, n_local_trials=None) W = W + .1 # To avoid restricting topics to few n-grams only elif self.init == 'random': W = self.random_state.gamma(shape=self.gamma_shape_prior, scale=self.gamma_scale_prior, size=(self.n_components, self.n_vocab)) elif self.init == 'k-means': prototypes = get_kmeans_prototypes(X, self.n_components, analyzer=self.analyzer, random_state=self.random_state) W = self.ngrams_count_.transform(prototypes).A + .1 if self.add_words: W2 = self.word_count_.transform(prototypes).A + .1 W = np.hstack((W, W2)) # if k-means doesn't find the exact number of prototypes if W.shape[0] < self.n_components: if LooseVersion(sklearn_version) < LooseVersion('0.24'): W2 = _k_init(V, self.n_components - W.shape[0], x_squared_norms=row_norms(V, squared=True), random_state=self.random_state, n_local_trials=None) + .1 else: W2, _ = kmeans_plusplus(V, self.n_components - W.shape[0], x_squared_norms=row_norms( V, squared=True), random_state=self.random_state, n_local_trials=None) W2 = W2 + .1 W = np.concatenate((W, W2), axis=0) else: raise AttributeError('Initialization method %s does not exist.' % self.init) W /= W.sum(axis=1, keepdims=True) A = np.ones((self.n_components, self.n_vocab)) * 1e-10 B = A.copy() return W, A, B