def partial_fit(self, X, y=None, weights=None):
    """Update k-means estimate on a single mini-batch X.

    Parameters
    ----------
    X: array-like, shape = [n_samples, n_features]
        Coordinates of the data points to cluster.

    weights: array-like, shape = [n_samples], optional
        Per-sample weights (all samples are weighted equally if None).
    """
    X = check_arrays(X, sparse_format="csr", copy=False)[0]
    n_samples, n_features = X.shape
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    if n_samples == 0:
        return self

    x_squared_norms = _squared_norms(X)
    self.random_state_ = check_random_state(self.random_state)
    if (not hasattr(self, 'counts_')
            or not hasattr(self, 'cluster_centers_')):
        # this is the first call to partial_fit on this object:
        # initialize the cluster centers
        self.cluster_centers_ = _init_centroids(
            X, self.n_clusters, self.init,
            random_state=self.random_state_,
            x_squared_norms=x_squared_norms, init_size=self.init_size,
            weights=weights)

        self.initial_cluster_centers_ = self.cluster_centers_.copy()

        self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        random_reassign = False
    else:
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        random_reassign = self.random_state_.randint(
            10 * (1 + self.counts_.min())) == 0

    _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                     self.counts_, np.zeros(0, np.double), 0,
                     random_reassign=random_reassign,
                     random_state=self.random_state_,
                     reassignment_ratio=self.reassignment_ratio,
                     verbose=self.verbose, weights=weights,
                     sphered=self.sphered)

    if self.compute_labels:
        self.labels_, self.inertia_ = _labels_inertia(
            X, x_squared_norms, self.cluster_centers_, weights=weights)

    return self
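# Usage sketch (illustrative, not part of the class): assuming this method
# lives on a MiniBatchKMeans-style estimator as in scikit-learn, an
# out-of-core run feeds the data one chunk at a time and reads the centers
# back from the fitted attribute:
#
#   import numpy as np
#   mbk = MiniBatchKMeans(n_clusters=8, random_state=0)
#   for chunk in np.array_split(X, 100):  # X: hypothetical (n, d) array
#       mbk.partial_fit(chunk)
#   centers = mbk.cluster_centers_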
def fit(self, X, y=None, weights=None):
    """Compute the centroids on X by chunking it into mini-batches.

    Parameters
    ----------
    X: array-like, shape = [n_samples, n_features]
        Coordinates of the data points to cluster.

    weights: array-like, shape = [n_samples], optional
        Per-sample weights (all samples are weighted equally if None).
    """
    random_state = check_random_state(self.random_state)
    if self.k is not None:
        warnings.warn("Parameter k has been replaced by 'n_clusters'"
                      " and will be removed in release 0.14.",
                      DeprecationWarning, stacklevel=2)
        self.n_clusters = self.k
    X = check_arrays(X, sparse_format="csr", copy=False,
                     check_ccontiguous=True, dtype=np.float64)[0]
    if sp.issparse(X):
        raise ValueError("Only dense arrays are supported")

    n_samples, n_features = X.shape
    if n_samples < self.n_clusters:
        raise ValueError("Number of samples smaller than number "
                         "of clusters.")

    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    x_squared_norms = _squared_norms(X)

    if self.tol > 0.0:
        tol = _tolerance(X, self.tol)

        # using tol-based early stopping needs the allocation of a
        # dedicated buffer which can be expensive for high dim data:
        # hence we allocate it outside of the main loop
        old_center_buffer = np.zeros(n_features, np.double)
    else:
        tol = 0.0
        # no need for the center buffer if tol-based early stopping is
        # disabled
        old_center_buffer = np.zeros(0, np.double)

    distances = np.zeros(self.batch_size, dtype=np.float64)
    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    n_iter = int(self.max_iter * n_batches)

    init_size = self.init_size
    if init_size is None:
        init_size = 3 * self.batch_size
    if init_size > n_samples:
        init_size = n_samples
    self.init_size_ = init_size

    validation_indices = random_state.random_integers(
        0, n_samples - 1, init_size)
    X_valid = X[validation_indices]
    x_squared_norms_valid = x_squared_norms[validation_indices]
    weights_valid = (weights[validation_indices]
                     if weights is not None else None)

    # perform several inits with random sub-sets
    best_inertia = None
    for init_idx in range(self.n_init):
        if self.verbose:
            print "Init %d/%d with method: %s" % (
                init_idx + 1, self.n_init, self.init)
        counts = np.zeros(self.n_clusters, dtype=np.int32)

        # TODO: once the `k_means` function works with sparse input we
        # should refactor the following init to use it instead.

        # Initialize the centers using only a fraction of the data as we
        # expect n_samples to be very large when using MiniBatchKMeans
        cluster_centers = _init_centroids(
            X, self.n_clusters, self.init,
            random_state=random_state,
            x_squared_norms=x_squared_norms,
            init_size=init_size, weights=weights,
            sphered=self.sphered)

        # Compute the label assignment on the init dataset
        batch_inertia, centers_squared_diff = _mini_batch_step(
            X_valid, x_squared_norms_valid,
            cluster_centers, counts, old_center_buffer, False,
            distances=distances, verbose=self.verbose,
            weights=weights_valid, sphered=self.sphered)

        # Keep only the best cluster centers across independent inits on
        # the common validation set (the weights must be subsampled to
        # match X_valid)
        _, inertia = _labels_inertia(X_valid, x_squared_norms_valid,
                                     cluster_centers,
                                     weights=weights_valid)
        if self.verbose:
            print "Inertia for init %d/%d: %f" % (
                init_idx + 1, self.n_init, inertia)
        if best_inertia is None or inertia < best_inertia:
            self.cluster_centers_ = cluster_centers
            self.counts_ = counts
            best_inertia = inertia

    # Empty context to be used in place by the convergence check routine
    convergence_context = {}

    self.initial_cluster_centers_ = self.cluster_centers_.copy()

    # Perform the iterative optimization until the final convergence
    # criterion
    for iteration_idx in xrange(n_iter):
        # Sample a minibatch from the full dataset
        minibatch_indices = random_state.random_integers(
            0, n_samples - 1, self.batch_size)
        minibatch_weights = (weights[minibatch_indices]
                             if weights is not None else None)

        # Perform the actual update step on the minibatch data
        batch_inertia, centers_squared_diff = _mini_batch_step(
            X[minibatch_indices],
            x_squared_norms[minibatch_indices],
            self.cluster_centers_, self.counts_,
            old_center_buffer, tol > 0.0, distances=distances,
            # Here we randomly choose whether to perform
            # random reassignment: the choice is done as a function
            # of the iteration index, and the minimum number of
            # counts, in order to force this reassignment to happen
            # every once in a while
            random_reassign=((iteration_idx + 1)
                             % (10 + self.counts_.min()) == 0),
            random_state=random_state,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose, weights=minibatch_weights,
            sphered=self.sphered)

        # Monitor convergence and do early stopping if necessary
        if _mini_batch_convergence(
                self, iteration_idx, n_iter, tol, n_samples,
                centers_squared_diff, batch_inertia,
                convergence_context, verbose=self.verbose):
            break

    if self.compute_labels:
        if self.verbose:
            print 'Computing label assignments and total inertia'
        self.labels_, self.inertia_ = _labels_inertia(
            X, x_squared_norms, self.cluster_centers_, weights=weights)

    return self
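# Usage sketch (illustrative): the optional `weights` argument threads
# per-sample weights through centroid initialization, the mini-batch
# updates and the final label assignment. Assuming the surrounding
# estimator exposes the usual MiniBatchKMeans constructor parameters:
#
#   import numpy as np
#   rng = np.random.RandomState(0)
#   X = rng.rand(1000, 5)
#   w = rng.rand(1000)  # hypothetical per-sample weights
#   mbk = MiniBatchKMeans(n_clusters=4, batch_size=100, random_state=0)
#   mbk.fit(X, weights=w)
#   print mbk.inertia_  # weighted inertia over the full dataset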