def __init__(self, g_matrix, g_l, g_k, g_first_center, g_l_multipler):
    '''
    :param g_matrix: the term matrix.
    :param g_l: the l (oversampling) value used by Scalable K-Means++;
        the number of center candidates drawn in each initialization round.
    :param g_k: the final number of centers.
    :param g_first_center: the number of initial centers.
    :param g_l_multipler: determines how many final centers are drawn from
        the initial centers. Past a certain level this reduces computation,
        but if set carelessly the number of centers can fall below k, making
        K-Means impossible to run; a relatively small value yields better
        centers at the cost of more computation and a longer runtime.

        Example: k = 350, l = 0.5
        Intermediate result: 696 centers selected.
        Time to reduce down to k (350): roughly 10 minutes 30 seconds.
    '''
    self.matrix = g_matrix
    self.l = g_l
    self.k = g_k
    self.multipler = g_l_multipler
    self.norm = kmean.row_norms(self.matrix, True)
    self.fc = g_first_center
    self.init_center, self.init_center_index = self.PickFirstCenter()
    self.process_center = self.init_center
    self.process_center_index = self.init_center_index
    self.center_distance = 0
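# For context, a minimal sketch (not part of this class) of the
# candidate-oversampling round that the `g_l` value controls in Scalable
# K-Means++ (k-means||): each round draws roughly l new center candidates,
# sampling every point with probability proportional to its squared distance
# to the nearest current center. Names below are illustrative assumptions,
# not identifiers from this codebase.
import numpy as np

def oversample_round(X, centers, l, rng):
    # squared distance from each point to its closest current center
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2).min(axis=1)
    # expected picks per point: l * d2 / sum(d2), as in k-means||
    probs = l * d2 / d2.sum()
    picked = rng.random_sample(len(X)) < probs
    return X[picked]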
def test_fit_given_init(self, X_blobs):
    X_ = X_blobs.compute()
    x_squared_norms = k_means_.row_norms(X_, squared=True)
    rs = np.random.RandomState(0)
    init = k_means_._k_init(X_, 3, x_squared_norms, rs)
    dkkm = DKKMeans(3, init=init, random_state=rs)
    skkm = SKKMeans(3, init=init, random_state=rs, n_init=1)
    dkkm.fit(X_blobs)
    skkm.fit(X_)
    assert_eq(dkkm.inertia_, skkm.inertia_)
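# The test above assumes a pytest fixture named `X_blobs` that supplies a
# chunked dask array. A plausible sketch of such a fixture, mirroring the
# data construction of the standalone test below (the real fixture lives
# elsewhere in the suite):
import dask.array as da
import pytest
import sklearn.datasets

@pytest.fixture
def X_blobs():
    X, _ = sklearn.datasets.make_blobs(n_samples=1000, n_features=4,
                                       random_state=1)
    return da.from_array(X, chunks=500)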
def test_fit_given_init(self):
    X, y = sklearn.datasets.make_blobs(n_samples=1000, n_features=4,
                                       random_state=1)
    X = da.from_array(X, chunks=500)
    X_ = X.compute()
    x_squared_norms = k_means_.row_norms(X_, squared=True)
    rs = np.random.RandomState(0)
    init = k_means_._k_init(X_, 3, x_squared_norms, rs)
    dkkm = DKKMeans(3, init=init, random_state=0)
    skkm = SKKMeans(3, init=init, random_state=0, n_init=1)
    dkkm.fit(X)
    skkm.fit(X_)
    assert_eq(dkkm.inertia_, skkm.inertia_)
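# Both tests seed the comparison with sklearn's private `_k_init`
# (k-means++ seeding). A simplified NumPy sketch of that algorithm,
# omitting sklearn's local-trials optimization:
import numpy as np

def kmeans_pp_init(X, n_clusters, rng):
    # first center: a uniformly random point
    centers = [X[rng.randint(len(X))]]
    for _ in range(n_clusters - 1):
        # squared distance to the nearest already-chosen center
        d2 = np.min([((X - c) ** 2).sum(axis=1) for c in centers], axis=0)
        # next center sampled proportionally to squared distance
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)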
def test_row_norms(X_blobs):
    result = row_norms(X_blobs, squared=True)
    expected = k_means_.row_norms(X_blobs.compute(), squared=True)
    assert_eq(result, expected)
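# For reference, `row_norms(X, squared=True)` is simply the per-row squared
# L2 norm. A NumPy equivalent for dense input (sklearn uses the same einsum
# internally):
import numpy as np

def row_norms_squared(X):
    return np.einsum('ij,ij->i', X, X)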
def fit(self, X, y=None):
    """Compute the centroids on X by chunking it into mini-batches.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
    """
    random_state = check_random_state(self.random_state)
    X = check_array(X, accept_sparse="csr", order='C',
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if n_samples < self.n_clusters:
        raise ValueError("Number of samples smaller than number "
                         "of clusters.")

    n_init = self.n_init
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in MiniBatchKMeans instead of '
                'n_init=%d' % self.n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    x_squared_norms = k_means_.row_norms(X, squared=True)

    if self.tol > 0.0:
        tol = k_means_._tolerance(X, self.tol)

        # using tol-based early stopping needs the allocation of a
        # dedicated buffer beforehand, which can be expensive for
        # high-dim data: hence we allocate it outside of the main loop
        old_center_buffer = np.zeros(n_features, dtype=X.dtype)
    else:
        tol = 0.0
        # no need for the center buffer if tol-based early stopping is
        # disabled
        old_center_buffer = np.zeros(0, dtype=X.dtype)

    distances = np.zeros(self.batch_size, dtype=X.dtype)
    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    n_iter = int(self.max_iter * n_batches)

    init_size = self.init_size
    if init_size is None:
        init_size = 3 * self.batch_size
    if init_size > n_samples:
        init_size = n_samples
    self.init_size_ = init_size

    validation_indices = random_state.randint(0, n_samples, init_size)
    X_valid = X[validation_indices]
    x_squared_norms_valid = x_squared_norms[validation_indices]

    # perform several inits with random sub-sets
    best_inertia = None
    for init_idx in range(n_init):
        if self.verbose:
            print("Init %d/%d with method: %s"
                  % (init_idx + 1, n_init, self.init))
        counts = np.zeros(self.n_clusters, dtype=np.int32)

        # TODO: once the `k_means` function works with sparse input we
        # should refactor the following init to use it instead.

        # Initialize the centers using only a fraction of the data as we
        # expect n_samples to be very large when using MiniBatchKMeans
        cluster_centers = k_means_._init_centroids(
            X, self.n_clusters, self.init,
            random_state=random_state,
            x_squared_norms=x_squared_norms, init_size=init_size)

        # Compute the label assignment on the init dataset
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X_valid, x_squared_norms[validation_indices],
            cluster_centers, counts, old_center_buffer, False,
            distances=None, verbose=self.verbose)

        # Keep only the best cluster centers across independent inits on
        # the common validation set
        _, inertia = k_means_._labels_inertia(X_valid,
                                              x_squared_norms_valid,
                                              cluster_centers)
        if self.verbose:
            print("Inertia for init %d/%d: %f"
                  % (init_idx + 1, n_init, inertia))
        if best_inertia is None or inertia < best_inertia:
            self.cluster_centers_ = cluster_centers
            self.counts_ = counts
            best_inertia = inertia

    # Empty context to be used inplace by the convergence check routine
    convergence_context = {}

    # Perform the iterative optimization until the final convergence
    # criterion
    for iteration_idx in range(n_iter):
        # Sample a minibatch from the full dataset
        minibatch_indices = random_state.randint(
            0, n_samples, self.batch_size)

        # Perform the actual update step on the minibatch data
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X[minibatch_indices],
            x_squared_norms[minibatch_indices],
            self.cluster_centers_, self.counts_,
            old_center_buffer, tol > 0.0, distances=distances,
            # Here we randomly choose whether to perform
            # random reassignment: the choice is done as a function
            # of the iteration index, and the minimum number of
            # counts, in order to force this reassignment to happen
            # every once in a while
            random_reassign=((iteration_idx + 1)
                             % (10 + self.counts_.min()) == 0),
            random_state=random_state,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose)

        # Monitor convergence and do early stopping if necessary
        if k_means_._mini_batch_convergence(
                self, iteration_idx, n_iter, tol, n_samples,
                centers_squared_diff, batch_inertia,
                convergence_context, verbose=self.verbose):
            break

    self.n_iter_ = iteration_idx + 1

    if self.compute_labels:
        self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

    return self
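# A minimal usage sketch for a `fit` with this shape, assuming the standard
# scikit-learn MiniBatchKMeans constructor (if the method above belongs to a
# subclass, the call pattern is the same):
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=10000, centers=5, random_state=0)
mbk = MiniBatchKMeans(n_clusters=5, batch_size=100, max_iter=10,
                      random_state=0).fit(X)
print(mbk.cluster_centers_.shape)  # (5, 2) with make_blobs' default features
print(mbk.inertia_, mbk.n_iter_)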
def fit(self, X, y, sample_weight=None):
    """Compute k-means-- clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory copy
        if the given data is not C-contiguous.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    self
        Fitted estimator.
    """
    random_state = check_random_state(self.random_state)

    n_init = self.n_init
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)

    if self.max_iter <= 0:
        raise ValueError(
            "Number of iterations should be a positive number,"
            " got %d instead" % self.max_iter)

    # avoid forcing order when copy_x=False
    order = "C" if self.copy_x else None
    X = check_array(X, accept_sparse="csr",
                    dtype=[np.float64, np.float32],
                    order=order, copy=self.copy_x)
    # verify that the number of samples given is larger than k
    if _num_samples(X) < self.n_clusters:
        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
            _num_samples(X), self.n_clusters))

    tol = _tolerance(X, self.tol)

    # If the distances are precomputed every job will create a matrix of
    # shape (n_clusters, n_samples). To stop KMeans from eating up memory
    # we only activate this if the created matrix is guaranteed to be
    # under 100MB. 12 million entries consume a little under 100MB if they
    # are of type double.
    precompute_distances = self.precompute_distances
    if precompute_distances == "auto":
        n_samples = X.shape[0]
        precompute_distances = (self.n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError(
            "precompute_distances should be 'auto' or True/False"
            ", but a value of %r was passed" % precompute_distances)

    # Validate init array
    init = self.init
    if hasattr(init, "__array__"):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, self.n_clusters, init)

        if n_init != 1:
            warnings.warn(
                "Explicit initial center position passed: "
                "performing only one init in k-means instead of n_init=%d"
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # subtract mean of X for more accurate distance computations
    if not sp.issparse(X):
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

        if hasattr(init, "__array__"):
            init -= X_mean

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    kmeans_single = _k_means_minus_minus

    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)

    if effective_n_jobs(self.n_jobs) == 1:
        # For a single thread, less memory is needed if we just store one
        # set of the best results (as opposed to one set per run per
        # thread).
        for seed in seeds:
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_single(
                X, sample_weight, self.n_clusters, self.prop_outliers,
                max_iter=self.max_iter, init=init, verbose=self.verbose,
                precompute_distances=precompute_distances, tol=tol,
                x_squared_norms=x_squared_norms, random_state=seed)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        results = Parallel(n_jobs=self.n_jobs, verbose=0)(
            delayed(kmeans_single)(
                X, sample_weight, self.n_clusters, self.prop_outliers,
                max_iter=self.max_iter, init=init, verbose=self.verbose,
                tol=tol, precompute_distances=precompute_distances,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not self.copy_x:
            X += X_mean
        best_centers += X_mean

    distinct_clusters = len(set(best_labels))
    if distinct_clusters < self.n_clusters:
        warnings.warn(
            "Number of distinct clusters ({}) found smaller than "
            "n_clusters ({}). Possibly due to duplicate points "
            "in X.".format(distinct_clusters, self.n_clusters),
            ConvergenceWarning, stacklevel=2)

    self.cluster_centers_ = best_centers
    self.labels_ = best_labels
    self.inertia_ = best_inertia
    self.n_iter_ = best_n_iter
    return self
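# Usage sketch for the k-means-- estimator above. `KMeansMinusMinus` is a
# placeholder name for the surrounding class (the source does not name it),
# and `prop_outliers` is the fraction of points excluded as outliers during
# each assignment step, matching the constructor argument forwarded to
# `_k_means_minus_minus` in `fit`:
import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (500, 2)),      # cluster around the origin
               rng.normal(8, 1, (500, 2)),      # second cluster
               rng.uniform(-20, 20, (20, 2))])  # a handful of outliers

km = KMeansMinusMinus(n_clusters=2, prop_outliers=0.02, random_state=0)
km.fit(X, y=None)
print(km.cluster_centers_, km.inertia_)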