def _init_w(self, V, X):
    """
    Initialize the topics W.
    If self.init='k-means++', we use the init method of
    sklearn.cluster.KMeans.
    If self.init='random', topics are initialized with a Gamma distribution.
    If self.init='k-means', topics are initialized with a KMeans on the
    n-grams counts.
    """
    if self.init == 'k-means++':
        if LooseVersion(sklearn_version) < LooseVersion('0.24'):
            W = _k_init(
                V, self.n_components,
                x_squared_norms=row_norms(V, squared=True),
                random_state=self.random_state,
                n_local_trials=None) + .1
        else:
            W, _ = kmeans_plusplus(
                V, self.n_components,
                x_squared_norms=row_norms(V, squared=True),
                random_state=self.random_state,
                n_local_trials=None)
            W = W + .1  # To avoid restricting topics to a few n-grams only
    elif self.init == 'random':
        W = self.random_state.gamma(
            shape=self.gamma_shape_prior, scale=self.gamma_scale_prior,
            size=(self.n_components, self.n_vocab))
    elif self.init == 'k-means':
        prototypes = get_kmeans_prototypes(
            X, self.n_components, random_state=self.random_state)
        W = self.ngrams_count_.transform(prototypes).A + .1
        if self.add_words:
            W2 = self.word_count_.transform(prototypes).A + .1
            W = np.hstack((W, W2))
        # if k-means doesn't find the exact number of prototypes
        if W.shape[0] < self.n_components:
            if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                W2 = _k_init(
                    V, self.n_components - W.shape[0],
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None) + .1
            else:
                W2, _ = kmeans_plusplus(
                    V, self.n_components - W.shape[0],
                    x_squared_norms=row_norms(V, squared=True),
                    random_state=self.random_state,
                    n_local_trials=None)
                W2 = W2 + .1
            W = np.concatenate((W, W2), axis=0)
    else:
        raise AttributeError(
            'Initialization method %s does not exist.' % self.init)
    W /= W.sum(axis=1, keepdims=True)
    A = np.ones((self.n_components, self.n_vocab)) * 1e-10
    B = A.copy()
    return W, A, B
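# Usage sketch (not part of the snippet above): the version check exists because
# sklearn.cluster.kmeans_plusplus only became public in scikit-learn 0.24; older
# releases expose the same logic as the private helper _k_init. A minimal
# standalone call on toy data looks like this (V_toy and W_init are illustrative
# names, not taken from the original code):
import numpy as np
from sklearn.cluster import kmeans_plusplus

V_toy = np.random.RandomState(0).rand(100, 20)        # e.g. an n-gram count matrix
centers, indices = kmeans_plusplus(V_toy, n_clusters=5, random_state=0)
W_init = centers + .1                                  # same smoothing as above
W_init /= W_init.sum(axis=1, keepdims=True)            # rows normalized to sum to 1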
def test_kmeans_plusplus_dataorder():
    # Check that memory layout does not affect the result
    centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=0)

    X_fortran = np.asfortranarray(X)

    centers_fortran, _ = kmeans_plusplus(X_fortran, n_clusters, random_state=0)

    assert_allclose(centers_c, centers_fortran)
def test_kmeans_plusplus_norms(x_squared_norms):
    # Check that passing x_squared_norms returns the same result as the
    # default None.
    centers, indices = kmeans_plusplus(X, n_clusters,
                                       x_squared_norms=x_squared_norms)

    assert_allclose(X[indices], centers)
def __kmeans_plus_plus_init(self, X):
    # Initial centers
    # The first centroid is chosen uniformly at random and each following one
    # is drawn with a weighted probability p_i = cost_i / SUM(cost_i), where
    # cost_i is the squared distance of the data point x_i to its nearest
    # centroid.
    self.cluster_centers_, _ = kmeans_plusplus(X, n_clusters=self.n_clusters)
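# Illustrative sketch (not from the class above): the weighted probability the
# comment describes can be written directly with numpy. The names below are made
# up for the example; kmeans_plusplus performs this sampling internally (with
# extra local trials), so this only shows the p_i = cost_i / sum(cost_i) idea.
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.rand(200, 2)

centers = [X_toy[rng.randint(len(X_toy))]]      # first centroid: uniform at random
for _ in range(2):                              # pick two more centroids
    # squared distance of each point to its nearest existing centroid
    diffs = X_toy[:, None, :] - np.array(centers)[None, :, :]
    cost = np.min((diffs ** 2).sum(axis=-1), axis=1)
    p = cost / cost.sum()                       # p_i = cost_i / SUM(cost_i)
    centers.append(X_toy[rng.choice(len(X_toy), p=p)])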
def kmeans_pp(inst):
    random_state = check_random_state(None)
    x_squared_norms = row_norms(inst.X, squared=True)
    centers, indices = kmeans_plusplus(inst.X, inst.p,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    return indices
def initialize_arg(self, X):
    '''
    Initialize EM algorithm
    '''
    n_samples, n_features = X.shape
    log_pi = np.log(np.full((1, self.n_clusters), 1 / self.n_clusters))
    low, _ = kmeans_plusplus(X, self.n_clusters)
    high = low
    scale = np.ones((self.n_clusters, n_features)) / self.n_clusters
    return {"log_pi": log_pi, "low": low, "high": high, "scale": scale}
def test_kmeans_plusplus_output(data, dtype):
    # Check for the correct number of seeds and all positive values
    data = data.astype(dtype)
    centers, indices = kmeans_plusplus(data, n_clusters)

    # Check there are the correct number of indices and that all indices are
    # positive and within the number of samples
    assert indices.shape[0] == n_clusters
    assert (indices >= 0).all()
    assert (indices <= data.shape[0]).all()

    # Check for the correct number of seeds and that they are bound by the data
    assert centers.shape[0] == n_clusters
    assert (centers.max(axis=0) <= data.max(axis=0)).all()
    assert (centers.min(axis=0) >= data.min(axis=0)).all()

    # Check that indices correspond to reported centers
    # Use X for comparison rather than data, test still works against centers
    # calculated with sparse data.
    assert_allclose(X[indices].astype(dtype), centers)
def test_kmeans_plusplus_wrong_params(param, match):
    with pytest.raises(ValueError, match=match):
        kmeans_plusplus(X, n_clusters, **param)
from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Generate sample data
n_samples = 4000
n_components = 4

X, y_true = make_blobs(n_samples=n_samples, centers=n_components,
                       cluster_std=0.60, random_state=0)
X = X[:, ::-1]

# Calculate seeds from k-means++
centers_init, indices = kmeans_plusplus(X, n_clusters=4, random_state=0)

# Plot init seeds alongside sample data
plt.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']

for k, col in enumerate(colors):
    cluster_data = y_true == k
    plt.scatter(X[cluster_data, 0], X[cluster_data, 1],
                c=col, marker='.', s=10)

plt.scatter(centers_init[:, 0], centers_init[:, 1], c='b', s=50)
plt.title("K-Means++ Initialization")
plt.xticks([])
plt.yticks([])
# plt.show()
def update(self, g: RandomWalkGraph, lr=None) -> int:
    """
    update the clusters on a single batch of random walks
    :param g: the graph being clustered
    :param lr: (optional) learning rate parameter. If specified, overrides
        the object's lr property
    :return: number of float values stored in the weight matrix
    """
    lr = lr or self.lr
    eq_sample_count = 1 / lr - 1

    # get walks
    walks = [
        set(
            g.unweighted_random_walk(
                length=random.randint(self.min_len, self.max_len)))
        for _ in range(self.batch_size)
    ]

    # assign new nodes
    all_states = set().union(*[walk for walk in walks])
    unmapped = all_states - set(self.map)
    if len(unmapped) > 0:
        self.map.update(
            dict(
                zip(unmapped,
                    range(len(self.map), len(self.map) + len(unmapped)))))
        if self.centers is not None:
            self.centers.resize(self.n_clusters, len(self.map))

    # convert walks to binary
    x = self._walks_to_matrix(walks)

    # initialize if this is the first iteration
    if self.centers is None:
        self.centers = sparse.csr_matrix(
            normalize(kmeans_plusplus(x, self.n_clusters)[0],
                      norm='l2', axis=1, copy=False))

    # compute distances (dot products)
    dots = self.centers.dot(x.T)
    winners = dots.argmax(axis=0).A1

    # compute data means by cluster
    onehot_labels = sparse.csr_matrix(
        (np.ones(self.batch_size), (winners, np.arange(self.batch_size))),
        shape=(self.n_clusters, self.batch_size))
    cluster_means = onehot_labels.dot(x)
    cluster_counts = onehot_labels.sum(axis=1).A1
    row_indices, _ = cluster_means.nonzero()
    cluster_means.data /= cluster_counts[row_indices]

    # derive a learning rate for each cluster, based on number of samples
    lr_by_cluster = cluster_counts / (eq_sample_count + cluster_counts)

    # do weighted average
    cluster_means.data *= lr_by_cluster[row_indices]
    row_indices, _ = self.centers.nonzero()
    self.centers.data *= (1 - lr_by_cluster)[row_indices]
    self.centers += cluster_means

    # prune
    self.centers.data[self.centers.data < self.drop_threshold] = 0
    self.centers.eliminate_zeros()

    # reproject cluster centers
    self.centers = normalize(self.centers, norm='l2', axis=1, copy=False)

    return len(self.centers.data)
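# Worked sketch of the per-cluster step size used above (numbers are illustrative,
# not taken from a real run): eq_sample_count acts as a pseudo-count of samples
# already averaged into each center, so rarely-hit clusters move slowly while
# frequently-hit clusters move close to their batch mean.
import numpy as np

lr = 0.1
eq_sample_count = 1 / lr - 1                      # = 9 "virtual" prior samples
cluster_counts = np.array([1, 9, 90])             # hits per cluster in one batch
lr_by_cluster = cluster_counts / (eq_sample_count + cluster_counts)
# -> [0.1, 0.5, ~0.909]; each center is then updated as
# center <- (1 - lr_by_cluster) * center + lr_by_cluster * cluster_mean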