Example #1
 def _init_w(self, V, X):
     """
     Initialize the topics W.
     If self.init='k-means++', we use the init method of
     sklearn.cluster.KMeans.
     If self.init='random', topics are initialized with a Gamma
     distribution.
     If self.init='k-means', topics are initialized with a KMeans on the
     n-grams counts.
     """
     if self.init == 'k-means++':
         if LooseVersion(sklearn_version) < LooseVersion('0.24'):
             W = _k_init(
                 V, self.n_components,
                 x_squared_norms=row_norms(V, squared=True),
                 random_state=self.random_state,
                 n_local_trials=None) + .1
         else:
             W, _ = kmeans_plusplus(
                 V, self.n_components,
                 x_squared_norms=row_norms(V, squared=True),
                 random_state=self.random_state,
                 n_local_trials=None)
             W = W + .1  # To avoid restricting topics to only a few n-grams
     elif self.init == 'random':
         W = self.random_state.gamma(
             shape=self.gamma_shape_prior, scale=self.gamma_scale_prior,
             size=(self.n_components, self.n_vocab))
     elif self.init == 'k-means':
         prototypes = get_kmeans_prototypes(
             X, self.n_components, random_state=self.random_state)
         W = self.ngrams_count_.transform(prototypes).A + .1
         if self.add_words:
             W2 = self.word_count_.transform(prototypes).A + .1
             W = np.hstack((W, W2))
         # if k-means doesn't find the exact number of prototypes
         if W.shape[0] < self.n_components:
             if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                 W2 = _k_init(
                     V, self.n_components - W.shape[0],
                     x_squared_norms=row_norms(V, squared=True),
                     random_state=self.random_state,
                     n_local_trials=None) + .1
             else:
                 W2, _ = kmeans_plusplus(
                     V, self.n_components - W.shape[0],
                     x_squared_norms=row_norms(V, squared=True),
                     random_state=self.random_state,
                     n_local_trials=None)
                 W2 = W2 + .1
             W = np.concatenate((W, W2), axis=0)
     else:
         raise AttributeError(
             'Initialization method %s does not exist.' % self.init)
     W /= W.sum(axis=1, keepdims=True)
     A = np.ones((self.n_components, self.n_vocab)) * 1e-10
     B = A.copy()
     return W, A, B
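A minimal standalone sketch of the pattern Example #1 relies on (assuming scikit-learn >= 0.24; the random matrix is only a stand-in for the n-gram count matrix and is not part of the original class): seed the topic rows with kmeans_plusplus, offset them, and normalize each row.

import numpy as np
from sklearn.cluster import kmeans_plusplus

rng = np.random.RandomState(0)
V = rng.rand(100, 20)                       # stand-in for the n-gram count matrix
W, _ = kmeans_plusplus(V, n_clusters=5, random_state=rng)
W = W + .1                                  # avoid topics restricted to only a few n-grams
W /= W.sum(axis=1, keepdims=True)           # each topic row now sums to 1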
Example #2
def test_kmeans_plusplus_dataorder():
    # Check that memory layout does not affect the result
    centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=0)

    X_fortran = np.asfortranarray(X)

    centers_fortran, _ = kmeans_plusplus(X_fortran, n_clusters, random_state=0)

    assert_allclose(centers_c, centers_fortran)
Example #3
def test_kmeans_plusplus_norms(x_squared_norms):
    # Check that passing x_squared_norms explicitly gives the same result as the default (None).
    centers, indices = kmeans_plusplus(X,
                                       n_clusters,
                                       x_squared_norms=x_squared_norms)

    assert_allclose(X[indices], centers)
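The x_squared_norms value in Example #3 comes from a test fixture that is not shown on this page; it is just the precomputed squared row norms, for example (a small sketch, assuming the same X as above):

from sklearn.utils.extmath import row_norms

# ||x_i||^2 for every row, so kmeans_plusplus does not have to recompute them
x_squared_norms = row_norms(X, squared=True)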
Example #4
 def __kmeans_plus_plus_init(self, X):
     # Initial centers.
     # The first centroid is chosen at random; each subsequent one is chosen
     # with a weighted probability p_i = cost_i / sum_j(cost_j), where cost_i is
     # the squared distance of data point x_i to its nearest already-chosen
     # centroid.
     self.cluster_centers_, _ = kmeans_plusplus(X,
                                                n_clusters=self.n_clusters)
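The comment in Example #4 describes the D^2 weighting used by k-means++; a rough standalone NumPy sketch of that selection rule (illustrative only, not scikit-learn's optimized implementation, which additionally evaluates n_local_trials candidates per step):

import numpy as np

def naive_kmeans_plusplus(X, n_clusters, seed=0):
    rng = np.random.RandomState(seed)
    centers = [X[rng.randint(len(X))]]                    # first centroid: uniform at random
    for _ in range(n_clusters - 1):
        diffs = X[:, None, :] - np.asarray(centers)[None, :, :]
        cost = np.min((diffs ** 2).sum(axis=-1), axis=1)  # squared distance to nearest chosen centroid
        p = cost / cost.sum()                             # p_i = cost_i / sum_j(cost_j)
        centers.append(X[rng.choice(len(X), p=p)])
    return np.array(centers)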
Example #5
def kmeans_pp(inst):

    random_state = check_random_state(None)
    x_squared_norms = row_norms(inst.X, squared=True)
    centers, indices = kmeans_plusplus(inst.X,
                                       inst.p,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)

    return indices
Example #6
    def initialize_arg(self, X):
        '''
        Initialize EM algorithm
        '''
        n_samples, n_features = X.shape

        log_pi = np.log(np.full((1, self.n_clusters), 1 / self.n_clusters))

        low, _ = kmeans_plusplus(X, self.n_clusters)
        high = low
        scale = np.ones((self.n_clusters, n_features)) / self.n_clusters

        return {"log_pi": log_pi, "low": low, "high": high, "scale": scale}
Example #7
def test_kmeans_plusplus_output(data, dtype):
    # Check for the correct number of seeds and all positive values
    data = data.astype(dtype)
    centers, indices = kmeans_plusplus(data, n_clusters)

    # Check there are the correct number of indices and that all indices are
    # positive and within the number of samples
    assert indices.shape[0] == n_clusters
    assert (indices >= 0).all()
    assert (indices < data.shape[0]).all()

    # Check for the correct number of seeds and that they are bound by the data
    assert centers.shape[0] == n_clusters
    assert (centers.max(axis=0) <= data.max(axis=0)).all()
    assert (centers.min(axis=0) >= data.min(axis=0)).all()

    # Check that indices correspond to reported centers
    # Use X for the comparison rather than data; the test still works against
    # centers calculated from sparse data.
    assert_allclose(X[indices].astype(dtype), centers)
Example #8
def test_kmeans_plusplus_wrong_params(param, match):
    with pytest.raises(ValueError, match=match):
        kmeans_plusplus(X, n_clusters, **param)
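The param and match values in Example #8 come from a pytest.mark.parametrize decorator that was not captured on this page; a hedged sketch of one way to drive the test (the specific invalid argument and message fragment are assumptions, not copied from the original test suite):

import numpy as np
import pytest
from sklearn.cluster import kmeans_plusplus

X = np.random.RandomState(0).rand(10, 2)
n_clusters = 3

@pytest.mark.parametrize("param, match", [
    # an x_squared_norms vector whose length does not match X should be rejected
    ({"x_squared_norms": np.ones(5)}, "x_squared_norms"),
])
def test_kmeans_plusplus_wrong_params(param, match):
    with pytest.raises(ValueError, match=match):
        kmeans_plusplus(X, n_clusters, **param)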
Example #9
from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Generate sample data
n_samples = 4000
n_components = 4

X, y_true = make_blobs(n_samples=n_samples,
                       centers=n_components,
                       cluster_std=0.60,
                       random_state=0)
X = X[:, ::-1]

# Calculate seeds from kmeans++
centers_init, indices = kmeans_plusplus(X, n_clusters=4,
                                        random_state=0)

# Plot init seeds alongside the sample data
plt.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']

for k, col in enumerate(colors):
    cluster_data = y_true == k
    plt.scatter(X[cluster_data, 0], X[cluster_data, 1],
                c=col, marker='.', s=10)

plt.scatter(centers_init[:, 0], centers_init[:, 1], c='b', s=50)
plt.title("K-Means++ Initialization")
plt.xticks([])
plt.yticks([])
#plt.show()
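A natural follow-up to this snippet (not part of the original example) is to pass the seeds to KMeans as an explicit initialization; a short sketch:

from sklearn.cluster import KMeans

# n_init=1 because the initialization is fully determined by centers_init
km = KMeans(n_clusters=4, init=centers_init, n_init=1, random_state=0).fit(X)
labels = km.labels_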
Example #10
    def update(self, g: RandomWalkGraph, lr=None) -> int:
        """
        update the clusters on a single batch of random walks
        :param g: the graph being clustered
        :param lr: (optional) learning rate parameter. If specified, overrides the object's lr property
        :return: number of float values stored in the weight matrix
        """

        lr = lr or self.lr
        eq_sample_count = 1 / lr - 1

        # get walks
        walks = [
            set(
                g.unweighted_random_walk(
                    length=random.randint(self.min_len, self.max_len)))
            for _ in range(self.batch_size)
        ]

        # assign new nodes
        all_states = set().union(*walks)
        unmapped = all_states - set(self.map)
        if len(unmapped) > 0:
            self.map.update(
                dict(
                    zip(unmapped,
                        range(len(self.map),
                              len(self.map) + len(unmapped)))))
            if self.centers is not None:
                self.centers.resize(self.n_clusters, len(self.map))

        # convert walks to binary
        x = self._walks_to_matrix(walks)

        # initialize if this is the first iteration
        if self.centers is None:
            self.centers = sparse.csr_matrix(
                normalize(kmeans_plusplus(x, self.n_clusters)[0],
                          norm='l2',
                          axis=1,
                          copy=False))

        # compute distances (dot products)
        dots = self.centers.dot(x.T)
        winners = dots.argmax(axis=0).A1

        # compute data means by cluster
        onehot_labels = sparse.csr_matrix(
            (np.ones(self.batch_size), (winners, np.arange(self.batch_size))),
            shape=(self.n_clusters, self.batch_size))
        cluster_means = onehot_labels.dot(x)
        cluster_counts = onehot_labels.sum(axis=1).A1
        row_indices, _ = cluster_means.nonzero()
        cluster_means.data /= cluster_counts[row_indices]

        # derive a learning rate for each cluster, based on number of samples
        lr_by_cluster = cluster_counts / (eq_sample_count + cluster_counts)

        # do weighted average
        cluster_means.data *= lr_by_cluster[row_indices]
        row_indices, _ = self.centers.nonzero()
        self.centers.data *= (1 - lr_by_cluster)[row_indices]
        self.centers += cluster_means

        # prune
        self.centers.data[self.centers.data < self.drop_threshold] = 0
        self.centers.eliminate_zeros()

        # reproject cluster centers
        self.centers = normalize(self.centers, norm='l2', axis=1, copy=False)

        return len(self.centers.data)
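The per-cluster learning rate in update() behaves like an exponential moving average with 1 / lr - 1 'virtual' prior samples per cluster; a tiny standalone illustration of that rule (the counts are made up for the example):

import numpy as np

lr = 0.1
eq_sample_count = 1 / lr - 1                     # 9 virtual prior samples
cluster_counts = np.array([3, 0, 12])            # samples each cluster received in this batch
lr_by_cluster = cluster_counts / (eq_sample_count + cluster_counts)
# -> [0.25, 0.0, ~0.571]: clusters that saw more samples move further toward the batch mean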