Example #1
    def fit(self, X):
        x_squared_norms = row_norms(X, squared=True)
        rng = np.random.RandomState(self.random_state)

        if self.init == "kmeans++":
            # Private function of sklearn.cluster.k_means_, to get the initial centers.
            init_centers = _k_init(X, self.n_clusters, x_squared_norms, rng)
        elif self.init == "random":
            random_samples = rng.randint(0, X.shape[0], size=self.n_clusters)
            init_centers = X[random_samples, :]
        else:
            raise ValueError("init should be either kmeans++ or random")

        # Assign initial labels; the ||x||**2 term is dropped since it does not change the argmin
        init_distances = np.sum(init_centers**2, axis=1) - 2 * np.dot(X, init_centers.T)
        init_labels = np.argmin(init_distances, axis=1)
        self.labels_ = init_labels

        self.centers_ = init_centers
        self.n_samples_ = np.zeros(self.n_clusters)

        # Count the number of samples in each cluster.
        for i in range(self.n_clusters):
            self.n_samples_[i] = np.sum(self.labels_ == i)

        for i, (sample, label) in enumerate(zip(X, self.labels_)):
            curr_label = label
            max_cost = np.inf
            while max_cost > 0:
                distances = x_squared_norms[i] - 2 * np.dot(sample, self.centers_.T) + np.sum(self.centers_**2, axis=1)

                curr_distance = distances[curr_label]
                other_distance = np.delete(distances, curr_label)
                curr_n_samples = self.n_samples_[curr_label]
                other_n_samples = np.delete(self.n_samples_, curr_label)
                cost = (curr_n_samples / (curr_n_samples - 1) * curr_distance) - (other_n_samples / (other_n_samples + 1) * other_distance)
                max_cost_ind = np.argmax(cost)
                max_cost = cost[max_cost_ind]

                if max_cost > 0:
                    # We deleted the label index from other_n_samples
                    if max_cost_ind > curr_label:
                        max_cost_ind += 1

                    # Reassign the clusters
                    self.labels_[i] = max_cost_ind

                    self.centers_[curr_label] = (curr_n_samples * self.centers_[curr_label] - sample) / (curr_n_samples - 1)
                    moved_n_samples = self.n_samples_[max_cost_ind]
                    self.centers_[max_cost_ind] = (moved_n_samples * self.centers_[max_cost_ind] + sample) / (moved_n_samples + 1)
                    self.n_samples_[curr_label] -= 1
                    self.n_samples_[max_cost_ind] += 1
                    curr_label = max_cost_ind
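The while-loop above appears to implement a Hartigan-style single-point reassignment: a sample is moved to another cluster only while the gain n_c/(n_c-1)*d(x, c_cur) - n_k/(n_k+1)*d(x, c_k) stays positive. Below is a minimal, self-contained sketch of that test for one sample on toy data (my own illustration, not part of the original snippet):

import numpy as np

# Toy data: two well-separated blobs.
rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2), rng.randn(20, 2) + 5])

# One seed taken from each blob, plus initial labels and cluster sizes.
centers = X[[0, 20]].astype(float)
labels = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1).argmin(1)
counts = np.bincount(labels, minlength=2).astype(float)

# Hartigan reassignment test for sample 0.
x, cur = X[0], labels[0]
d = ((x - centers) ** 2).sum(axis=1)                  # squared distances to all centers
gain = counts[cur] / (counts[cur] - 1) * d[cur] - counts / (counts + 1) * d
gain[cur] = -np.inf                                   # cannot "move" to the current cluster
best = int(gain.argmax())
print("best candidate:", best, "gain:", gain[best], "-> move" if gain[best] > 0 else "-> keep")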
Example #2
    def _fit_one_init(self, X, x_squared_norms, rs):
        n_ts, _, d = X.shape
        sz = min([ts_size(ts) for ts in X])
        self.cluster_centers_ = _k_init(X[:, :sz, :].reshape(
            (n_ts, -1)), self.n_clusters, x_squared_norms, rs).reshape(
                (-1, sz, d))
        old_inertia = numpy.inf

        for it in range(self.max_iter):
            self._assign(X)
            if self.verbose:
                print("%.3f" % self.inertia_, end=" --> ")
            self._update_centroids(X)

            if numpy.abs(old_inertia - self.inertia_) < self.tol:
                break
            old_inertia = self.inertia_
        if self.verbose:
            print("")

        return self
Example #3
    def _fit_one_init(self, X, x_squared_norms, rs):
        n_ts, sz, d = time_series_dataset_shape(X)
        if check_equal_size(X):
            X_ = to_equal_sized_dataset(X)
        else:
            X_ = TimeSeriesResampler(sz=sz).fit_transform(X)
        self.cluster_centers_ = _k_init(X_.reshape(
            (n_ts, -1)), self.n_clusters, x_squared_norms, rs).reshape(
                (-1, sz, d))
        old_inertia = numpy.inf

        for it in range(self.max_iter):
            self._assign(X)
            if self.verbose:
                print("%.3f" % self.inertia_, end=" --> ")
            self._update_centroids(X)

            if numpy.abs(old_inertia - self.inertia_) < self.tol:
                break
            old_inertia = self.inertia_
        if self.verbose:
            print("")

        return self
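Both time-series examples above rely on the same trick: each (sz, d) series is flattened to a single row so that k-means++ seeding can operate on a plain 2-D matrix, and the chosen seeds are folded back to (n_clusters, sz, d). An illustrative, self-contained sketch of that reshape round-trip, using the public kmeans_plusplus helper (scikit-learn >= 0.24) instead of the private _k_init:

import numpy as np
from sklearn.cluster import kmeans_plusplus  # public replacement for _k_init

n_ts, sz, d, n_clusters = 50, 30, 2, 3
X = np.random.RandomState(0).randn(n_ts, sz, d)

flat = X.reshape((n_ts, -1))                         # (n_ts, sz * d)
seeds, _ = kmeans_plusplus(flat, n_clusters, random_state=0)
cluster_centers = seeds.reshape((-1, sz, d))         # (n_clusters, sz, d)
print(cluster_centers.shape)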
Example #4
def kmeans(method,
           X,
           C,
           p,
           km_tol=1e-2,
           gd_tol=1e-3,
           initial_gd_step_size=0.05,
           num_reduction=3,
           batch_size=512):
    """
    only support gradient_descent and minibatch_gradient_descent for now
    """

    squared_norm = (X**2).sum(1)
    centers = _k_init(X, C, squared_norm, np.random.RandomState())

    dist = np.zeros([len(X), C])
    for i, c in enumerate(centers):
        dist[:, i] = compute_distance(c, X, p)
    assign = dist.argmin(1)

    stop = False
    km_ct = 0
    prev_mse = 1000
    reduce_count = 0
    gd_step_size = initial_gd_step_size
    cumu_difference = []
    while not stop:

        # print("KM Iteration", km_ct)
        # b = time.time()
        # update centers
        total_mse = 0
        new = []
        num_non_empty = 0
        for I in range(C):
            mask = assign == I
            # print(I, "Number of Points", mask.sum())
            if mask.sum() == 0:
                continue  # skip empty cluster
            num_non_empty += 1

            if mask.sum() == 1:  # cluster of only one point
                # print("Only One Point, Skip")
                newc = X[mask]
                new.append([I, newc])
                continue

            x0 = X[mask]
            c = centers[I]
            if method == "gd":
                newc, mse, ct, diff = gradient_descent(c,
                                                       x0,
                                                       p,
                                                       step_size=gd_step_size,
                                                       max_step=2000,
                                                       eps=gd_tol)
            elif method == "sgd":
                newc, mse, ct, diff = minibatch_gradient_descent(
                    c,
                    x0,
                    p,
                    batch_size=batch_size,
                    step_size=gd_step_size,
                    max_step=2000,
                    eps=gd_tol)
            elif method == "mean":
                newc = x0.mean(0)
                mse = compute_distance(newc, x0, p).mean()
                ct = 0
                diff = 0

            new.append([I, newc])
            total_mse += mse

        for i, c in new:
            centers[i] = c

        # compute new distance and assignment
        dist = np.zeros([len(X), C])
        for i, c in enumerate(centers):
            dist[:, i] = compute_distance(c, X, p)
        assign = dist.argmin(1)

        # record difference
        total_mse = total_mse / num_non_empty
        abs_diff = prev_mse - total_mse
        cumu_difference.append(abs_diff)
        prev_mse = total_mse

        # stop criterion
        diff_mean = np.abs(np.mean(cumu_difference[-8:]))
        if (diff_mean < km_tol):  # and assign_diff < 0.001 :
            # print( cumu_difference[-8:] )
            if reduce_count >= num_reduction:
                stop = True
                # print("Reached Reduce 3 times, Breakout")
            # print("Update Small Enough, Reduce GD Step Size") # reduce learning rate to improve
            gd_step_size = gd_step_size / 5
            gd_tol = gd_tol / 5
            km_tol = km_tol / 5
            reduce_count += 1

        km_ct += 1

        # e = time.time()
        # print("Duration", (e-b)/60)

    mse = 0
    for i in range(C):
        mask = assign == i
        mse += dist[mask, i].mean()
    mse = mse / C

    return centers, mse, assign, ct
Example #5
def kmeans(data,
           K,
           p,
           normalized,
           eps=1e-4,
           optim_method="rprop",
           step_size=0.001,
           batch_size=None,
           mean_init=True,
           max_km_iteraton=100,
           rs=None,
           gpu=False):
    """
    if batch_size is not None, do Minibatch KMeans. (Not recommended, does not provide speedup)
    """

    model = Find_Center(dim=data.shape[-1], p=p, normalized=normalized)
    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))
    diff = 10000
    prev = 1000
    ct = 0

    centers = torch.FloatTensor(centers)
    data = torch.FloatTensor(data)
    if gpu:
        centers = centers.cuda()
        data = data.cuda()
        model.cuda()

    best_mse = [1e6, 0]
    while diff / prev > eps and ct < max_km_iteraton:
        ct += 1
        # print('iter', ct)

        if batch_size is not None:
            idx = np.random.choice(len(data), batch_size, replace=False)
            data_batch = data[idx]
        else:
            data_batch = data

        # compute assignment
        all_dist = []
        for k in range(K):
            dist = torch_dp(centers[k], data_batch, p, normalized)
            all_dist.append(dist)

        value, assign = torch.stack(all_dist, 0).min(axis=0)

        # update centers
        average_mse = 0  # intra-cluster distance, similar to mean square error for euclidean distance
        track_niter = []
        for k in range(K):
            mask = assign == k
            if mask.sum() == 0:  # skip empty cluster
                continue

            if mask.sum() == 1:
                centers[k] = data_batch[mask][0]
                continue

            d = data_batch[mask]  # data assigned to cluster k

            if mean_init:
                init_c = d.mean(axis=0)  # use mean to initialize
            else:
                init_c = centers[k]

            if optim_method == "mean":
                new_c = d.mean(axis=0)
                se = torch_dp(new_c, d, p, normalized)
                niter, diff = 0, 0
            else:
                new_c, se, niter, diff = gradient_descent_iteration(
                    init_c,
                    d,
                    p,
                    model,
                    optim_method=optim_method,
                    eps=eps,
                    step_size=step_size,
                    max_step=4000)
            centers[k] = new_c

            track_niter.append(niter)
            # all_niters.append(niter)
            if torch.isnan(new_c).any():
                print("Nan!!", k, new_c)
            average_mse += se.sum()

        average_mse = average_mse / data_batch.shape[0]
        diff = torch.abs(average_mse - prev)
        prev = average_mse

        ### Early stop if mse stops decreasing for 10 iterations
        # print(ct, average_mse)
        if average_mse < best_mse[0]:
            best_mse = [average_mse, ct]
        if ct > best_mse[1] + 10:
            print("k-means early stop!")
            break

    # if minibatch, compute all data distance
    if batch_size is not None:
        all_dist = []
        for k in range(K):
            dist = torch_dp(centers[k], data, p, normalized)
            all_dist.append(dist)
        value, assign = torch.stack(all_dist, 0).min(axis=0)
        average_mse = value.mean()

    centers = centers.cpu().detach().numpy()
    average_mse = average_mse.item()
    assign = assign.cpu().detach().numpy()

    return centers, average_mse, assign
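The assignment step in the loop above stacks one distance vector per center and takes the minimum over the center dimension. A self-contained sketch of that pattern, with plain squared Euclidean distance standing in for the torch_dp helper (which is not shown in the example):

import torch

data = torch.randn(100, 3)
centers = torch.randn(4, 3)

# One squared-distance vector per center, then argmin over centers.
all_dist = [((data - c) ** 2).sum(dim=1) for c in centers]
value, assign = torch.stack(all_dist, 0).min(dim=0)   # nearest-center distance and index
print(torch.bincount(assign, minlength=4))            # resulting cluster sizes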
Example #6
def _init_centroids(X,
                    k,
                    init,
                    random_state=None,
                    x_squared_norms=None,
                    init_size=None,
                    weights=None,
                    sphered=False):
    """Compute the initial centroids

    Parameters
    ----------

    X: array, shape (n_samples, n_features)

    k: int
        number of centroids

    init: {'k-means++', 'random' or ndarray or callable} optional
        Method for initialization

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    x_squared_norms:  array, shape (n_samples,), optional
        Squared euclidean norm of each data point. Pass it if you have it at
        hands already to avoid it being recomputed here. Default: None

    init_size : int, optional
        Number of samples to randomly sample for speeding up the
        initialization (sometimes at the expense of accuracy): the
        only algorithm is initialized by running a batch KMeans on a
        random subset of the data. This needs to be larger than k.

    Returns
    -------
    centers: array, shape(k, n_features)
    """
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if init_size is not None and init_size < n_samples:
        if init_size < k:
            warnings.warn("init_size=%d should be larger than k=%d. "
                          "Setting it to 3*k" % (init_size, k),
                          RuntimeWarning,
                          stacklevel=2)
            init_size = 3 * k
        init_indices = random_state.randint(0, n_samples, init_size)
        X = X[init_indices]
        weights = weights[init_indices] if weights is not None else None
        x_squared_norms = x_squared_norms[init_indices]
        n_samples = X.shape[0]
    elif n_samples < k:
        raise ValueError("n_samples=%d should be larger than k=%d" %
                         (n_samples, k))

    if init == 'k-means++':
        assert weights is None and not sphered, "k-means++ initialization is not supported for weighted or sphered data."
        centers = _k_init(X,
                          k,
                          random_state=random_state,
                          x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    elif hasattr(init, '__array__'):
        centers = init
    elif callable(init):
        centers = init(X, k, random_state=random_state, weights=weights)
    else:
        raise ValueError("the init parameter for the k-means should "
                         "be 'k-means++' or 'random' or an ndarray, "
                         "'%s' (type '%s') was passed." % (init, type(init)))

    if sp.issparse(centers):
        centers = centers.toarray()

    if len(centers) != k:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i' %
                         (centers.shape, k))

    return centers
Example #7
 def _init_w(self, V, X):
     """
     Initialize the topics W.
     If self.init='k-means++', we use the init method of
     sklearn.cluster.KMeans.
     If self.init='random', topics are initialized with a Gamma
     distribution.
     If self.init='k-means', topics are initialized with a KMeans on the
     n-grams counts.
     """
     if self.init == 'k-means++':
         if LooseVersion(sklearn_version) < LooseVersion('0.24'):
             W = _k_init(V,
                         self.n_components,
                         x_squared_norms=row_norms(V, squared=True),
                         random_state=self.random_state,
                         n_local_trials=None) + .1
         else:
             W, _ = kmeans_plusplus(V,
                                    self.n_components,
                                    x_squared_norms=row_norms(V,
                                                              squared=True),
                                    random_state=self.random_state,
                                    n_local_trials=None)
             W = W + .1  # To avoid restricting topics to few n-grams only
     elif self.init == 'random':
         W = self.random_state.gamma(shape=self.gamma_shape_prior,
                                     scale=self.gamma_scale_prior,
                                     size=(self.n_components, self.n_vocab))
     elif self.init == 'k-means':
         prototypes = get_kmeans_prototypes(X,
                                            self.n_components,
                                            random_state=self.random_state)
         W = self.ngrams_count_.transform(prototypes).A + .1
         if self.add_words:
             W2 = self.word_count_.transform(prototypes).A + .1
             W = np.hstack((W, W2))
         # if k-means doesn't find the exact number of prototypes
         if W.shape[0] < self.n_components:
             if LooseVersion(sklearn_version) < LooseVersion('0.24'):
                 W2 = _k_init(V,
                              self.n_components - W.shape[0],
                              x_squared_norms=row_norms(V, squared=True),
                              random_state=self.random_state,
                              n_local_trials=None) + .1
             else:
                 W2, _ = kmeans_plusplus(V,
                                         self.n_components - W.shape[0],
                                         x_squared_norms=row_norms(
                                             V, squared=True),
                                         random_state=self.random_state,
                                         n_local_trials=None)
                 W2 = W2 + .1
             W = np.concatenate((W, W2), axis=0)
     else:
         raise AttributeError('Initialization method %s does not exist.' %
                              self.init)
     W /= W.sum(axis=1, keepdims=True)
     A = np.ones((self.n_components, self.n_vocab)) * 1e-10
     B = A.copy()
     return W, A, B
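The version check above exists because _k_init was a private helper; scikit-learn 0.24 replaced it with the public kmeans_plusplus function. A minimal sketch of the public API on its own, with toy data (not taken from the example):

import numpy as np
from sklearn.cluster import kmeans_plusplus
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
centers, indices = kmeans_plusplus(X, n_clusters=4, random_state=0)
print(centers.shape)   # (4, n_features)
print(indices)         # indices of the rows of X chosen as seeds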
Example #8
def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
                    init_size=None, weights=None, sphered=False):
    """Compute the initial centroids

    Parameters
    ----------

    X: array, shape (n_samples, n_features)

    k: int
        number of centroids

    init: {'k-means++', 'random' or ndarray or callable} optional
        Method for initialization

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    x_squared_norms:  array, shape (n_samples,), optional
        Squared euclidean norm of each data point. Pass it if you have it at
        hands already to avoid it being recomputed here. Default: None

    init_size : int, optional
        Number of samples to randomly sample for speeding up the
        initialization (sometimes at the expense of accuracy): the
        only algorithm is initialized by running a batch KMeans on a
        random subset of the data. This needs to be larger than k.

    Returns
    -------
    centers: array, shape(k, n_features)
    """
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if init_size is not None and init_size < n_samples:
        if init_size < k:
            warnings.warn(
                "init_size=%d should be larger than k=%d. "
                "Setting it to 3*k" % (init_size, k),
                RuntimeWarning, stacklevel=2)
            init_size = 3 * k
        init_indices = random_state.randint(0, n_samples, init_size)
        X = X[init_indices]
        weights = weights[init_indices] if weights is not None else None
        x_squared_norms = x_squared_norms[init_indices]
        n_samples = X.shape[0]
    elif n_samples < k:
        raise ValueError(
            "n_samples=%d should be larger than k=%d" % (n_samples, k))

    if init == 'k-means++':
        assert weights is None and not sphered, "k-means++ initialization is not supported for weighted or sphered data."
        centers = _k_init(X, k, random_state=random_state,
                          x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    elif hasattr(init, '__array__'):
        centers = init
    elif callable(init):
        centers = init(X, k, random_state=random_state, weights=weights)
    else:
        raise ValueError("the init parameter for the k-means should "
                         "be 'k-means++' or 'random' or an ndarray, "
                         "'%s' (type '%s') was passed." % (init, type(init)))

    if sp.issparse(centers):
        centers = centers.toarray()

    if len(centers) != k:
        raise ValueError('The shape of the initial centers (%s) '
                         'does not match the number of clusters %i'
                         % (centers.shape, k))

    return centers
Example #9
    def solve(self, X):

        while True:
            if isinstance(self.init, np.ndarray):
                assert X.shape[1] == self.init.shape[1]
                assert self.init.shape[0] == self.k
                assert X.shape[0] > self.init.shape[0]
                init_means = self.init
            elif self.init == 'random':
                init_means = X[self.rng.choice(X.shape[0],
                                               size=self.k,
                                               replace=False)]
            elif self.init == 'k-means++':
                squared_norms = row_norms(X, squared=True)
                init_means = _k_init(X,
                                     n_clusters=self.k,
                                     x_squared_norms=squared_norms,
                                     random_state=self.rng)
            else:
                raise ValueError('Got unrecognized init parameter: {}'.format(
                    self.init))

            self.labels = assign_to_closest(X, init_means)
            self.weights = np.array([
                np.sum(self.labels == i) / X.shape[0] for i in range(self.k)
            ])

            if any(self.weights < self.del_treshold) or any(
                    self.weights * X.shape[0] < X.shape[1] + 2):
                if self.allow_lowering_k:
                    self.k -= 1
                    if self.verbose:
                        logger.info(
                            "Failed initialization, decreasing k to {}".format(
                                self.k))
                else:
                    self.seed += 1
                    self.rng = np.random.RandomState(self.seed)
            else:
                break

        self.means = np.array([
            np.mean(X[np.where(self.labels == i)[0]], axis=0)
            for i in range(self.k)
        ])
        self.covs = np.array([
            np.cov(X[np.where(self.labels == i)[0]].T, bias=True)
            for i in xrange(self.k)
        ])

        self.removed_clusters = self.k * [False]

        removed_now = False
        it = 0
        energies = []
        while it <= self.max_iter:

            change = False
            update_iter = False

            if removed_now:
                removed_now = False
                update_iter = True

            for idx, x in enumerate(X):

                for candidate_cl in range(self.k):

                    current_cl = self.labels[idx]

                    # skip removed cluster or x's current cluster
                    if self.removed_clusters[
                            candidate_cl] or candidate_cl == current_cl:
                        continue

                    current_cost = self.cec_cost(current_cl) + self.cec_cost(
                        candidate_cl)

                    old_weights = self.weights.copy()
                    old_means = self.means.copy()
                    old_covs = self.covs.copy()

                    # calculate energy for the candidate cluster
                    self.calculate_new_params(x, candidate_cl, add=True)
                    cost_added = self.cec_cost(candidate_cl)

                    # calculate energy for current cluster
                    if self.removed_clusters[current_cl]:
                        current_cost = np.inf
                        cost_removed = 0
                    else:
                        self.calculate_new_params(x, current_cl, add=False)
                        cost_removed = self.cec_cost(current_cl)

                    # check if changing x's cluster would result in lower energy
                    if (cost_removed + cost_added) < current_cost:
                        # assign x to new cluster
                        self.labels[idx] = candidate_cl
                        change = True

                        # delete small cluster
                        if not update_iter and not self.removed_clusters[
                                current_cl]:
                            if self.weights[
                                    current_cl] < self.del_treshold or np.sum(
                                        self.labels ==
                                        current_cl) < X.shape[1] + 2:
                                if self.verbose:
                                    logger.info(
                                        "\t Deleting small cluster {}, running updating iteration"
                                        .format(current_cl))
                                self.removed_clusters[current_cl] = True
                                removed_now = True
                                self.max_iter += 1

                    else:
                        self.weights = old_weights
                        self.means = old_means
                        self.covs = old_covs

            # pdb.set_trace()
            if not removed_now:
                energy = np.sum(
                    np.array([
                        self.cec_cost(i) for i in range(self.k)
                        if not self.removed_clusters[i]
                    ]))

                it += 1

                if it == self.max_iter:
                    if self.verbose:
                        logger.warning(
                            "\t Maximum number of iterations reached, final energy: {}"
                            .format(energy))
                    break

                if self.verbose:
                    logger.info("\t Iter {} Enegry {}".format(it, energy))
                    # logger.info("Weights: {}".format(weights))

                if not change:
                    if self.verbose:
                        logger.info("\t No switch in clusters, done")
                    break

                energies.append(energy)

                if len(energies) > 3 and np.std(energies[-3:]) < self.tol:
                    if self.verbose:
                        logger.info(
                            "\t Energy change less than tolerance, done")
                    break

        alive_clusters = np.invert(self.removed_clusters)
        weights = self.weights[alive_clusters]
        means = self.means[alive_clusters]
        covs = self.covs[alive_clusters]

        return energy, self.labels, weights, means, covs
Example #10
 def time_kmeansplusplus(self, *args):
     _k_init(self.X,
             self.n_clusters,
             self.x_squared_norms,
             random_state=np.random.RandomState(0))
Example #11
def kmeans(data,
           K,
           p,
           method="gd",
           eps=1e-4,
           step_size=0.1,
           km_max_step=3000,
           gd_max_step=5000,
           rs=None):

    if method == "gd":
        find_center = gradient_descent
    elif method == "nr":
        find_center = newton_raphson

    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))
    diff = 100
    prev = -10
    ct = 0

    # all_niters = []
    while diff > eps:
        ct += 1
        # print('iter', ct)
        # begin = time.time()
        if ct > km_max_step:
            print("Kmeans reach max steps", average_mse)
            break

        # compute assignment
        all_dist = []
        for k in range(K):
            dist = distance(centers[k], data, p)
            all_dist.append(dist)

        assign = np.stack(all_dist, axis=0).argmin(axis=0)

        # update centers
        average_mse = 0  # intra-cluster distance, similar to mean square error for euclidean distance
        track_niter = []
        for k in range(K):
            mask = assign == k
            if mask.sum() == 0:  # skip empty cluster
                continue

            d = data[mask]  # data assigned to cluster k

            init_c = d.mean(axis=0)  # use mean to initialize
            new_c, se, niter, diff = find_center(init_c,
                                                 d,
                                                 p,
                                                 eps=eps,
                                                 step_size=step_size,
                                                 max_step=gd_max_step)
            centers[k] = new_c
            track_niter.append(niter)
            # all_niters.append(niter)
            # if np.isnan(new_c).any():
            # print("Nan!!", k, new_c)
            average_mse += se

        average_mse = average_mse / data.shape[0]
        diff = np.abs(average_mse - prev)
        prev = average_mse
        # end = time.time() - begin
        # print("duration", end/60)
        # print("iteration", np.mean(track_niter))
        # print("mse", average_mse)
        # break
        # if ct % 100 == 0:
        # print("iter", ct)

    return centers, average_mse, assign, ct
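The gradient_descent and newton_raphson helpers used above are not shown; their role is to find the center minimizing the within-cluster p-th-power distance, which reduces to the ordinary mean only for p = 2. A rough, self-contained sketch of such a gradient-descent center update (my own illustration, not the original helper):

import numpy as np

def gd_center(x0, c0, p=3.0, step_size=0.05, max_step=500, eps=1e-6):
    """Gradient descent on mean(||c - x_i||**p) over the points x0."""
    c = c0.copy()
    for it in range(max_step):
        diff = c - x0                                   # (n, d)
        dist = np.linalg.norm(diff, axis=1) + 1e-12     # guard against 0**(p-2)
        grad = (p * dist[:, None] ** (p - 2) * diff).mean(axis=0)
        c_next = c - step_size * grad
        if np.linalg.norm(c_next - c) < eps:
            c = c_next
            break
        c = c_next
    mse = (np.linalg.norm(c - x0, axis=1) ** p).mean()
    return c, mse, it

cluster = np.random.RandomState(0).randn(50, 2) + 3.0
center, mse, niter = gd_center(cluster, cluster.mean(axis=0))
print(center, mse, niter)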
Example #12
def kmeans_mp(m,
              data,
              K,
              p,
              method="gd",
              eps=1e-4,
              step_size=0.1,
              km_max_step=2000,
              gd_max_step=5000,
              rs=None):
    """
    multiprocess kmeans
    m: number of process to create
    """
    import multiprocessing as mp
    ctx = mp.get_context('fork')
    global MP_X
    MP_X = data

    pool = mp.Pool(processes=m)

    squared_norm = (data**2).sum(axis=1)
    centers = _k_init(data, K, squared_norm, np.random.RandomState(rs))
    diff = 100
    prev = -10
    ct = 0

    # all_niters = []
    kwargs = {"eps": eps, "step_size": step_size, "max_step": gd_max_step}

    # initalize distance for the first run
    dist = []
    for k in range(K):
        d = distance(centers[k], data, p)
        dist.append(d)
    assign = np.stack(dist, axis=0).argmin(axis=0)

    while diff > eps:
        ct += 1
        # print('iter', ct)
        average_mse = 0.0
        begin = time.time()

        # for each cluster, create input to _mp_gd_helper function
        inp = []
        for k in range(K):
            mask = assign == k
            init = data[mask].mean(axis=0)
            d = [method, p, k, init, mask, kwargs]
            inp.append(d)

        # list of output from _mp_gd_helper
        rslt = pool.starmap(_mp_gd_helper, inp)

        dist = []
        niters = []
        mse = 0.0
        for cid, new_c, Xdist, info in rslt:
            centers[cid] = new_c
            dist.append(Xdist)
            niters.append(info[0])
            mse += info[1]

            # if np.isnan(new_c).any():
            # print("Nan!!", k, new_c)

        assign = np.stack(dist, axis=0).argmin(axis=0)

        # average intra-cluster distance, similar to mean square error for euclidean distance
        average_mse = mse / data.shape[0]
        diff = np.abs(average_mse - prev)
        prev = average_mse

        # print("gd iterations", np.mean(niters))
        # print("sse", average_mse)
        # duration = time.time() - begin
        # print("time %.3fm"%(duration/60))
        break  # NOTE: unconditional break, so only a single k-means iteration is run

    pool.close()

    return centers, average_mse, assign
Example #13
x_squared_norms = row_norms(scaled_x_train, squared=True)

if not sp.issparse(scaled_x_train):
        scaled_x_train_mean = scaled_x_train.mean(axis=0)
        scaled_x_train -= scaled_x_train_mean
        
if not sp.issparse(scaled_x_test):
        scaled_x_test_mean = scaled_x_test.mean(axis=0)
        scaled_x_test -= scaled_x_test_mean
        
if not sp.issparse(scaled_data_original):
        scaled_data_original_mean = scaled_data_original.mean(axis=0)
        scaled_data_original -= scaled_data_original_mean
        
#Initializing the centers using k-means++ algorithm implementation of sklearn
centers = _k_init(scaled_x_train, K, random_state=random_state, x_squared_norms=x_squared_norms)


def find_centers(X, n_clusters, centers ):
    
    ''' Function to find centers using Lloyd's algorithm.
        Parameters to be passed: 1. data for which centers are to be found,
                                  2. number of centers & 3. initial centers'''
    
    centers = centers
    K = np.arange(n_clusters)
    i = 0
    
    while True:
        print("Iteration: ", i)
        i = i + 1
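The find_centers function above is cut off in the source. For reference, here is a standalone sketch of one Lloyd iteration (assign every point to its nearest center, then recompute each center as the mean of its points); this is my own illustration, not the missing body of find_centers:

import numpy as np

def lloyd_step(X, centers):
    # Squared distance of every sample to every center: (n_samples, n_clusters).
    d = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=2)
    labels = d.argmin(axis=1)
    new_centers = np.array([
        X[labels == k].mean(axis=0) if np.any(labels == k) else centers[k]
        for k in range(len(centers))
    ])
    return new_centers, labels

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
centers = X[rng.choice(len(X), 3, replace=False)]
for _ in range(10):
    centers, labels = lloyd_step(X, centers)
print(centers)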
Example #14
 def peakmem_kmeansplusplus(self):
     rng = np.random.RandomState(0)
     _k_init(self.X,
             self.n_clusters,
             self.x_squared_norms,
             random_state=rng)
Example #15
def k_means_gpu_sparsity(weight_vector,
                         n_clusters,
                         ratio=0.5,
                         verbosity=0,
                         seed=int(time.time()),
                         gpu_id=0):

    if ratio == 0:

        return k_means_gpu(weight_vector=weight_vector,
                           n_clusters=n_clusters,
                           verbosity=verbosity,
                           seed=seed,
                           gpu_id=gpu_id)

    if ratio == 1:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        else:
            weight_vector_1_mean = np.mean(weight_vector, axis=0)

            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in range(weight_vector.shape[0]):
                weight_vector_compress[v, :] = weight_vector_1_mean

            return weight_vector_compress

    else:

        if n_clusters == 1:

            mean_sample = np.mean(weight_vector, axis=0)

            weight_vector = np.tile(mean_sample, (weight_vector.shape[0], 1))

            return weight_vector

        elif weight_vector.shape[0] == n_clusters:

            return weight_vector

        elif weight_vector.shape[1] == 1:

            return k_means_sparsity(weight_vector,
                                    n_clusters,
                                    ratio,
                                    seed=seed)

        else:
            num_samples = weight_vector.shape[0]
            mean_sample = np.mean(weight_vector, axis=0)

            center_cluster_index = np.argsort(
                np.linalg.norm(weight_vector - mean_sample,
                               axis=1))[:int(num_samples * ratio)]
            weight_vector_1_mean = np.mean(
                weight_vector[center_cluster_index, :], axis=0)

            remaining_cluster_index = np.asarray([
                i for i in np.arange(num_samples)
                if i not in center_cluster_index
            ])

            weight_vector_train = weight_vector[remaining_cluster_index, :]
            init_centers = k_means_._k_init(X=weight_vector_train,
                                            n_clusters=n_clusters - 1,
                                            x_squared_norms=row_norms(
                                                weight_vector_train,
                                                squared=True),
                                            random_state=RandomState(seed))
            centers, labels = kmeans_cuda(samples=weight_vector_train,
                                          clusters=n_clusters - 1,
                                          init=init_centers,
                                          yinyang_t=0,
                                          seed=seed,
                                          device=gpu_id,
                                          verbosity=verbosity)
            weight_vector_compress = np.zeros(
                (weight_vector.shape[0], weight_vector.shape[1]),
                dtype=np.float32)
            for v in center_cluster_index:
                weight_vector_compress[v, :] = weight_vector_1_mean

            for i, v in enumerate(remaining_cluster_index):
                weight_vector_compress[v, :] = centers[labels[i], :]
            return weight_vector_compress