Example #1
def test_compute_memberships_condensed(data, distance_matrix, distance_matrix_condensed, components, fuzzifier):
    medoids0_idx = random_choice_idx(data, components=components)

    memberships_condensed = _compute_memberships_condensed(distance_matrix_condensed, medoids0_idx, fuzzifier,
                                                           n=data.shape[0])
    memberships_square = _compute_memberships_square(distance_matrix, medoids0_idx, fuzzifier)

    assert np.all(np.isclose(memberships_condensed.sum(axis=1), np.ones((1, data.shape[0]))))
    assert np.all(np.isclose(memberships_square, memberships_condensed))
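This test checks that the implementation working on a condensed (1D, upper-triangular) distance matrix agrees with the one working on the full square matrix. Below is a minimal sketch of how the two representations relate, using scipy.spatial.distance; the fixture contents are an assumption based on the parameter names, not the project's actual conftest:

# Hypothetical illustration of the square/condensed relationship assumed by the test above.
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
data = rng.normal(size=(50, 4))                          # toy dataset, rows = examples

distance_matrix_condensed = pdist(data)                  # 1D vector of the n*(n-1)/2 pairwise distances
distance_matrix = squareform(distance_matrix_condensed)  # equivalent n x n symmetric matrix

assert distance_matrix.shape == (data.shape[0], data.shape[0])
assert np.allclose(squareform(distance_matrix), distance_matrix_condensed)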
Example #2
def test_compute_medoids_square(data, distance_matrix, components, fuzzifier):
    medoids0 = random_choice_idx(data, components=components)

    true_memberships = __compute_memberships_square(distance_matrix, medoids0, fuzzifier)

    true_medoids = __compute_medoids_square(distance_matrix, true_memberships, fuzzifier)
    medoids = _compute_medoids_square(distance_matrix, true_memberships, fuzzifier)

    assert np.all(np.isclose(medoids, true_medoids))
Example #3
def test_compute_memberships_square(data, distance_matrix, components, fuzzifier):
    medoids0 = random_choice_idx(data, components=components)

    true_memberships = __compute_memberships_square(distance_matrix, medoids0, fuzzifier)
    memberships = _compute_memberships_square(distance_matrix, medoids0, fuzzifier)

    assert np.all(np.isclose(true_memberships.sum(axis=1), np.ones((1, data.shape[0]))))
    assert np.all(np.isclose(memberships.sum(axis=1), np.ones((1, data.shape[0]))))
    assert np.all(np.isclose(memberships, true_memberships))
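The snippets above rely on pytest fixtures named data, distance_matrix, distance_matrix_condensed, components and fuzzifier. Here is a minimal conftest.py-style sketch of such fixtures, with their content assumed from the parameter names only (the project's real fixtures may differ):

# Hypothetical conftest.py fixtures matching the names used in the tests above.
import numpy as np
import pytest
from scipy.spatial.distance import pdist, squareform


@pytest.fixture
def data():
    # Small random dataset: 100 examples with 3 features.
    return np.random.RandomState(42).normal(size=(100, 3))


@pytest.fixture
def distance_matrix(data):
    return squareform(pdist(data))


@pytest.fixture
def distance_matrix_condensed(data):
    return pdist(data)


@pytest.fixture
def components():
    return 5


@pytest.fixture
def fuzzifier():
    return 2.0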
Example #4
def test_compute_loss_condensed(data, distance_matrix, distance_matrix_condensed, components, fuzzifier):
    medoids0 = random_choice_idx(data, components=components)
    true_memberships = _compute_memberships_square(distance_matrix, medoids0, fuzzifier)
    true_medoids = _compute_medoids_square(distance_matrix, true_memberships, fuzzifier)

    true_loss = _compute_loss_square(distance_matrix, true_medoids, true_memberships, fuzzifier)
    loss = _compute_loss_condensed(distance_matrix_condensed, true_medoids, true_memberships, fuzzifier,
                                   n=data.shape[0])

    assert np.isclose(true_loss, loss)
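Computing the loss from the condensed matrix requires mapping a pair of example indices (i, j) to a position in the 1D vector instead of rebuilding the square matrix. The sketch below shows the standard SciPy-style index mapping; whether the library uses exactly this helper is an assumption:

# Index of the (i, j) pair (i != j) in a SciPy-style condensed distance matrix over n points.
def condensed_index(i, j, n):
    if i > j:
        i, j = j, i
    return n * i - i * (i + 1) // 2 + (j - i - 1)


# Example: with n = 5 points, the distance between points 1 and 3
# sits at position condensed_index(1, 3, 5) == 5.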
Example #5
def test_compute_medoids_condensed(data, distance_matrix, distance_matrix_condensed, components, fuzzifier):
    medoids0 = random_choice_idx(data, components=components)
    true_memberships = _compute_memberships_square(distance_matrix, medoids0, fuzzifier)

    true_medoids = _compute_medoids_square(distance_matrix, true_memberships, fuzzifier)
    medoids = _compute_medoids_condensed(distance_matrix_condensed, true_memberships, fuzzifier, n=data.shape[0])


    assert np.all(np.isclose(medoids, true_medoids))
Example #6
def minibatch_kmeans(data,
                     components=10,
                     eps=1e-4,
                     max_iter=1000,
                     batch_size=None,
                     weights=None,
                     initialization_method="random_choice",
                     empty_clusters_method="nothing",
                     centroids=None):
    """ Performs the k-means clustering algorithm on a dataset.

    :param data: The dataset into which the clustering will be performed. The dataset must be 2D np.array with rows as
    examples and columns as features.
    :param components: The number of components (clusters) wanted.
    :param eps: Criterion used to define convergence. If the absolute differences between two consecutive losses is
    lower than `eps`, the clustering stop.
    :param max_iter: Criterion used to stop the clustering if the number of iterations exceeds `max_iter`.
    :param batch_size: Number of samples drawn at each iterations. if `None`, is set to 10% of the dataset.
    :param weights: Weighting of each features during clustering. Must be an Iterable of weights with the same size as
    the number of features.
    :param initialization_method: Method used to initialise the centroids. Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Method used at each iteration to handle empty clusters. Can take one of the following
    values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param centroids: Initials centroids to use instead of randomly initialize them.
    :return: A tuple containing :
    * The memberships matrix.
    * The centroids matrix.
    * An array with all losses at each iteration.
    """
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"
    assert 1 <= components <= data.shape[0], \
        "The number of components wanted must be between 1 and %s" % data.shape[0]
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert (batch_size is None) or (is_integer(batch_size) and (0 < batch_size <= data.shape[0])),\
        "The batch_size provided must be an integer between 1 and %s. Given size : %s" %\
        (data.shape[0], batch_size)
    assert (weights is None) or (len(weights) == data.shape[1]),\
        "The number of weights given must be the same as the number of features. Expected size : %s, given size : %s" %\
        (data.shape[1], len(weights))
    assert (centroids is None) or (centroids.shape == (components, data.shape[1])), \
        "The given centroids do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components, data.shape[1]), centroids.shape
        )

    if batch_size is None:
        # If batch_size is not provided, set to 10% of the dataset by default
        batch_size = int(data.shape[0] * 0.1)

    if weights is not None:
        # Applying a weighted euclidean distance is equivalent to applying the traditional euclidean distance to the
        # data scaled by the square root of the weights, see [5]
        data = data * np.sqrt(weights)

    # Initialisation
    if centroids is None:
        centroids = cluster_initialization(data,
                                           components,
                                           strategy=initialization_method,
                                           need_idx=False)

    with tqdm(total=max_iter, bar_format=_FORMAT_PROGRESS_BAR) as progress_bar:
        best_memberships = None
        best_centroids = None
        best_loss = np.inf

        memberships = None
        losses = []
        current_iter = 0
        # Note: the `eps` convergence check ((current_iter < 2) or (abs(losses[-2] - losses[-1]) > eps)) is
        # currently disabled, so the loop only stops once `max_iter` iterations have been performed.
        while current_iter < max_iter:
            # Draw `batch_size` random samples
            minibatch_idx = random_choice_idx(data, batch_size)
            minibatch = data[minibatch_idx, :]

            memberships = _optim_memberships(minibatch, centroids)
            handle_empty_clusters(minibatch,
                                  centroids,
                                  memberships,
                                  strategy=empty_clusters_method)

            centroids = _optim_centroids(minibatch, memberships)

            loss = _compute_loss(data, centroids)
            losses.append(loss)
            if loss < best_loss:
                best_loss = loss
                best_memberships = memberships
                best_centroids = centroids

            # Update the progress bar
            current_iter += 1
            progress_bar.update()
            progress_bar.set_postfix({
                "Loss": "{0:.6f}".format(loss),
                "best_loss": "{0:.6f}".format(best_loss)
            })

    return {
        "memberships": best_memberships,
        "clusters_center": best_centroids,
        "losses": np.array(losses),
        "affectations": best_memberships.argmax(axis=1),
        "extended_time": progress_bar.last_print_t - progress_bar.start_t,
    }
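A minimal usage sketch for minibatch_kmeans, assuming it is importable from the surrounding module (the import path below is a placeholder, not the project's actual one):

# Hypothetical usage of minibatch_kmeans on a synthetic dataset.
import numpy as np
# from <your_module> import minibatch_kmeans   # placeholder import path

data = np.random.RandomState(0).normal(size=(1000, 8))

result = minibatch_kmeans(data,
                          components=10,
                          max_iter=200,
                          batch_size=128,
                          initialization_method="random_choice")

print(result["clusters_center"].shape)  # (10, 8): one centroid per component
print(result["losses"][-1])             # loss recorded at the last iteration
print(result["affectations"][:10])      # hard cluster assignments (argmax of the memberships)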