Example #1
def get_objective_value(X_data, op_centroids, indicator_vector):
    logger.info("Compute objective")
    if paraman["--minibatch"]:
        final_objective_value = compute_objective_by_batch(X_data, op_centroids, indicator_vector, paraman["--minibatch"])
    else:
        final_objective_value = compute_objective(X_data, op_centroids, indicator_vector)

    resprinter.add({
        "final_objective_value": final_objective_value,
    })
    return final_objective_value
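The helpers compute_objective and compute_objective_by_batch are project-internal and not shown in this example. As a rough, self-contained sketch of what the full-data branch evaluates (an assumption about compute_objective, not the project's exact implementation), the k-means criterion for dense centroids is the sum of squared distances from every example to its assigned centroid:

import numpy as np

def objective_value_dense(X_data, centroids, indicator_vector):
    # hypothetical stand-in for compute_objective: sum of squared distances
    # of every example to the centroid it is assigned to
    assigned_centroids = centroids[indicator_vector]
    return np.sum((X_data - assigned_centroids) ** 2)

X_data = np.array([[0., 0.], [1., 0.], [10., 0.]])
centroids = np.array([[0.5, 0.], [10., 0.]])
indicator_vector = np.array([0, 0, 1])
print(objective_value_dense(X_data, centroids, indicator_vector))  # 0.5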
Example #2
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           delta_objective_error_threshold=1e-6,
           hierarchical_init=False):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold:
    :param hierarchical_init: Tells if the algorithm should make the initialization of sparse factors with the hierarchical version of palm or not.
    :return:
    """
    assert K_nb_cluster == initialization.shape[0], "The number of clusters {} is not equal to the number of centroids in the initialization {}.".format(K_nb_cluster, initialization.shape[0])

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    nb_examples = X_data.shape[0]

    logger.info("Initializing Qmeans")

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa["delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    X_centroids_hat = copy.deepcopy(initialization)

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1], nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, U_centroids, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, U_centroids, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    lst_factors = None  # drop the stale reference: the factors now live in op_factors

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        lst_factors_ = op_factors.get_list_of_factors()
        # the centroid operator drops the first (diagonal count) factor and
        # absorbs _lambda into the following one
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

        ###########################
        # Cluster assignment step #
        ###########################

        indicator_vector, distances = assign_points_to_clusters(X_data, op_centroids, X_norms=X_data_norms)



        #######################
        # Cluster update step #
        #######################

        # get the number of observations in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update the centroid locations from the newly assigned data point classes
        # (this happens inside update_clusters_with_integrity_check), check that all
        # clusters still have points, and modify X_centroids_hat in place when a
        # cluster has lost all its points (it is refilled from the biggest cluster)
        counts, cluster_names_sorted = update_clusters_with_integrity_check(X_data,
                                                                            X_data_norms,
                                                                            X_centroids_hat, # in place changes
                                                                            K_nb_cluster,
                                                                            counts,
                                                                            indicator_vector,
                                                                            distances,
                                                                            cluster_names,
                                                                            cluster_names_sorted)

        #################
        # PALM4MSA step #
        #################

        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(counts[cluster_names_sorted] / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(counts[cluster_names_sorted])

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)


        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)

        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                         lst_S_init=op_factors.get_list_of_factors(),
                         nb_factors=op_factors.n_factors,
                         lst_projection_functions=lst_proj_op_by_fac_step[-1][
                             "finetune"],
                         f_lambda_init=_lambda * np.sqrt(nb_examples),
                         nb_iter=nb_iter_palm,
                         update_right_to_left=True,
                         track_objective=track_objective_palm,
                         delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        lst_all_objective_functions_palm.append(objective_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        objective_function[i_iter] = compute_objective(X_data, op_centroids, indicator_vector)
        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] - objective_function[i_iter-1]) / objective_function[i_iter-1]

        # TODO: check that the error stays below the threshold several times in a row before stopping

        i_iter += 1

    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, indicator_vector, lst_all_objective_functions_palm
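In the PALM4MSA step above, the factorization target is diag(sqrt(n_k)) @ X_centroids_hat rather than the centroid matrix itself, with the normalized diagonal of counts locked in as the first factor. The motivation is standard k-means algebra: with the assignment fixed, the clustering objective equals the within-cluster scatter (a constant with respect to the centroids) plus the squared Frobenius error of the count-weighted centroid matrix, so minimizing the weighted factorization error also minimizes the clustering objective. A minimal, self-contained numpy check of that identity (independent of the qkmeans helpers):

import numpy as np

rng = np.random.default_rng(0)
n, d, K = 200, 5, 4
X = rng.normal(size=(n, d))
assignment = rng.integers(0, K, size=n)     # fixed, arbitrary assignment
U = rng.normal(size=(K, d))                 # arbitrary centroid candidates

counts = np.bincount(assignment, minlength=K)
means = np.vstack([X[assignment == k].mean(axis=0) for k in range(K)])

kmeans_objective = np.sum((X - U[assignment]) ** 2)
within_scatter = np.sum((X - means[assignment]) ** 2)   # constant w.r.t. U
weighted_factorization_error = np.sum((np.sqrt(counts)[:, None] * (means - U)) ** 2)

assert np.isclose(kmeans_objective, within_scatter + weighted_factorization_error)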
Example #3
def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization,
                     batch_size):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :return:
    """

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize the centroids from the provided initialization (typically random data points)
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat
    full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)
    full_count_vector = np.zeros(K_nb_cluster, dtype=int)
    objective_function = np.empty((nb_iter, ))

    # Loop over the data in batches until convergence
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while True:
        for i_iter, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            if not (delta_objective_error > delta_objective_error_threshold):
                logger.info(
                    "not (delta_objective_error {}-{}={} > delta_objective_error_threshold {})"
                    .format(objective_function[i_iter],
                            objective_function[i_iter - 1],
                            delta_objective_error,
                            delta_objective_error_threshold))
                break

            example_batch = X_data[example_batch_indexes]

            logger.info("Iteration Kmeans {}".format(i_iter))

            indicator_vector, distances = assign_points_to_clusters(
                example_batch,
                U_centroids,
                X_norms=X_data_norms[example_batch_indexes])
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            # cluster_names_sorted = np.argsort(cluster_names)
            #
            count_vector = np.zeros(K_nb_cluster, dtype=int)
            count_vector[cluster_names] = counts

            full_count_vector += count_vector
            # previous_full_count_vector = full_count_vector - count_vector

            # Update centroid locations using the newly assigned data point classes.
            # This per-centroid way of updating is better than the one proposed in the
            # paper "Web-Scale K-Means Clustering" because the number of updates will
            # always be <= batch_size.
            for c in range(K_nb_cluster):
                if full_count_vector[c] != 0 and count_vector[c] != 0:
                    U_centroids_hat[c] += (1 / full_count_vector[c]) * np.sum(
                        example_batch[indicator_vector == c] -
                        U_centroids_hat[c],
                        axis=0)
                    # this is exactly equivalent to an update of the mean:
                    # U_centroids_hat[c] = (previous_full_count_vector[c] / full_count_vector[c]) * U_centroids_hat[c] + (1 / full_count_vector[c]) * np.sum(example_batch[indicator_vector == c], axis=0)

            # for i_ex, ex in enumerate(example_batch):
            #     c = indicator_vector[i_ex]
            #     full_count_vector[c] += 1
            #     eta = 1./full_count_vector[c]
            #     U_centroids_hat[c] = (1-eta) * U_centroids_hat[c] + eta * ex

            # counts, cluster_names_sorted = assess_clusters_integrity(X_data,
            #                                                          X_data_norms,
            #                                                          U_centroids_hat,
            #                                                          K_nb_cluster,
            #                                                          counts,
            #                                                          indicator_vector,
            #                                                          distances,
            #                                                          cluster_names,
            #                                                          cluster_names_sorted)

            # check if all clusters still have points
            # for c in range(K_nb_cluster):
            #     biggest_cluster_index = np.argmax(counts)  # type: int
            #     biggest_cluster = cluster_names[biggest_cluster_index]
            #     biggest_cluster_data = X_data[indicator_vector == biggest_cluster]
            #
            #     cluster_data = X_data[indicator_vector == c]
            #     if len(cluster_data) == 0:
            #         logger.warning("cluster has lost data, add new cluster. cluster idx: {}".format(c))
            #         U_centroids_hat[c] = biggest_cluster_data[np.random.randint(len(biggest_cluster_data))].reshape(1, -1)
            #         counts = list(counts)
            #         counts[biggest_cluster_index] -= 1
            #         counts.append(1)
            #         counts = np.array(counts)
            #         cluster_names_sorted = list(cluster_names_sorted)
            #         cluster_names_sorted.append(c)
            #         cluster_names_sorted = np.array(cluster_names_sorted)
            #     else:
            #         U_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

            U_centroids = U_centroids_hat

            objective_function[i_iter] = compute_objective(
                X_data, U_centroids, full_indicator_vector)

            if i_iter >= 1:
                # TODO: check that the error stays below the threshold
                #  several times in a row before stopping
                delta_objective_error = np.abs(
                    objective_function[i_iter] - objective_function[i_iter - 1]
                ) / objective_function[i_iter - 1]

            i_iter += 1
        else:
            continue
        break

    return objective_function[:i_iter], U_centroids, indicator_vector
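The centroid update inside the batch loop is an incremental mean: each centroid moves toward the batch points assigned to it with a step of 1 / full_count_vector[c], which (as the commented line notes) is equivalent to keeping a running mean of all points seen so far. A minimal, self-contained numpy check of that equivalence, with a single stream of points fed batch by batch:

import numpy as np

rng = np.random.default_rng(1)
stream = rng.normal(size=(1000, 3))

count = 0
centroid = np.zeros(3)
for batch in np.array_split(stream, 10):
    count += len(batch)
    # same rule as above: move the centroid by the mean residual of the batch
    centroid += (1.0 / count) * np.sum(batch - centroid, axis=0)

assert np.allclose(centroid, stream.mean(axis=0))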
Example #4
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param graphical_display: Tell the algorithm to display the results.
    :return:
    """

    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]

    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
        arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=init_lambda * eye_norm,
        nb_iter=nb_iter_palm,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0],
                        marker="x",
                        label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1],
                        marker="x",
                        label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2],
                        marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(),
                     color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init,
            lst_factors, _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter,
                               0] = compute_objective(X_data, U_centroids,
                                                      indicator_vector)

        # Assign all points to the nearest centroid
        # first get distance from all points to all centroids
        distances = get_distances(X_data,
                                  U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then, Determine class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter,
                           1] = compute_objective(X_data, U_centroids,
                                                  indicator_vector)

        # Update centroid location using the newly
        # assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # todo use a sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt
        )  # todo use the analytic value sqrt(n) instead of computing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as first factor
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                # f_lambda_init=_lambda,
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]

        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]
                ["finetune"],
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0],
                            marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1],
                            marker="x",
                            label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2],
                            marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(),
                         color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            # TODO: check that the error stays below the threshold
            #  several times in a row before stopping
            delta_objective_error = np.abs(
                objective_function[i_iter, 0] -
                objective_function[i_iter - 1, 0]
            ) / objective_function[i_iter - 1, 0]

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data,
                              U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
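About the "todo analytic sqrt(n)" comment in the loop above: the Frobenius norm of diag(sqrt(n_k)) is sqrt(sum_k n_k) = sqrt(n), so diag_counts_sqrt_norm can be replaced by np.sqrt(X_data.shape[0]), which is what the newer qmeans of Example #2 does with np.sqrt(nb_examples). A quick numpy check:

import numpy as np

counts = np.array([10, 3, 7])            # cluster sizes, n = 20 points in total
diag_counts_sqrt = np.diag(np.sqrt(counts))
# Frobenius norm of diag(sqrt(n_k)) equals sqrt(sum_k n_k) = sqrt(n)
assert np.isclose(np.linalg.norm(diag_counts_sqrt), np.sqrt(counts.sum()))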
Example #5
File: kmeans.py  Project: lucgiffon/qkmeans
def kmeans(X_data,
           K_nb_cluster,
           nb_iter,
           initialization,
           delta_objective_error_threshold=1e-6,
           proj_l1=False,
           _lambda=None,
           epsilon=None):
    """

    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param delta_objective_error_threshold: The normalized difference between the error criterion at 2 successive step must be greater or equal to that value.
    :return:
    """

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize the centroids from the provided initialization (typically random data points)
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat

    objective_function = np.empty((nb_iter, ))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter == 0) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Kmeans {}".format(i_iter))

        indicator_vector, distances = assign_points_to_clusters(
            X_data, U_centroids, X_norms=X_data_norms)

        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid location using the new indicator vector
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data, X_data_norms, U_centroids_hat, K_nb_cluster, counts,
            indicator_vector, distances, cluster_names, cluster_names_sorted)

        U_centroids = U_centroids_hat

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter, ] = compute_objective(
            X_data, U_centroids, indicator_vector)

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter] -
                                           objective_function[i_iter - 1]
                                           ) / objective_function[i_iter - 1]

        i_iter += 1

    return objective_function[:i_iter], U_centroids, indicator_vector
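The kmeans function above relies on project helpers (assign_points_to_clusters, update_clusters_with_integrity_check, compute_objective, proj_onto_l1_ball) defined elsewhere in the qkmeans project. For readers who want something runnable without those imports, here is a plain-numpy sketch of the same Lloyd iteration and stopping rule; it is an assumption-level rewrite, not the project's implementation (no empty-cluster repair, no l1 projection):

import numpy as np

def lloyd_kmeans_sketch(X_data, init_centroids, nb_iter=100,
                        delta_objective_error_threshold=1e-6):
    centroids = init_centroids.copy()
    objective = []
    for _ in range(nb_iter):
        # assignment step: squared distance of every point to every centroid
        distances = ((X_data[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
        indicator_vector = distances.argmin(axis=1)
        # update step: each centroid becomes the mean of its assigned points
        for k in range(centroids.shape[0]):
            if np.any(indicator_vector == k):
                centroids[k] = X_data[indicator_vector == k].mean(axis=0)
        objective.append(distances[np.arange(len(X_data)), indicator_vector].sum())
        if len(objective) >= 2:
            delta = abs(objective[-1] - objective[-2]) / objective[-2]
            if delta <= delta_objective_error_threshold:
                break
    return np.array(objective), centroids, indicator_vector

rng = np.random.default_rng(2)
X_data = rng.normal(size=(300, 2))
initialization = X_data[rng.choice(len(X_data), size=3, replace=False)].copy()
objective, centroids, labels = lloyd_kmeans_sketch(X_data, initialization)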