def make_batch_assignation_evaluation(X, centroids):
    """
    Assign `size_batch` random samples of `X` to some of the centroids.
    All the samples are assigned at the same time using a matrix-vector multiplication.
    Time is recorded.

    :param X: The input data from which to take the samples.
    :param centroids: The centroids to which to assign the samples (must be of same dimension than `X`)
    :param size_batch: The number of data points to assign

    :return: None
    """
    size_batch = paraman["--batch-assignation-time"]
    if size_batch > X.shape[0]:
        logger.warning(
            "Batch size for batch assignation evaluation is bigger than the data size. {} > {}. Using "
            "the data size instead.".format(size_batch, X.shape[0]))
        size_batch = X.shape[0]
        paraman["--batch-assignation-time"] = size_batch

    # precomputed_centroid_norms = get_squared_froebenius_norm(centroids)
    precomputed_centroid_norms = None
    indexes_batch = np.random.permutation(X.shape[0])[:size_batch]
    start_time = time.time()
    get_distances(X[indexes_batch],
                  centroids,
                  precomputed_centroids_norm=precomputed_centroid_norms)
    stop_time = time.time()

    resprinter.add({
        "batch_assignation_mean_time": (stop_time - start_time) / size_batch,
    })
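# A minimal sketch of what `get_distances` presumably computes (an assumption:
# squared Euclidean distances via the expansion ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2,
# so the whole batch is handled by one matrix product, as the docstring above
# describes). The real `get_distances` is not part of this listing.
import numpy as np

def sketch_get_distances(batch, centroids, precomputed_centroids_norm=None):
    if precomputed_centroids_norm is None:
        precomputed_centroids_norm = np.sum(centroids ** 2, axis=1)
    batch_norms = np.sum(batch ** 2, axis=1)
    cross_terms = batch @ centroids.T  # the single matrix-matrix product
    # shape (batch_size, nb_centroids): one row of distances per sample
    return batch_norms[:, None] - 2 * cross_terms + precomputed_centroids_norm[None, :]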
# Example #2
def make_assignation_evaluation(X, centroids):
    nb_eval = paraman["--assignation-time"]
    if nb_eval > X.shape[0]:
        logger.warning(
            "Number of evaluations for assignation evaluation is bigger than the data size. {} > {}. Using "
            "the data size instead.".format(nb_eval, X.shape[0]))
        nb_eval = X.shape[0]
        paraman["--assignation-time"] = nb_eval

    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1),
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor?=.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]
        log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")
        precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
        # precomputed_centroid_norms = None
        start_inference_time = time.process_time()
        distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.process_time()
        get_distance_time = stop_get_distances_time - start_inference_time
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.process_time()
        log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time
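# A hedged sketch of the partitioned 1-NN scheme above on toy data: a brute-force
# 1-NN classifier is fitted per cluster, and at test time only the classifier of
# the nearest centroid is queried. All names and data below are illustrative.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.RandomState(0)
x_train = rng.randn(200, 5)
y_train = rng.randint(0, 2, size=200)
# toy stand-in for U_centroids: pick 4 training points, so no cluster is empty
centroids = x_train[rng.choice(200, size=4, replace=False)]
indicator_vector = np.argmin(
    ((x_train[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2), axis=1)
clfs = [KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
            x_train[indicator_vector == i], y_train[indicator_vector == i])
        for i in range(centroids.shape[0])]
x_query = rng.randn(1, 5)
idx_cluster = np.argmin(((x_query - centroids) ** 2).sum(axis=1))
label = clfs[idx_cluster].predict(x_query)[0]  # only one sub-classifier is queried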
# Example #5
    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor?=.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        start_inference_time = time.time()
        distances = get_distances(x_test, U_centroids)
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(
                obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time
    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor?=.

        :return:
        """
        # for each cluster, there is a sub nearest neighbor classifier for points in that cluster.
        lst_clf_by_cluster = []
        indices_no_train_obs_in_cluster = []
        for i in range(U_centroids.shape[0]):
            try:
                lst_clf_by_cluster.append(
                    KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                        x_train[indicator_vector == i],
                        y_train[indicator_vector == i]))
            except ValueError:
                indices_no_train_obs_in_cluster.append(i)
                lst_clf_by_cluster.append(None)

        log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")
        precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
        # precomputed_centroid_norms = None
        start_inference_time = time.process_time()
        distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.process_time()
        get_distance_time = stop_get_distances_time - start_inference_time
        if len(indices_no_train_obs_in_cluster):
            distances[:, np.array(indices_no_train_obs_in_cluster)] = np.inf
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which belongs this data point and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.process_time()
        log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
        inference_time = (stop_inference_time - start_inference_time)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy = f1
        else:
            accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time
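# The F1 score above is assembled by hand from binary predictions; a quick
# cross-check against scikit-learn (an external sanity check, not part of the
# original code) would look like this:
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([1, 0, 1, 1, 0, 1])
y_pred = np.array([1, 1, 0, 1, 0, 1])
recall = np.sum(y_pred[y_true == 1]) / np.sum(y_true[y_true == 1])     # 3/4
precision = np.sum(y_pred[y_true == 1]) / np.sum(y_pred[y_pred == 1])  # 3/4
f1_manual = 2 * precision * recall / (precision + recall)
assert np.isclose(f1_manual, f1_score(y_true, y_pred))  # both give 0.75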
def make_assignation_evaluation(X, centroids):
    nb_eval = 100
    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1),
                      centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
# Example #8
        if paraman["kmeans"]:
            U_final, indicator_vector_final = main_kmeans(
                dataset["x_train"], U_init)

            log_memory_usage("Memory after kmeans")

            dct_nb_param = {"nb_param_centroids": U_final.size}
            if paraman["palm"]:
                if paraman["--nb-factors"] is None:
                    paraman["--nb-factors"] = int(np.log2(min(U_init.shape)))
                paraman["--residual-on-right"] = True if U_init.shape[
                    1] >= U_init.shape[0] else False

                U_final = process_palm_on_top_of_kmeans(U_final)
                distances = get_distances(dataset["x_train"], U_final)
                indicator_vector_final = np.argmin(distances, axis=1)
                dct_nb_param = {"nb_param_centroids": U_final.get_nb_param()}

        elif paraman["qmeans"]:
            # paraman_q = ParameterManagerQmeans(arguments)
            # paraman.update(paraman_q)
            if paraman["--nb-factors"] is None:
                paraman["--nb-factors"] = int(np.log2(min(U_init.shape)))
            paraman["--residual-on-right"] = True if U_init.shape[
                1] >= U_init.shape[0] else False
            U_final, indicator_vector_final = main_qmeans(
                dataset["x_train"], U_init)

            log_memory_usage("Memory after qmeans")
# Example #9
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimensions d in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iteration.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids not yet factorized.
    :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used.
    :param graphical_display: Tell the algorithm to display the results.
    :return:
    """

    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]

    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))
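    # Shape annotation (not in the original code): lst_factors[0] is (K, K) and
    # later holds the diagonal count matrix; lst_factors[1] is (K, min(K, d));
    # the middle factors are (min(K, d), min(K, d)) identities; the last factor,
    # (min(K, d), d), starts at zero. Their product is a (K, d) centroid matrix.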

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
        arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=init_lambda * eye_norm,
        nb_iter=nb_iter_palm,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0],
                        marker="x",
                        label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1],
                        marker="x",
                        label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2],
                        marker="x",
                        label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(),
                     color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(
            np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init,
            lst_factors, _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or (
        (i_iter < nb_iter) and
        (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter, 0] = compute_objective(
                X_data, U_centroids, indicator_vector)

        # Assign all points to the nearest centroid
        # first get distance from all points to all centroids
        distances = get_distances(X_data,
                                  U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # then determine the class membership of each point
        # by picking the closest centroid
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter, 1] = compute_objective(
            X_data, U_centroids, indicator_vector)

        # Update centroid location using the newly
        # assigned data point classes
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # get the number of observation in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError(
                "Some clusters have no point. Aborting iteration {}".format(
                    i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(
            counts[cluster_names_sorted]))  # TODO: use a sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(
            diag_counts_sqrt
        )  # TODO: this is analytically sqrt(n); no need to compute it with a norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # set it as first factor
        lst_factors[0] = diag_counts_sqrt_normalized
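        # Annotation (an interpretation of the code above, not an original
        # comment): weighting row c of the centroid matrix by sqrt(n_c) makes
        # the factorization error of diag_counts_sqrt @ X_centroids_hat match
        # the k-means objective, where a centroid with many assigned points
        # counts more. The diagonal is normalized so the scale stays in _lambda.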

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                # f_lambda_init=_lambda,
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                graphical_display=False)

            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]

        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa(
                arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                f_lambda_init=_lambda * diag_counts_sqrt_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                graphical_display=False)

            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0],
                            marker="x",
                            label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1],
                            marker="x",
                            label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2],
                            marker="x",
                            label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(),
                         color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(
            objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(
                objective_function[i_iter, 0] - objective_function[i_iter - 1, 0]
            ) / objective_function[i_iter - 1, 0]
            # TODO: check that the absolute error is smaller than the threshold several times in a row

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data,
                              U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
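# Hedged usage sketch (illustrative values; `build_constraint_sets` is a
# hypothetical helper standing in for the real constraint-set construction,
# which is not shown in this listing):
#
#     K, d, nb_factors = 16, 64, 4
#     params_palm4msa = {
#         "init_lambda": 1.0,
#         "nb_iter": 300,
#         "lst_constraint_sets": build_constraint_sets(K, d, nb_factors),
#         "residual_on_right": True,
#     }
#     U_init = X_data[np.random.choice(X_data.shape[0], K, replace=False)]
#     objectives, U_centroids, labels = qmeans(
#         X_data, K_nb_cluster=K, nb_iter=20, nb_factors=nb_factors,
#         params_palm4msa=params_palm4msa, initialization=U_init)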