def make_batch_assignation_evaluation(X, centroids):
    """
    Assign a batch of random samples of `X` to the centroids.

    All the samples are assigned at the same time using a single matrix
    multiplication. The elapsed time is recorded. The batch size is read
    from `paraman["--batch-assignation-time"]`.

    :param X: The input data from which to take the samples.
    :param centroids: The centroids to which to assign the samples (must
        have the same dimension as `X`).

    :return: None
    """
    size_batch = paraman["--batch-assignation-time"]
    if size_batch > X.shape[0]:
        logger.warning("Batch size for batch assignation evaluation is bigger than data size. {} > {}. "
                       "Using data size instead.".format(size_batch, X.shape[0]))
        size_batch = X.shape[0]
        paraman["--batch-assignation-time"] = size_batch

    # precomputed_centroid_norms = get_squared_froebenius_norm(centroids)
    precomputed_centroid_norms = None
    indexes_batch = np.random.permutation(X.shape[0])[:size_batch]
    start_time = time.time()
    get_distances(X[indexes_batch], centroids,
                  precomputed_centroids_norm=precomputed_centroid_norms)
    stop_time = time.time()

    resprinter.add({
        "batch_assignation_mean_time": (stop_time - start_time) / size_batch,
    })
def make_assignation_evaluation(X, centroids):
    """
    Assign random samples of `X` to the centroids, one sample at a time,
    and record the mean and standard deviation of the assignation time.
    The number of evaluations is read from `paraman["--assignation-time"]`.
    """
    nb_eval = paraman["--assignation-time"]
    if nb_eval > X.shape[0]:
        logger.warning("Number of evaluations for assignation evaluation is bigger than data size. {} > {}. "
                       "Using data size instead.".format(nb_eval, X.shape[0]))
        nb_eval = X.shape[0]
        paraman["--assignation-time"] = nb_eval

    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1), centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
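# A minimal sketch (not the repository's actual implementation) of the distance
# computation that `get_distances` is assumed to perform: the standard expansion
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2. It shows why the squared centroid
# norms can be precomputed once and reused across calls, and why a whole batch
# is assigned with a single matrix multiplication.
def squared_distances_sketch(X, centroids, centroid_norms=None):
    if centroid_norms is None:
        centroid_norms = np.sum(centroids ** 2, axis=1)  # shape (K,), reusable across calls
    x_norms = np.sum(X ** 2, axis=1)                     # shape (n,)
    # (n, K) squared distances obtained with one matrix multiplication X @ centroids.T
    return x_norms[:, None] - 2 * X @ centroids.T + centroid_norms[None, :]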
def kmean_tree_evaluation():
    """
    Run the K-means partitioning version of nearest neighbor classification.

    :return: The total inference time.
    """
    # For each cluster, build a sub nearest neighbor classifier on the points of that cluster.
    lst_clf_by_cluster = [KNeighborsClassifier(n_neighbors=1, algorithm="brute")
                          .fit(x_train[indicator_vector == i], y_train[indicator_vector == i])
                          for i in range(U_centroids.shape[0])]
    log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")

    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
    # precomputed_centroid_norms = None
    start_inference_time = time.process_time()
    distances = get_distances(x_test, U_centroids,
                              precomputed_centroids_norm=precomputed_centroid_norms)
    stop_get_distances_time = time.process_time()
    get_distance_time = stop_get_distances_time - start_inference_time
    indicator_vector_test = np.argmin(distances, axis=1)

    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        # Get the cluster this data point belongs to and query the associated nearest neighbor classifier.
        idx_cluster = indicator_vector_test[obs_idx]
        clf_cluster = lst_clf_by_cluster[idx_cluster]
        predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.process_time()
    log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
    inference_time = (stop_inference_time - start_inference_time)

    accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_kmean_inference_time": inference_time,
        "1nn_get_distance_time": get_distance_time / x_test.shape[0],
        "1nn_kmean_accuracy": accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
def kmean_tree_evaluation():
    """
    Run the K-means partitioning version of nearest neighbor classification.

    :return: The total inference time.
    """
    # For each cluster, build a sub nearest neighbor classifier on the points of that cluster.
    lst_clf_by_cluster = [
        KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
            x_train[indicator_vector == i], y_train[indicator_vector == i])
        for i in range(U_centroids.shape[0])
    ]

    start_inference_time = time.time()
    distances = get_distances(x_test, U_centroids)
    indicator_vector_test = np.argmin(distances, axis=1)
    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        # Get the cluster this data point belongs to and query the associated nearest neighbor classifier.
        idx_cluster = indicator_vector_test[obs_idx]
        clf_cluster = lst_clf_by_cluster[idx_cluster]
        predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.time()
    inference_time = (stop_inference_time - start_inference_time)

    accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_kmean_inference_time": inference_time,
        "1nn_kmean_accuracy": accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
def kmean_tree_evaluation():
    """
    Run the K-means partitioning version of nearest neighbor classification.

    :return: The total inference time.
    """
    # For each cluster, build a sub nearest neighbor classifier on the points of that cluster.
    # Clusters that received no training point get a None placeholder and are later
    # excluded from the assignation by setting their distance to infinity.
    lst_clf_by_cluster = []
    indices_no_train_obs_in_cluster = []
    for i in range(U_centroids.shape[0]):
        try:
            lst_clf_by_cluster.append(
                KNeighborsClassifier(n_neighbors=1, algorithm="brute")
                .fit(x_train[indicator_vector == i], y_train[indicator_vector == i]))
        except ValueError:
            indices_no_train_obs_in_cluster.append(i)
            lst_clf_by_cluster.append(None)

    log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")

    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
    # precomputed_centroid_norms = None
    start_inference_time = time.process_time()
    distances = get_distances(x_test, U_centroids,
                              precomputed_centroids_norm=precomputed_centroid_norms)
    stop_get_distances_time = time.process_time()
    get_distance_time = stop_get_distances_time - start_inference_time

    if len(indices_no_train_obs_in_cluster):
        distances[:, np.array(indices_no_train_obs_in_cluster)] = np.inf

    indicator_vector_test = np.argmin(distances, axis=1)

    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        # Get the cluster this data point belongs to and query the associated nearest neighbor classifier.
        idx_cluster = indicator_vector_test[obs_idx]
        clf_cluster = lst_clf_by_cluster[idx_cluster]
        predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.process_time()
    log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")
    inference_time = (stop_inference_time - start_inference_time)

    if paraman["--kddcup04"]:
        # For the kddcup04 task, report the F1 score instead of raw accuracy.
        # compute recall: nb_true_positive / real_nb_positive
        recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
        # compute precision: nb_true_positive / nb_predicted_positive
        precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
        f1 = 2 * precision * recall / (precision + recall)
        accuracy = f1
    else:
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_kmean_inference_time": inference_time,
        "1nn_get_distance_time": get_distance_time / x_test.shape[0],
        "1nn_kmean_accuracy": accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
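# Sanity check (hypothetical example, labels assumed to be in {0, 1}): the manual
# precision/recall/F1 computation above matches scikit-learn's f1_score, which is
# already a dependency through KNeighborsClassifier.
def _f1_sanity_check():
    from sklearn.metrics import f1_score
    y_true = np.array([1, 0, 1, 1, 0])
    y_pred = np.array([1, 0, 0, 1, 1])
    recall = np.sum(y_pred[y_true == 1]) / np.sum(y_true[y_true == 1])     # 2/3
    precision = np.sum(y_pred[y_true == 1]) / np.sum(y_pred[y_pred == 1])  # 2/3
    manual_f1 = 2 * precision * recall / (precision + recall)              # 2/3
    assert np.isclose(manual_f1, f1_score(y_true, y_pred))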
def make_assignation_evaluation(X, centroids):
    nb_eval = 100
    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1), centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
if paraman["kmeans"]:
    U_final, indicator_vector_final = main_kmeans(dataset["x_train"], U_init)
    log_memory_usage("Memory after kmeans")
    dct_nb_param = {"nb_param_centroids": U_final.size}

    if paraman["palm"]:
        if paraman["--nb-factors"] is None:
            paraman["--nb-factors"] = int(np.log2(min(U_init.shape)))
        paraman["--residual-on-right"] = U_init.shape[1] >= U_init.shape[0]
        U_final = process_palm_on_top_of_kmeans(U_final)
        distances = get_distances(dataset["x_train"], U_final)
        indicator_vector_final = np.argmin(distances, axis=1)
        dct_nb_param = {"nb_param_centroids": U_final.get_nb_param()}

elif paraman["qmeans"]:
    # paraman_q = ParameterManagerQmeans(arguments)
    # paraman.update(paraman_q)
    if paraman["--nb-factors"] is None:
        paraman["--nb-factors"] = int(np.log2(min(U_init.shape)))
    paraman["--residual-on-right"] = U_init.shape[1] >= U_init.shape[0]
    U_final, indicator_vector_final = main_qmeans(dataset["x_train"], U_init)
    log_memory_usage("Memory after qmeans")
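# Worked example (hypothetical shape, for intuition only) of how the defaults
# above resolve: with U_init of shape (256, 784), i.e. K = 256 centroids in
# dimension 784,
#   nb_factors        = int(np.log2(min((256, 784)))) = int(np.log2(256)) = 8
#   residual_on_right = 784 >= 256  -> True (the residual is kept on the right)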
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimension d, of shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param hierarchical_inside: Tell the algorithm whether the hierarchical version of palm4msa should be used.
    :param graphical_display: Tell the algorithm to display the results.

    :return: The objective function values, the final centroids and the indicator vector.
    """
    assert K_nb_cluster == initialization.shape[0]

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]

    X_centroids_hat = copy.deepcopy(initialization)
    min_K_d = min(X_centroids_hat.shape)

    lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)]
    eye_norm = np.sqrt(K_nb_cluster)
    lst_factors[0] = np.eye(K_nb_cluster) / eye_norm
    lst_factors[1] = np.eye(K_nb_cluster, min_K_d)
    lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1]))

    if graphical_display:
        lst_factors_init = copy.deepcopy(lst_factors)

    _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = \
        hierarchical_palm4msa(
            arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
            lst_S_init=lst_factors,
            lst_dct_projection_function=lst_proj_op_by_fac_step,
            f_lambda_init=init_lambda * eye_norm,
            nb_iter=nb_iter_palm,
            update_right_to_left=True,
            residual_on_right=residual_on_right,
            graphical_display=False)

    _lambda = _lambda_tmp / eye_norm

    if graphical_display:
        if hierarchical_inside:
            plt.figure()
            plt.yscale("log")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                        objective_palm[:, 0], marker="x", label="before split")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                        objective_palm[:, 1], marker="x", label="between")
            plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                        objective_palm[:, 2], marker="x", label="after finetune")
            plt.plot(np.arange(len(objective_palm) * 3),
                     objective_palm.flatten(), color="k")
            plt.legend()
            plt.show()

        visual_evaluation_palm4msa(np.eye(K_nb_cluster) @ X_centroids_hat,
                                   lst_factors_init, lst_factors,
                                   _lambda * multi_dot(lst_factors))

    objective_function = np.empty((nb_iter, 2))

    # Loop for the maximum number of iterations.
    i_iter = 0
    delta_objective_error_threshold = 1e-6
    delta_objective_error = np.inf
    while (i_iter <= 1) or ((i_iter < nb_iter) and
                            (delta_objective_error > delta_objective_error_threshold)):
        logger.info("Iteration Qmeans {}".format(i_iter))

        U_centroids = _lambda * multi_dot(lst_factors[1:])

        if i_iter > 0:
            objective_function[i_iter, 0] = compute_objective(X_data, U_centroids, indicator_vector)

        # Assign all points to the nearest centroid:
        # first get the distance from all points to all centroids...
        distances = get_distances(X_data, U_centroids,
                                  precomputed_data_points_norm=X_data_norms)
        # ...then determine class membership of each point by picking the closest centroid.
        indicator_vector = np.argmin(distances, axis=1)

        objective_function[i_iter, 1] = compute_objective(X_data, U_centroids, indicator_vector)

        # Update each centroid location using the newly assigned data point classes.
        for c in range(K_nb_cluster):
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0)

        # Get the number of observations in each cluster.
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        if len(counts) < K_nb_cluster:
            raise ValueError("Some clusters have no point. Aborting iteration {}".format(i_iter))

        diag_counts_sqrt = np.diag(np.sqrt(counts[cluster_names_sorted]))  # todo use sparse matrix object
        diag_counts_sqrt_norm = np.linalg.norm(diag_counts_sqrt)  # todo analytic sqrt(n) instead of computing it with norm
        diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm
        # Set it as the first factor.
        lst_factors[0] = diag_counts_sqrt_normalized

        if graphical_display:
            lst_factors_init = copy.deepcopy(lst_factors)

        if hierarchical_inside:
            _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    # f_lambda_init=_lambda,
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    graphical_display=False)
            loss_palm_before = objective_palm[0, 0]
            loss_palm_after = objective_palm[-1, -1]
        else:
            _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt @ X_centroids_hat,
                    lst_S_init=lst_factors,
                    nb_factors=len(lst_factors),
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * diag_counts_sqrt_norm,
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    graphical_display=False)
            loss_palm_before = objective_palm[0, -1]
            loss_palm_after = objective_palm[-1, -1]

        logger.debug("Loss palm before: {}".format(loss_palm_before))
        logger.debug("Loss palm after: {}".format(loss_palm_after))

        if graphical_display:
            if hierarchical_inside:
                plt.figure()
                plt.yscale("log")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3),
                            objective_palm[:, 0], marker="x", label="before split")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1,
                            objective_palm[:, 1], marker="x", label="between")
                plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2,
                            objective_palm[:, 2], marker="x", label="after finetune")
                plt.plot(np.arange(len(objective_palm) * 3),
                         objective_palm.flatten(), color="k")
                plt.legend()
                plt.show()

            visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat,
                                       lst_factors_init, lst_factors,
                                       _lambda_tmp * multi_dot(lst_factors))

        _lambda = _lambda_tmp / diag_counts_sqrt_norm

        logger.debug("Returned loss (with diag) palm: {}".format(objective_palm[-1, 0]))

        if i_iter >= 2:
            delta_objective_error = np.abs(objective_function[i_iter, 0]
                                           - objective_function[i_iter - 1, 0]) \
                                    / objective_function[i_iter - 1, 0]
            # todo: check that the absolute error is smaller than the threshold several times in a row

        i_iter += 1

    U_centroids = _lambda * multi_dot(lst_factors[1:])
    distances = get_distances(X_data, U_centroids,
                              precomputed_data_points_norm=X_data_norms)
    indicator_vector = np.argmin(distances, axis=1)

    return objective_function[:i_iter], U_centroids, indicator_vector
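# A minimal usage sketch for `qmeans`, with hypothetical data and shapes. The
# structure of `lst_constraint_sets` is repo-specific (one dict per
# factorization step, each with at least a "finetune" entry of projection
# functions, as used above), so it is taken as a parameter here rather than
# invented.
def _qmeans_usage_sketch(lst_constraint_sets):
    X = np.random.randn(1000, 64)                            # hypothetical: 1000 points in dimension 64
    K = 16                                                   # hypothetical number of clusters
    U_init = X[np.random.choice(len(X), K, replace=False)]   # centroids initialized on data points
    nb_factors = int(np.log2(min(U_init.shape)))             # log2(16) = 4 sparse factors

    params_palm4msa = {
        "init_lambda": 1.0,                                  # hypothetical value
        "nb_iter": 300,                                      # hypothetical value
        "lst_constraint_sets": lst_constraint_sets,          # repo-specific, see note above
        "residual_on_right": U_init.shape[1] >= U_init.shape[0],
    }

    objective, U_centroids, indicator_vector = qmeans(
        X, K, nb_iter=20, nb_factors=nb_factors,
        params_palm4msa=params_palm4msa, initialization=U_init)
    return objective, U_centroids, indicator_vector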