def make_batch_assignation_evaluation(X, centroids):
    """
    Assign a random sample of `X` to the centroids. The sample size is read
    from `paraman["--batch-assignation-time"]`. All the samples are assigned
    at the same time using a matrix-matrix multiplication. Time is recorded.

    :param X: The input data from which to take the samples.
    :param centroids: The centroids to which to assign the samples (must have
        the same dimension as `X`).
    :return: None
    """
    size_batch = paraman["--batch-assignation-time"]
    if size_batch > X.shape[0]:
        logger.warning(
            "Batch size for batch assignation evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(size_batch, X.shape[0]))
        size_batch = X.shape[0]
        paraman["--batch-assignation-time"] = size_batch

    # precomputed_centroid_norms = get_squared_froebenius_norm(centroids)
    precomputed_centroid_norms = None
    indexes_batch = np.random.permutation(X.shape[0])[:size_batch]
    start_time = time.time()
    _ = get_distances(X[indexes_batch], centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
    stop_time = time.time()

    resprinter.add({
        "batch_assignation_mean_time": (stop_time - start_time) / size_batch,
    })
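
# Illustration (not part of the original module): the batched assignment above
# relies on computing all sample-to-centroid distances in one shot. This is a
# minimal self-contained sketch of the formula presumably behind
# `get_distances` -- the expansion ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 --
# an assumption, since the actual implementation lives in qkmeans.core.utils.
import numpy as np


def batch_squared_distances_sketch(X, centroids):
    # (n, 1) and (1, k) norm columns broadcast against the (n, k) cross term.
    x_norms = np.sum(X ** 2, axis=1)[:, np.newaxis]
    c_norms = np.sum(centroids ** 2, axis=1)[np.newaxis, :]
    return x_norms - 2 * X @ centroids.T + c_norms


# Sanity check against the naive pairwise computation.
_rng = np.random.RandomState(0)
_X_demo, _C_demo = _rng.randn(5, 3), _rng.randn(2, 3)
_naive = np.array([[np.sum((x - c) ** 2) for c in _C_demo] for x in _X_demo])
assert np.allclose(batch_squared_distances_sketch(_X_demo, _C_demo), _naive)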
def check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster, counts,
                            indicator_vector):
    """
    Check that each cluster still contains data points. If a cluster is empty,
    re-seed it with a random point taken from the most populated cluster so
    far. All changes are made in place.

    :param X_data: The data matrix.
    :param X_centroids_hat: The matrix of centroids.
    :param K_nb_cluster: The number of clusters.
    :param counts: The number of points in each cluster.
    :param indicator_vector: The cluster index assigned to each data point.
    :return: None
    """
    for c in range(K_nb_cluster):
        cluster_data = X_data[indicator_vector == c]
        if len(cluster_data) == 0:
            biggest_cluster_index = np.argmax(counts)  # type: int
            biggest_cluster_data_indexes_bool = indicator_vector == biggest_cluster_index
            biggest_cluster_actual_data_indexes = np.where(
                biggest_cluster_data_indexes_bool)[0]

            random_index_in_biggest_cluster = np.random.choice(
                biggest_cluster_actual_data_indexes, size=1)[0]
            random_point_in_biggest_cluster = X_data[
                random_index_in_biggest_cluster]

            logger.warning(
                "cluster has lost data, add new cluster. cluster idx: {}"
                .format(c))
            X_centroids_hat[c] = random_point_in_biggest_cluster.reshape(1, -1)
            counts[biggest_cluster_index] -= 1
            counts[c] = 1
            indicator_vector[random_index_in_biggest_cluster] = c
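
# Illustration (hypothetical toy usage of check_cluster_integrity above, not
# part of the original module): cluster 1 is empty, so it is re-seeded with a
# random point stolen from cluster 0, the most populated one; counts and the
# indicator vector are patched in place.
import numpy as np

_X_toy = np.arange(8, dtype=float).reshape(4, 2)
_indicator_toy = np.array([0, 0, 0, 0])  # nobody is assigned to cluster 1
_counts_toy = np.array([4, 0])
_centroids_toy = np.zeros((2, 2))
check_cluster_integrity(_X_toy, _centroids_toy, 2, _counts_toy, _indicator_toy)
assert _counts_toy.tolist() == [3, 1]
assert (_indicator_toy == 1).sum() == 1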
def make_assignation_evaluation(X, centroids):
    """
    Assign `nb_eval` random samples of `X` to the centroids, one at a time,
    and record the mean and standard deviation of the assignation times.
    `nb_eval` is read from `paraman["--assignation-time"]`.
    """
    nb_eval = paraman["--assignation-time"]
    if nb_eval > X.shape[0]:
        logger.warning(
            "Number of evaluations for assignation evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(nb_eval, X.shape[0]))
        nb_eval = X.shape[0]
        paraman["--assignation-time"] = nb_eval

    times = []
    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(
        centroids)
    for i in np.random.permutation(X.shape[0])[:nb_eval]:
        start_time = time.time()
        get_distances(X[i].reshape(1, -1), centroids,
                      precomputed_centroids_norm=precomputed_centroid_norms)
        stop_time = time.time()
        times.append(stop_time - start_time)

    mean_time = np.mean(times)
    std_time = np.std(times)

    resprinter.add({
        "assignation_mean_time": mean_time,
        "assignation_std_time": std_time
    })
def make_ami_evaluation(y_train, x_test, y_test, U_centroids,
                        indicator_vector_train):
    """
    Compute the adjusted mutual information score of the clustering on a
    random subsample of the train labels and on the full test set. The sample
    size is read from `paraman["--ami"]`.
    """
    n_sample = paraman["--ami"]
    if n_sample > y_train.shape[0]:
        logger.warning(
            "Batch size for ami evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, y_train.shape[0]))
        n_sample = y_train.shape[0]
        paraman["--ami"] = n_sample

    indexes_samples = np.random.permutation(y_train.shape[0])[:n_sample]
    y_train = y_train[indexes_samples]
    indicator_vector_train = indicator_vector_train[indexes_samples]

    if isinstance(U_centroids, SparseFactors):
        U_centroids = U_centroids.compute_product()
    indicator_vector_test, _ = assign_points_to_clusters(x_test, U_centroids)

    train_ami = adjusted_mutual_info_score(y_train, indicator_vector_train)
    test_ami = adjusted_mutual_info_score(y_test, indicator_vector_test)

    ami_results = {
        "train_ami": train_ami,
        "test_ami": test_ami,
    }
    resprinter.add(ami_results)
def update_clusters_with_integrity_check(X_data, X_data_norms,
                                         X_centroids_hat, K_nb_cluster,
                                         counts, indicator_vector, distances,
                                         cluster_names, cluster_names_sorted):
    """
    Check whether any cluster has lost all its points; if so, re-seed it with
    the point of the most populated cluster that lies farthest from the empty
    cluster's centroid. Otherwise, update each centroid as the mean of its
    cluster.

    All changes are made in place, except for `counts` and
    `cluster_names_sorted`, which are returned.

    :param X_data: The data matrix.
    :param X_data_norms: The precomputed norms of the data points.
    :param X_centroids_hat: The matrix of centroids.
    :param K_nb_cluster: The number of clusters.
    :param counts: The number of points in each cluster.
    :param indicator_vector: The cluster index assigned to each data point.
    :param distances: The matrix of point-to-centroid distances.
    :param cluster_names: The names of the clusters.
    :param cluster_names_sorted: The sorted names of the clusters.
    :return: The updated `counts` and `cluster_names_sorted`.
    """
    for c in range(K_nb_cluster):
        cluster_data = X_data[indicator_vector == c]
        if len(cluster_data) == 0:
            # find, in the most populated cluster, the point farthest from
            # the empty cluster c
            biggest_cluster_index = np.argmax(counts)  # type: int
            biggest_cluster = cluster_names[biggest_cluster_index]
            biggest_cluster_data_indexes = indicator_vector == biggest_cluster

            index_of_farthest_point_in_biggest_cluster = np.argmax(
                distances[:, c][biggest_cluster_data_indexes])
            farthest_point_in_biggest_cluster = X_data[
                biggest_cluster_data_indexes][
                index_of_farthest_point_in_biggest_cluster]
            absolute_index_of_farthest_point_in_biggest_cluster = np.where(
                biggest_cluster_data_indexes
            )[0][index_of_farthest_point_in_biggest_cluster]

            logger.warning(
                "cluster has lost data, add new cluster. cluster idx: {}"
                .format(c))
            X_centroids_hat[c] = farthest_point_in_biggest_cluster.reshape(
                1, -1)
            counts = list(counts)
            counts[biggest_cluster_index] -= 1
            counts.append(1)
            counts = np.array(counts)
            cluster_names_sorted = list(cluster_names_sorted)
            cluster_names_sorted.append(c)
            cluster_names_sorted = np.array(cluster_names_sorted)
            indicator_vector[
                absolute_index_of_farthest_point_in_biggest_cluster] = c

            distances_to_new_cluster = get_distances(
                X_data, X_centroids_hat[c].reshape(1, -1),
                precomputed_data_points_norm=X_data_norms)
            distances[:, c] = distances_to_new_cluster.flatten()
        else:
            X_centroids_hat[c] = np.mean(X_data[indicator_vector == c, :], 0)

    return counts, cluster_names_sorted
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma,
                            landmarks):
    # verify the sample size for the evaluation
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None

    # Make the nystrom approximation
    # nys_obj = Nystroem(gamma=gamma, n_components=landmarks.shape[0])
    # nys_obj.fit(landmarks)
    # nystrom_embedding = nys_obj.transform(sample)
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric,
                                               landmarks_norm, samples_norm,
                                               gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Create the real kernel matrix
    real_kernel_special = special_rbf_kernel(sample, sample, gamma,
                                             norm_X=samples_norm,
                                             norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # evaluate the reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(
        nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm

    # start svm + nystrom classification
    if x_test is not None:
        logger.info("Start classification")

        x_train_nystrom_embedding = nystrom_transformation(
            x_train, landmarks, metric, landmarks_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, landmarks, metric, landmarks_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive / real_nb_positive
            recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive / nb_predicted_positive
            precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]
    else:
        accuracy_nystrom_svm = None

    return reconstruction_error_nystrom, accuracy_nystrom_svm
def build_df(path_results_dir, dct_output_files_by_root, col_to_delete=()):
    lst_df_results = []
    for root_name, dct_results in dct_output_files_by_root.items():
        try:
            result_file = path_results_dir / dct_results["results"]
            df_expe = pd.read_csv(result_file)
            df_expe["oar_id"] = root_name
            lst_df_results.append(df_expe)
        except KeyError:
            logger.warning(
                "No 'results' entry for root name {}".format(root_name))

    df_results = pd.concat(lst_df_results)

    for c in col_to_delete:
        df_results = df_results.drop([c], axis=1)

    return df_results
def prepare_nystrom(landmarks, landmarks_norm, gamma):
    """
    Return the matrix K^{-1/2} of Nyström: the metric used for the
    transformation. It uses the rbf kernel.

    :param landmarks: The matrix of landmark points.
    :param landmarks_norm: The norms of the landmark points.
    :param gamma: The gamma value to use in the rbf kernel.
    :return: The normalization matrix K^{-1/2}.
    """
    landmarks_norm_T = landmarks_norm.T if hasattr(landmarks_norm, "T") else None
    basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma,
                                        landmarks_norm, landmarks_norm_T)
    U, S, V = scipy.linalg.svd(basis_kernel_W)
    Sprim = np.maximum(S, 1e-12)
    if (Sprim != S).any():
        logger.warning("One value of S in the singular value decomposition of W was lower than 1e-12")
    S = Sprim
    normalization_ = np.dot(U / np.sqrt(S), V)

    return normalization_
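
# Illustration (not part of the original module): a sanity check that the
# matrix returned by prepare_nystrom behaves like K^{-1/2}. On a toy SPD
# kernel built with sklearn's rbf_kernel (an assumption: special_rbf_kernel is
# taken to agree with it on dense inputs), conjugating the kernel by the
# result should give the identity: K^{-1/2} K K^{-1/2} = I.
import numpy as np
import scipy.linalg
from sklearn.metrics.pairwise import rbf_kernel

_rng = np.random.RandomState(0)
_landmarks_demo = _rng.randn(6, 3)
_K_demo = rbf_kernel(_landmarks_demo, _landmarks_demo, gamma=0.5)
_U, _S, _V = scipy.linalg.svd(_K_demo)
_K_inv_sqrt = np.dot(_U / np.sqrt(np.maximum(_S, 1e-12)), _V)
assert np.allclose(_K_inv_sqrt @ _K_demo @ _K_inv_sqrt, np.eye(6), atol=1e-6)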
""" from copy import deepcopy import numpy as np from numpy.linalg import norm from numpy.linalg import multi_dot import matplotlib.pyplot as plt from qkmeans.palm.utils import compute_objective_function from qkmeans.utils import get_side_prod, logger # TODO avoid conversions between dense ndarray and sparse matrices # TODO init palm with SparseFactors logger.warning( "The module {} shouldn't be used because it hasn't been maintained in a long time" .format(__file__)) def palm4msa(arr_X_target: np.array, lst_S_init: list, nb_factors: int, lst_projection_functions: list, f_lambda_init: float, nb_iter: int, update_right_to_left=True, graphical_display=False): """ lst S init contains factors in decreasing indexes (e.g: the order along which they are multiplied in the product). example: S5 S4 S3 S2 S1
import matplotlib.pyplot as plt
import logging

mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)

import copy
import numpy as np

from qkmeans.core.utils import compute_objective, assign_points_to_clusters, get_squared_froebenius_norm_line_wise
from qkmeans.utils import logger, DataGenerator
from sklearn import datasets

logger.warning(
    "Module {} hasn't been tested and shouldn't be used. It is work in progress"
    .format(__file__))


def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization,
                     batch_size):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :return:
    """
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input
    data set, where n_sample is read from `paraman["--nystrom"]` (it can't be
    too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute the heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")

    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")

    ## TIME: nystrom build time
    # nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric,
                                               centroids_norm, samples_norm,
                                               gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample
    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm,
                                                       gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T
    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample, sample, gamma,
                                             norm_X=samples_norm,
                                             norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################
    # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0])
    # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    # sklearn_transfo = sklearn_nystrom.transform(sample)
    # kernel_sklearn_nys = sklearn_transfo @ sklearn_transfo.T
    ################################################################

    ####################
    # Error evaluation #
    ####################
    sampled_froebenius_norm = np.linalg.norm(
        nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")
        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(
            x_train, U_centroids, metric, centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, U_centroids, metric, centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }
    resprinter.add(nystrom_results)
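
# Illustration (not part of the original module): the hand-rolled recall /
# precision / F1 computation above assumes binary 0-1 labels. A quick check
# against sklearn.metrics.f1_score on toy predictions, to document that the
# two computations agree.
import numpy as np
from sklearn.metrics import f1_score

_y_true_demo = np.array([1, 0, 1, 1, 0, 1])
_y_pred_demo = np.array([1, 1, 1, 0, 0, 1])
_recall = np.sum(_y_pred_demo[_y_true_demo == 1]) / np.sum(_y_true_demo[_y_true_demo == 1])
_precision = np.sum(_y_pred_demo[_y_true_demo == 1]) / np.sum(_y_pred_demo[_y_pred_demo == 1])
_f1 = 2 * _precision * _recall / (_precision + _recall)
assert np.isclose(_f1, f1_score(_y_true_demo, _y_pred_demo))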
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):

    def scikit_evaluation(str_type):
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)

        return inference_time

    def kmean_tree_evaluation():
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        start_inference_time = time.time()
        distances = get_distances(x_test, U_centroids)
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)

        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()

    if paraman["kmeans"]:
        lst_knn_types = ["brute", "ball_tree", "kd_tree"]
        for knn_type in lst_knn_types:
            signal.signal(signal.SIGALRM, timeout_signal_handler)
            signal.alarm(int(kmean_tree_time * 10))
            try:
                logger.info(
                    "1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning(
                    "Timeout during execution of 1-nn with {} version: {}"
                    .format(knn_type, te))
            signal.alarm(0)
from collections import OrderedDict
from pprint import pformat

import numpy as np
from numpy.linalg import multi_dot

from qkmeans.palm.palm import hierarchical_palm4msa, palm4msa
from qkmeans.core.kmeans import kmeans
from qkmeans.core.utils import build_constraint_set_smart, compute_objective, get_distances, get_squared_froebenius_norm_line_wise
from qkmeans.utils import visual_evaluation_palm4msa
from sklearn import datasets
import matplotlib.pyplot as plt

from qkmeans.utils import logger

logger.warning(
    "The module {} hasn't been maintained in a long time and shouldn't be used anymore."
    .format(__file__))


def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           graphical_display=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
            for idx_bar, xcoor in enumerate(x_indices + bar_width * idx_sparsy_val):
                try:
                    nb_param = df_sparsy_val[
                        df_sparsy_val["--nb-cluster"] == nb_cluster_values[idx_bar]][
                        "nb_param_centroids"].mean()
                    ax.text(xcoor,
                            mean_task_values[idx_bar] + std_task_values[idx_bar],
                            '{}'.format(int(round(nb_param))),
                            horizontalalignment='center',
                            verticalalignment='bottom',
                            rotation='vertical')
                except Exception:
                    logger.warning("nb param empty")
                    continue
        except Exception as e:
            if "empty dataframe" in str(e):
                logger.warning("{} for sparsy val {}".format(
                    str(e), sparsy_val))
            else:
                raise e

        try:
            # kmeans palm
            ##############
            df_sparsy_val_kmeans_palm = df_hierarchical_kmeans_palm[
                df_hierarchical_kmeans_palm["--sparsity-factor"] == sparsy_val]
def get_dct_result_files_by_root(src_results_dir,
                                 old_filename_objective=False,
                                 tpl_results=("centroids", "results", "objective")):
    """
    From a directory containing the results of oar jobs, give the dictionary
    of result files for each experiment.

    Files are:

    * OAR.`jobid`.stderr
    * OAR.`jobid`.stdout
    * `idexpe`_objective_`nameobjective`.csv contains the objective function values;
    * `idexpe`_results.csv contains the parameters of the experiments and the various metric measures;
    * `idexpe`_centroids.npy contains the numpy objects of centroids that have been computed.

    where:

    * `jobid` corresponds to oar's own job identifier;
    * `nameobjective` corresponds to the name of the objective function being printed;
    * `idexpe` corresponds to the identifier of the experiment.

    The returned dictionary gives:

    {
        "OAR.`jobid`": {
            "centroids": "`idexpe`_centroids.npy",
            "results": "`idexpe`_results.csv",
            "objective": "`idexpe`_objective_`objective_name`.csv"
        }
    }

    :param src_results_dir: Path to the directory containing the result files.
    :return: The dictionary of result files indexed by job root name.
    """
    files = src_results_dir.glob('**/*')
    files = [x for x in files if x.is_file()]
    lst_str_filenames = [file.name for file in files]

    dct_output_files_by_root = {}
    count_complete = 0
    count_has_printed_results = 0
    count_total = 0
    for pth_file in files:
        if pth_file.suffix != '.stdout' and pth_file.suffix != '.out':
            continue
        # if "_results.csv" not in pth_file.name:
        #     continue
        count_total += 1
        with open(pth_file, 'r') as stdoutfile:
            lines = stdoutfile.readlines()
            for i_line, lin in enumerate(lines):
                if lin[:2] == "--":
                    break
            else:
                logger.warning("file {} didn't contain anything".format(
                    pth_file.name))
                dct_output_files_by_root[pth_file.stem] = {}
                continue
            count_has_printed_results += 1
            data = "".join(lines[i_line:i_line + 2])
            io_data = StringIO(data)
            df = pd.read_csv(io_data)
            try:
                root_name = df["--output-file_resprinter"][0].split("_")[0]
            except KeyError:
                logger.warning("no key for resprinter in {}".format(pth_file.name))
                continue

        dct_files = {}
        complete = True

        if old_filename_objective:
            used_output_file_end_re = output_file_end_re_old
        else:
            used_output_file_end_re = output_file_end_re

        for type_file, root_re in used_output_file_end_re.items():
            if type_file not in tpl_results:
                continue
            forged_re_compiled = re.compile(r"{}".format(root_name) + root_re)
            try:
                dct_files[type_file] = list(
                    filter(forged_re_compiled.match, lst_str_filenames))[0]
            except IndexError:
                logger.warning("{} not found for root name {}".format(
                    type_file, root_name))
                complete = False

        if complete:
            count_complete += 1

        dct_output_files_by_root[pth_file.stem] = dct_files

    return dct_output_files_by_root
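
# Illustration (hypothetical patterns, since output_file_end_re is defined
# elsewhere in the module): how the forged regex above pairs an experiment's
# root name with a per-file-type suffix to pick the matching filenames.
# re.match anchors at the start, so unrelated roots are filtered out.
import re

_demo_suffix_re = {"results": r"_results\.csv", "centroids": r"_centroids\.npy"}
_demo_filenames = ["abc123_results.csv", "abc123_centroids.npy", "zzz_results.csv"]
_root_demo = "abc123"
_dct_demo = {t: list(filter(re.compile(_root_demo + s).match, _demo_filenames))[0]
             for t, s in _demo_suffix_re.items()}
assert _dct_demo == {"results": "abc123_results.csv",
                     "centroids": "abc123_centroids.npy"}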
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    """
    Do the 1-nearest neighbor classification using `x_train`, `y_train` as the
    support set and `x_test`, `y_test` as the evaluation set.

    The scikit-learn classifiers (brute, kd_tree and ball_tree) are called
    only when it is the kmeans version of the program that is running (for
    simplicity: so that they are not run many times).

    Time is recorded.
    Classification accuracy is recorded.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :param indicator_vector: The indicator vector for this matrix of centroids and this train data.
    :return:
    """

    def scikit_evaluation(str_type):
        """
        Do the scikit-learn version of nearest neighbor (used for comparison).

        :param str_type: The `algorithm` parameter of KNeighborsClassifier.
        :return: The inference time.
        """
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)
        log_memory_usage(
            "Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation"
        )

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in scikit_evaluation of make_1nn_evaluation"
        )

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)

        return inference_time

    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor.

        :return: The inference time.
        """
        # for each cluster, there is a sub nearest neighbor classifier for the points in that cluster.
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]
        log_memory_usage(
            "Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation"
        )

        # precomputed_centroid_norms = get_squared_froebenius_norm(landmarks)
        precomputed_centroid_norms = None
        start_inference_time = time.time()
        distances = get_distances(
            x_test, U_centroids,
            precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.time()
        get_distance_time = stop_get_distances_time - start_inference_time
        log_memory_usage(
            "Memory after distances computation with clusters in kmean_tree_evaluation of make_1nn_evaluation"
        )

        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which this data point belongs and call the associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation"
        )

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)

        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()
    # if paraman["kmeans"]:
    lst_knn_types = ["brute", "ball_tree", "kd_tree"]
    for knn_type in lst_knn_types:
        # each classification must not take more than 10 times the time taken by
        # the k-means 1-nn classification, or it will be stopped.
        signal.signal(signal.SIGALRM, timeout_signal_handler)
        signal.alarm(int(kmean_tree_time * 10))  # start the alarm
        try:
            logger.info(
                "1 nearest neighbor with {} search".format(knn_type))
            scikit_evaluation(knn_type)
        except TimeoutError as te:
            logger.warning(
                "Timeout during execution of 1-nn with {} version: {}"
                .format(knn_type, te))
        signal.alarm(0)  # stop the alarm for the next evaluation
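
# Illustration (hypothetical, since timeout_signal_handler is defined
# elsewhere in the project): a minimal handler compatible with the SIGALRM
# guard used above. signal.alarm(n) delivers SIGALRM after n seconds; the
# handler turns it into a TimeoutError that the try/except around
# scikit_evaluation can catch. Unix-only, like the original code.
import signal


def timeout_signal_handler_sketch(signum, frame):
    raise TimeoutError("1-nn evaluation exceeded its time budget")


# Usage sketch: abort any call that runs longer than 2 seconds
# (`long_running_evaluation` is a placeholder).
# signal.signal(signal.SIGALRM, timeout_signal_handler_sketch)
# signal.alarm(2)
# try:
#     long_running_evaluation()
# except TimeoutError:
#     logger.warning("evaluation timed out")
# finally:
#     signal.alarm(0)  # always disarm the alarm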
def make_nystrom_evaluation(x_train, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input
    data set, where n_sample is read from `paraman["--nystrom"]` (it can't be
    too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute the heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")

    # precompute the centroids norm for later use (optimization)
    # centroids_norm = get_squared_froebenius_norm(landmarks)
    centroids_norm = None

    ## TIME: nystrom build time
    # nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.time()

    basis_kernel_W = special_rbf_kernel(U_centroids, U_centroids, gamma,
                                        centroids_norm, centroids_norm)
    log_memory_usage(
        "Memory after K_11 computation in make_nystrom_evaluation")
    U, S, V = np.linalg.svd(basis_kernel_W)
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    S = np.maximum(S, 1e-12)
    normalization_ = np.dot(U / np.sqrt(S), V)

    nystrom_build_stop_time = time.time()
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    # samples_norm = np.linalg.norm(sample, axis=1) ** 2
    samples_norm = None

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.time()
    nystrom_embedding = special_rbf_kernel(U_centroids, sample, gamma,
                                           centroids_norm,
                                           samples_norm).T @ normalization_
    log_memory_usage(
        "Memory after embedding computation in make_nystrom_evaluation")
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    nystrom_inference_time_stop = time.time()
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }
    resprinter.add(nystrom_results)
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate the Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input
    data set, where n_sample is read from `paraman["--nystrom"]` (it can't be
    too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """

    def prepare_nystrom(landmarks, landmarks_norm):
        basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma,
                                            landmarks_norm, landmarks_norm)
        U, S, V = np.linalg.svd(basis_kernel_W)
        S = np.maximum(S, 1e-12)
        normalization_ = np.dot(U / np.sqrt(S), V)
        return normalization_

    def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm,
                               x_input_norm):
        nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma,
                                               landmarks_norm,
                                               x_input_norm).T @ p_metric
        return nystrom_embedding

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning(
            "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
            "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute the heuristic gamma as the mean of the euclidean distances between examples
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage(
        "Memory after euristic gamma computation in make_nystrom_evaluation")

    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage(
        "Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")

    ## TIME: nystrom build time
    # nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric,
                                               centroids_norm, samples_norm)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage(
        "Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample
    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(
        x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage(
        "Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm)
    log_memory_usage(
        "Memory after SVD computation in uniform part of make_nystrom_evaluation"
    )

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample,
                                                       metric_uniform,
                                                       uniform_sample_norm,
                                                       samples_norm)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T
    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm,
                                     samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage(
        "Memory after real kernel computation in make_nystrom_evaluation")
    ################################################################

    ####################
    # Error evaluation #
    ####################
    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value -
                                             real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(
        nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")
        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(
            x_train, U_centroids, metric, centroids_norm, None)
        x_test_nystrom_embedding = nystrom_transformation(
            x_test, U_centroids, metric, centroids_norm, None)
        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding,
                                                    y_test)
        time_classification_stop = time.process_time()
        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }
    resprinter.add(nystrom_results)
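
# Illustration (not part of the original module): the same relative Frobenius
# reconstruction error computed with scikit-learn's Nystroem transformer,
# which the commented-out "Sklearn based Nystrom uniform" block above hints
# at. A self-contained sketch on toy data; the qkmeans helpers are not needed
# here, and the sizes and gamma value are arbitrary.
import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.metrics.pairwise import rbf_kernel

_rng = np.random.RandomState(0)
_x_demo = _rng.randn(200, 10)
_gamma_demo = 0.1
_nys_demo = Nystroem(gamma=_gamma_demo, n_components=20).fit(_x_demo)
_embedding_demo = _nys_demo.transform(_x_demo)
_approx_kernel_demo = _embedding_demo @ _embedding_demo.T
_real_kernel_demo = rbf_kernel(_x_demo, _x_demo, gamma=_gamma_demo)
_relative_error_demo = (np.linalg.norm(_approx_kernel_demo - _real_kernel_demo)
                        / np.linalg.norm(_real_kernel_demo))
print("relative reconstruction error: {:.4f}".format(_relative_error_demo))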