def scikit_evaluation(str_type):
    """
    Run the scikit-learn version of nearest neighbor (used for comparison).

    :param str_type: Name of the neighbor-search algorithm passed to KNeighborsClassifier (e.g. "brute", "kd_tree", "ball_tree").
    :return: The inference time.
    """
    clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
    clf.fit(x_train, y_train)
    log_memory_usage("Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation")

    start_inference_time = time.time()
    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.time()
    log_memory_usage("Memory after label assignation in scikit_evaluation of make_1nn_evaluation")

    inference_time = stop_inference_time - start_inference_time

    accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_{}_inference_time".format(str_type): inference_time,
        "1nn_{}_accuracy".format(str_type): accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
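
# Hedged aside (not part of the original pipeline): the per-observation loop above is
# equivalent to a single batched predict call; the loop is only kept so the measured
# time covers one query at a time. Minimal self-contained check with synthetic data
# (all names and shapes below are illustrative).
def _demo_loop_vs_batch_predict():
    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    rng = np.random.RandomState(0)
    x_tr, y_tr = rng.randn(100, 5), rng.randint(0, 2, 100)
    x_te = rng.randn(10, 5)

    clf = KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(x_tr, y_tr)
    loop_preds = np.array([clf.predict(row.reshape(1, -1))[0] for row in x_te])
    assert np.array_equal(loop_preds, clf.predict(x_te))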
def kmean_tree_evaluation():
    """
    Run the K-means partitioning version of nearest neighbor.

    :return: The inference time.
    """
    # For each cluster there is a sub nearest neighbor classifier for the points in that cluster.
    lst_clf_by_cluster = [
        KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(x_train[indicator_vector == i], y_train[indicator_vector == i])
        for i in range(U_centroids.shape[0])
    ]
    log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")

    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
    # precomputed_centroid_norms = None

    start_inference_time = time.process_time()
    distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
    stop_get_distances_time = time.process_time()
    get_distance_time = stop_get_distances_time - start_inference_time

    indicator_vector_test = np.argmin(distances, axis=1)

    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        # Get the cluster to which this data point belongs and call the associated nearest neighbor classifier.
        idx_cluster = indicator_vector_test[obs_idx]
        clf_cluster = lst_clf_by_cluster[idx_cluster]
        predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.process_time()
    log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")

    inference_time = stop_inference_time - start_inference_time

    accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_kmean_inference_time": inference_time,
        "1nn_get_distance_time": get_distance_time / x_test.shape[0],
        "1nn_kmean_accuracy": accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
def kmean_tree_evaluation():
    """
    Run the K-means partitioning version of nearest neighbor.

    :return: The inference time.
    """
    # For each cluster there is a sub nearest neighbor classifier for the points in that cluster.
    # Clusters that received no training observation get no classifier (None) and are excluded at query time.
    lst_clf_by_cluster = []
    indices_no_train_obs_in_cluster = []
    for i in range(U_centroids.shape[0]):
        try:
            lst_clf_by_cluster.append(KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(x_train[indicator_vector == i], y_train[indicator_vector == i]))
        except ValueError:
            indices_no_train_obs_in_cluster.append(i)
            lst_clf_by_cluster.append(None)

    log_memory_usage("Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation")

    precomputed_centroid_norms = get_squared_froebenius_norm_line_wise(U_centroids)
    # precomputed_centroid_norms = None

    start_inference_time = time.process_time()
    distances = get_distances(x_test, U_centroids, precomputed_centroids_norm=precomputed_centroid_norms)
    stop_get_distances_time = time.process_time()
    get_distance_time = stop_get_distances_time - start_inference_time

    if len(indices_no_train_obs_in_cluster):
        # Make empty clusters unreachable so that argmin never selects them.
        distances[:, np.array(indices_no_train_obs_in_cluster)] = np.inf

    indicator_vector_test = np.argmin(distances, axis=1)

    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        # Get the cluster to which this data point belongs and call the associated nearest neighbor classifier.
        idx_cluster = indicator_vector_test[obs_idx]
        clf_cluster = lst_clf_by_cluster[idx_cluster]
        predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.process_time()
    log_memory_usage("Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation")

    inference_time = stop_inference_time - start_inference_time

    if paraman["--kddcup04"]:
        # Compute recall: nb_true_positive / real_nb_positive
        recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
        # Compute precision: nb_true_positive / nb_positive
        precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
        f1 = 2 * precision * recall / (precision + recall)
        accuracy = f1
    else:
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_kmean_inference_time": inference_time,
        "1nn_get_distance_time": get_distance_time / x_test.shape[0],
        "1nn_kmean_accuracy": accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
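
# Hedged toy sketch (synthetic data, illustrative sizes) of the partitioning scheme above:
# route each query to its nearest K-means centroid, then answer with a 1-NN search
# restricted to that cluster's training points.
def _demo_kmeans_partitioned_1nn():
    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.neighbors import KNeighborsClassifier

    rng = np.random.RandomState(0)
    x_tr, y_tr = rng.randn(200, 5), rng.randint(0, 2, 200)
    x_te = rng.randn(20, 5)

    km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(x_tr)
    clfs = [KNeighborsClassifier(n_neighbors=1, algorithm="brute")
            .fit(x_tr[km.labels_ == i], y_tr[km.labels_ == i]) for i in range(8)]

    test_clusters = km.predict(x_te)  # nearest centroid per query
    return np.array([clfs[c].predict(row.reshape(1, -1))[0]
                     for c, row in zip(test_clusters, x_te)])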
def process_palm_on_top_of_kmeans(kmeans_centroids):
    lst_constraint_sets, lst_constraint_sets_desc = build_constraint_set_smart(
        left_dim=kmeans_centroids.shape[0],
        right_dim=kmeans_centroids.shape[1],
        nb_factors=paraman["--nb-factors"] + 1,
        sparsity_factor=paraman["--sparsity-factor"],
        residual_on_right=paraman["--residual-on-right"],
        fast_unstable_proj=True)

    lst_factors = init_lst_factors(*kmeans_centroids.shape, paraman["--nb-factors"] + 1)

    eye_norm = np.sqrt(kmeans_centroids.shape[0])

    if paraman["--hierarchical"]:
        _lambda_tmp, op_factors, U_centroids, nb_iter_by_factor, objective_palm = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(kmeans_centroids.shape[0]) @ kmeans_centroids,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_constraint_sets,
                f_lambda_init=1. * eye_norm,
                nb_iter=paraman["--nb-iteration-palm"],
                update_right_to_left=True,
                residual_on_right=paraman["--residual-on-right"],
                delta_objective_error_threshold_palm=paraman["--delta-threshold"],
                track_objective_palm=False)
    else:
        _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(kmeans_centroids.shape[0]) @ kmeans_centroids,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_constraint_sets[-1]["finetune"],
                f_lambda_init=1. * eye_norm,
                nb_iter=paraman["--nb-iteration-palm"],
                update_right_to_left=True,
                delta_objective_error_threshold=paraman["--delta-threshold"],
                track_objective=False)

    log_memory_usage("Memory after palm on top of kmeans in process_palm_on_top_of_kmeans")

    _lambda = _lambda_tmp / eye_norm
    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return op_centroids
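
# Hedged numeric sketch (toy factors, not the real PALM output) of the final rescaling
# step in process_palm_on_top_of_kmeans above: the first factor is constrained to stay
# close to the identity, so it can be dropped and _lambda folded into the next factor
# without changing the represented operator. All names below are illustrative.
def _demo_lambda_folding():
    import numpy as np
    from scipy import sparse

    K, D = 4, 6
    S0 = sparse.identity(K, format="csr")  # plays the role of the ~identity first factor
    S1 = sparse.random(K, K, density=0.5, format="csr", random_state=0)
    S2 = sparse.random(K, D, density=0.5, format="csr", random_state=1)
    _lambda = 2.0

    full = _lambda * (S0 @ S1 @ S2).toarray()   # lambda * S0 @ S1 @ S2
    folded = ((S1 * _lambda) @ S2).toarray()    # drop S0, fold lambda into S1
    assert np.allclose(full, folded)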
def scikit_evaluation(str_type):
    """
    Run the scikit-learn version of nearest neighbor (used for comparison).

    :param str_type: Name of the neighbor-search algorithm passed to KNeighborsClassifier (e.g. "brute", "kd_tree", "ball_tree").
    :return: The inference time.
    """
    clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
    clf.fit(x_train, y_train)
    log_memory_usage("Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation")

    start_inference_time = time.process_time()
    predictions = np.empty_like(y_test)
    for obs_idx, obs_test in enumerate(x_test):
        predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
    stop_inference_time = time.process_time()
    log_memory_usage("Memory after label assignation in scikit_evaluation of make_1nn_evaluation")

    inference_time = stop_inference_time - start_inference_time

    if paraman["--kddcup04"]:
        # Compute recall: nb_true_positive / real_nb_positive
        recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
        # Compute precision: nb_true_positive / nb_positive
        precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
        f1 = 2 * precision * recall / (precision + recall)
        accuracy = f1
    else:
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

    results_1nn = {
        "1nn_{}_inference_time".format(str_type): inference_time,
        "1nn_{}_accuracy".format(str_type): accuracy
    }
    resprinter.add(results_1nn)
    return inference_time
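
# Minimal cross-check (illustrative labels; assumes binary labels in {0, 1}, which the
# sums above require) that the hand-rolled recall/precision/F1 used for --kddcup04
# agrees with scikit-learn's reference implementation.
def _demo_f1_matches_sklearn():
    import numpy as np
    from sklearn.metrics import f1_score

    y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
    y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])

    recall = np.sum(y_pred[y_true == 1]) / np.sum(y_true[y_true == 1])      # TP / real positives
    precision = np.sum(y_pred[y_true == 1]) / np.sum(y_pred[y_pred == 1])   # TP / predicted positives
    f1 = 2 * precision * recall / (precision + recall)
    assert np.isclose(f1, f1_score(y_true, y_pred))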
def make_nystrom_evaluation(x_train, U_centroids):
    """
    Evaluate Nystrom construction time and approximation precision.

    The approximation is evaluated on a subsample of size n_sample of the input data set
    (n_sample is read from paraman["--nystrom"]; it cannot be too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the Euclidean distances between examples.
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after heuristic gamma computation in make_nystrom_evaluation")

    # Precompute the centroids norm for later use (optimization).
    # centroids_norm = get_squared_froebenius_norm(landmarks)
    centroids_norm = None

    ## TIME: nystrom build time
    # Nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.time()

    basis_kernel_W = special_rbf_kernel(U_centroids, U_centroids, gamma, centroids_norm, centroids_norm)
    log_memory_usage("Memory after K_11 computation in make_nystrom_evaluation")
    U, S, V = np.linalg.svd(basis_kernel_W)
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    S = np.maximum(S, 1e-12)
    normalization_ = np.dot(U / np.sqrt(S), V)

    nystrom_build_stop_time = time.time()
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    # samples_norm = np.linalg.norm(sample, axis=1) ** 2
    samples_norm = None

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm, samples_norm)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.time()
    nystrom_embedding = special_rbf_kernel(U_centroids, sample, gamma, centroids_norm, samples_norm).T @ normalization_
    log_memory_usage("Memory after embedding computation in make_nystrom_evaluation")
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    nystrom_inference_time_stop = time.time()
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }
    resprinter.add(nystrom_results)
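
# Self-contained sketch of the Nystrom pieces above, with sklearn's rbf_kernel standing
# in for special_rbf_kernel (shapes and gamma are illustrative). normalization_ is
# K_11^{-1/2}, so embedding = K_21 @ K_11^{-1/2} and K ~ embedding @ embedding.T.
def _demo_nystrom_approximation():
    import numpy as np
    from sklearn.metrics.pairwise import rbf_kernel

    rng = np.random.RandomState(0)
    landmarks, sample, gamma = rng.randn(20, 5), rng.randn(50, 5), 0.1

    K11 = rbf_kernel(landmarks, landmarks, gamma=gamma)
    U, S, V = np.linalg.svd(K11)
    normalization_ = np.dot(U / np.sqrt(np.maximum(S, 1e-12)), V)  # K11^{-1/2}

    embedding = rbf_kernel(sample, landmarks, gamma=gamma) @ normalization_
    K_approx = embedding @ embedding.T
    # Frobenius error against the exact kernel on the sample:
    return np.linalg.norm(K_approx - rbf_kernel(sample, sample, gamma=gamma))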
if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    objprinter = ObjectiveFunctionPrinter(output_file=paraman["--output-file_objprinter"])
    has_failed = False
    if paraman["--verbose"]:
        daiquiri.setup(level=logging.DEBUG)
    else:
        daiquiri.setup(level=logging.INFO)

    try:
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate Nystrom construction time and approximation precision.

    The approximation is evaluated on a subsample of size n_sample of the input data set
    (n_sample is read from paraman["--nystrom"]; it cannot be too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the Euclidean distances between examples.
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after heuristic gamma computation in make_nystrom_evaluation")

    # Precompute the centroids norm for later use (optimization).
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # Nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################
    # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0])
    # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    # sklearn_transfo = sklearn_nystrom.transform(sample)
    # kernel_sklearn_nys = sklearn_transfo @ sklearn_transfo.T

    ####################
    # Error evaluation #
    ####################
    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm

    # SVM evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # Compute recall: nb_true_positive / real_nb_positive
            recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
            # Compute precision: nb_true_positive / nb_positive
            precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }
    resprinter.add(nystrom_results)
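
# The commented-out "Sklearn based Nystrom uniform" block above suggests a cross-check
# against scikit-learn's own Nystroem transformer; a minimal, self-contained version of
# that comparison (sizes and gamma are illustrative):
def _demo_sklearn_nystroem_crosscheck():
    import numpy as np
    from sklearn.kernel_approximation import Nystroem
    from sklearn.metrics.pairwise import rbf_kernel

    rng = np.random.RandomState(0)
    x = rng.randn(100, 5)
    sample = x[:50]

    sklearn_nystrom = Nystroem(gamma=0.1, n_components=20).fit(x)
    sklearn_transfo = sklearn_nystrom.transform(sample)
    kernel_sklearn_nys = sklearn_transfo @ sklearn_transfo.T
    return np.linalg.norm(kernel_sklearn_nys - rbf_kernel(sample, sample, gamma=0.1))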
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate Nystrom construction time and approximation precision.

    The approximation is evaluated on a subsample of size n_sample of the input data set
    (n_sample is read from paraman["--nystrom"]; it cannot be too large).

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :return:
    """
    def prepare_nystrom(landmarks, landmarks_norm):
        basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma, landmarks_norm, landmarks_norm)
        U, S, V = np.linalg.svd(basis_kernel_W)
        S = np.maximum(S, 1e-12)
        normalization_ = np.dot(U / np.sqrt(S), V)
        return normalization_

    def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm, x_input_norm):
        nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma, landmarks_norm, x_input_norm).T @ p_metric
        return nystrom_embedding

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute heuristic gamma as the mean of the Euclidean distances between examples.
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after heuristic gamma computation in make_nystrom_evaluation")

    # Precompute the centroids norm for later use (optimization).
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # Nystrom build time is the Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time of the Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = None
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm, samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    ####################
    # Error evaluation #
    ####################
    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm

    # SVM evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None)

        linear_svc_clf = LinearSVC()
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding, y_test)
        time_classification_stop = time.process_time()
        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }
    resprinter.add(nystrom_results)