def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    """
    Evaluate the Nystrom approximation built on the given landmarks.

    The reconstruction error is measured on a random subsample of ``x_train``.
    The subsample size is read from ``paraman["--nystrom"]``; when it exceeds
    the number of rows of ``x_train`` it is capped to that number and the
    capped value is written back to ``paraman["--nystrom"]``.

    When ``x_test`` is not None, a ``LinearSVC(class_weight="balanced")`` is
    fitted on the Nystrom embedding of ``x_train`` and scored on the embedding
    of ``x_test``.

    :param x_train: Training data, indexed by row.
    :param y_train: Training labels given to the classifier.
    :param x_test: Test data, or None to skip the classification step.
    :param y_test: Test labels, compared element-wise with the predictions.
    :param gamma: Kernel parameter forwarded to the kernel helpers.
    :param landmarks: Points on which the Nystrom approximation is built.
    :return: Tuple ``(reconstruction_error_nystrom, accuracy_nystrom_svm)``.
        The first item is the norm of (approximated kernel - real kernel)
        divided by the norm of the real kernel. The second item is the F1
        score of label ``1`` when ``paraman["--kddcup04"]`` is truthy, the
        plain accuracy otherwise, and None when ``x_test`` is None.
    """
    # Verify the sample size used for the evaluation.
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    # No precomputed norm is supplied for the samples.
    samples_norm = None

    # Build the Nystrom approximation of the kernel on the subsample.
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric, landmarks_norm,
                                               samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Build the real kernel matrix on the same subsample.
    real_kernel_special = special_rbf_kernel(sample, sample, gamma,
                                             norm_X=samples_norm, norm_Y=samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # Relative reconstruction error.
    reconstruction_error_nystrom = (
        np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    )

    # SVM + Nystrom classification.
    if x_test is None:
        return reconstruction_error_nystrom, None

    logger.info("Start classification")
    x_train_nystrom_embedding = nystrom_transformation(x_train, landmarks, metric,
                                                       landmarks_norm, None, gamma=gamma)
    x_test_nystrom_embedding = nystrom_transformation(x_test, landmarks, metric,
                                                      landmarks_norm, None, gamma=gamma)

    linear_svc_clf = LinearSVC(class_weight="balanced")
    linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
    predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

    if paraman["--kddcup04"]:
        # Count occurrences with boolean masks instead of summing label
        # values: summing predictions only yields the number of true positives
        # when the negative label is 0 (it is wrong for e.g. {-1, 1} labels).
        predicted_positive = predictions == 1
        actual_positive = y_test == 1
        nb_true_positive = np.sum(predicted_positive & actual_positive)
        nb_actual_positive = np.sum(actual_positive)
        nb_predicted_positive = np.sum(predicted_positive)

        # recall: nb_true_positive / real_nb_positive (0 when undefined)
        recall = nb_true_positive / nb_actual_positive if nb_actual_positive else 0.0
        # precision: nb_true_positive / nb_predicted_positive (0 when undefined)
        precision = nb_true_positive / nb_predicted_positive if nb_predicted_positive else 0.0

        # Avoid a NaN score when both precision and recall are zero.
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        accuracy_nystrom_svm = f1
    else:
        accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

    return reconstruction_error_nystrom, accuracy_nystrom_svm
def test_kernel(self):
    """
    Check ``special_rbf_kernel`` against the reference ``rbf_kernel``.

    For every entry of ``self.pairs_data`` the kernel of the data with itself
    is computed three ways: with ``rbf_kernel`` and with
    ``special_rbf_kernel`` using ``exp_outside=False`` and
    ``exp_outside=True``. Values below 1e-12 are zeroed in all three matrices
    before comparison. The test requires both special kernels to be close to
    the reference and the ``exp_outside=True`` variant to be strictly closer
    to it than the ``exp_outside=False`` variant.
    """
    for pair_name, samples in self.pairs_data.items():
        norms = self.norm_data[pair_name]
        pair_gamma = self.gamma_data[pair_name]

        reference = rbf_kernel(samples, samples, gamma=pair_gamma)

        # Both special kernels share everything but the exp_outside flag.
        shared_kwargs = dict(gamma=pair_gamma, norm_X=norms, norm_Y=norms.T)
        kernel_exp_inside = special_rbf_kernel(samples, samples,
                                               exp_outside=False, **shared_kwargs)
        kernel_exp_outside = special_rbf_kernel(samples, samples,
                                                exp_outside=True, **shared_kwargs)

        # Flush tiny values to zero (in place) so they do not affect the comparison.
        for kernel in (kernel_exp_inside, kernel_exp_outside, reference):
            kernel[kernel < 1e-12] = 0

        inside_is_close = np.allclose(reference, kernel_exp_inside)
        outside_is_close = np.allclose(reference, kernel_exp_outside)

        inside_gap = np.linalg.norm(kernel_exp_inside - reference)
        outside_gap = np.linalg.norm(kernel_exp_outside - reference)
        print("Delta flag: {}; delta: {}".format(outside_gap, inside_gap))

        self.assertTrue(outside_gap < inside_gap)
        self.assertTrue(inside_is_close, msg=pair_name)
        self.assertTrue(outside_is_close, msg=pair_name)
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluate Nystrom construction time and approximation precision.

    The approximation error is measured on a random subsample of ``x_train``.
    The subsample size is read from ``paraman["--nystrom"]``; when it exceeds
    the number of rows of ``x_train`` it is capped to that number and the
    capped value is written back to ``paraman["--nystrom"]``.

    Two Nystrom approximations are compared against the real kernel matrix:
    one built on ``U_centroids`` and one built on as many points sampled
    uniformly from ``x_train``. When ``x_test`` is not None, a
    ``LinearSVC(class_weight="balanced")`` is also fitted on the
    centroid-based embedding of ``x_train`` and scored on ``x_test``.

    :param x_train: Input dataset as ndarray.
    :param y_train: Training labels given to the classifier.
    :param x_test: Test data, or None to skip the classification step.
    :param y_test: Test labels, compared element-wise with the predictions.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :return: None. The results are handed to ``resprinter.add`` as a dict with
        the keys ``nystrom_build_time``, ``nystrom_inference_time``,
        ``nystrom_sampled_error_reconstruction``,
        ``nystrom_sampled_error_reconstruction_uniform``,
        ``nystrom_svm_accuracy`` and ``nystrom_svm_time``.
    """
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Heuristic gamma computed from the training data.
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")

    # Precompute the centroids norm for later use (optimization).
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    # No precomputed norm is supplied for the samples.
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")

    # Nystrom build time: CPU time of the Nystrom preparation for later use.
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    # Nystrom inference time: CPU time of the transformation of all the
    # samples (plus the kernel approximation), averaged per sample.
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm,
                                               samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    # Same number of landmarks as centroids, drawn uniformly from x_train.
    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform,
                                                       uniform_sample_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")
    real_kernel_special = special_rbf_kernel(sample, sample, gamma,
                                             norm_X=samples_norm, norm_Y=samples_norm)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    ####################
    # Error evaluation #
    ####################
    sampled_froebenius_norm = (
        np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    )
    sampled_froebenius_norm_uniform = (
        np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm
    )

    ##################
    # SVM evaluation #
    ##################
    if x_test is not None:
        logger.info("Start classification")

        # The timed section covers embedding, fitting and prediction only;
        # the score computation below is excluded.
        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric,
                                                           centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric,
                                                          centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # Count occurrences with boolean masks instead of summing label
            # values: summing predictions only yields the number of true
            # positives when the negative label is 0 (it is wrong for e.g.
            # {-1, 1} labels).
            predicted_positive = predictions == 1
            actual_positive = y_test == 1
            nb_true_positive = np.sum(predicted_positive & actual_positive)
            nb_actual_positive = np.sum(actual_positive)
            nb_predicted_positive = np.sum(predicted_positive)

            # recall: nb_true_positive / real_nb_positive (0 when undefined)
            recall = nb_true_positive / nb_actual_positive if nb_actual_positive else 0.0
            # precision: nb_true_positive / nb_predicted_positive (0 when undefined)
            precision = nb_true_positive / nb_predicted_positive if nb_predicted_positive else 0.0

            # Avoid a NaN score when both precision and recall are zero.
            if precision + recall > 0:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0.0
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification,
    }

    resprinter.add(nystrom_results)