예제 #1
0
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    # verify sample size for evaluation
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None

    # Make nystrom approximation
    # nys_obj = Nystroem(gamma=gamma, n_components=landmarks.shape[0])
    # nys_obj.fit(landmarks)
    # nystrom_embedding = nys_obj.transform(sample)
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric, landmarks_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Create real kernel matrix
    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # evaluation reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm

    # start svm + nystrom classification
    if x_test is not None:
        logger.info("Start classification")

        x_train_nystrom_embedding = nystrom_transformation(x_train, landmarks, metric, landmarks_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, landmarks, metric, landmarks_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

    else:
        accuracy_nystrom_svm = None

    return reconstruction_error_nystrom, accuracy_nystrom_svm
예제 #2
0
    def test_kernel(self):
        # compute kernel with special rbf kernel
        # compute kernel with sklearn kernel
        # compute kernel between sparse_data_ and sparse_data
        # compute_kernel between sparse_data and data
        # compute kernel between sparse_data and random_data
        # compute_kernel between data and random_data

        # sklearn_kernel_first = rbf_kernel(self.data, self.data, self.gamma)
        # sklearn_kernel_verylittle = rbf_kernel(self.data_verylittle, self.data_verylittle)
        for name_pair, pair in self.pairs_data.items():
            data_norm = self.norm_data[name_pair]
            gamma = self.gamma_data[name_pair]
            sklearn_kernel = rbf_kernel(pair, pair, gamma=gamma)

            special_kernel = special_rbf_kernel(pair,
                                                pair,
                                                gamma=gamma,
                                                norm_X=data_norm,
                                                norm_Y=data_norm.T,
                                                exp_outside=False)
            special_kernel_flag = special_rbf_kernel(pair,
                                                     pair,
                                                     gamma=gamma,
                                                     norm_X=data_norm,
                                                     norm_Y=data_norm.T,
                                                     exp_outside=True)
            special_kernel[special_kernel < 1e-12] = 0
            special_kernel_flag[special_kernel_flag < 1e-12] = 0
            sklearn_kernel[sklearn_kernel < 1e-12] = 0

            equality = np.allclose(sklearn_kernel, special_kernel)
            equality_flag = np.allclose(sklearn_kernel, special_kernel_flag)

            delta = np.linalg.norm(special_kernel - sklearn_kernel)
            delta_flag = np.linalg.norm(special_kernel_flag - sklearn_kernel)
            print("Delta flag: {}; delta: {}".format(delta_flag, delta))

            self.assertTrue(delta_flag < delta)
            self.assertTrue(equality, msg=name_pair)
            self.assertTrue(equality_flag, msg=name_pair)
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids):
    """
    Evaluation Nystrom construction time and approximation precision.

    The approximation is based on a subsample of size n_sample of the input data set.

    :param x_train: Input dataset as ndarray.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactor object
    :param n_sample: The number of sample to take into account in the reconstruction (can't be too large)

    :return:
    """

    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using "
                       "data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    # Compute euristic gamma as the mean of euclidian distance between example
    gamma = compute_euristic_gamma(x_train)
    log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation")
    # precompute the centroids norm for later use (optimization)
    centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis]
    # centroids_norm = None

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None
    log_memory_usage("Memory after sample selection in make_nystrom_evaluation")

    ########################
    # Nystrom on centroids #
    ########################
    logger.info("Build Nystrom on centroids")
    ## TIME: nystrom build time
    # nystrom build time is Nystrom preparation time for later use.
    ## START
    nystrom_build_start_time = time.process_time()
    metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma)
    nystrom_build_stop_time = time.process_time()
    log_memory_usage("Memory after SVD computation in make_nystrom_evaluation")
    # STOP
    nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time

    ## TIME: nystrom inference time
    # Nystrom inference time is the time for Nystrom transformation for all the samples.
    ## START
    nystrom_inference_time_start = time.process_time()
    nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T
    nystrom_inference_time_stop = time.process_time()
    log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation")
    ## STOP
    nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample

    ################################################################

    ######################
    # Nystrom on uniform #
    ######################
    logger.info("Build Nystrom on uniform sampling")

    indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]]
    uniform_sample = x_train[indexes_uniform_samples]
    uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis]
    log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation")

    metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma)
    log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation")

    nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T

    #################################################################

    ###############
    # Real Kernel #
    ###############
    logger.info("Compute real kernel matrix")

    real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)
    log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation")

    #################################
    # Sklearn based Nystrom uniform #
    #################################

    # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0])
    # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample)
    # sklearn_transfo = sklearn_nystrom.transform(sample)
    # kernel_sklearn_nys = sklearn_transfo  @ sklearn_transfo.T

    ################################################################

    ####################
    # Error evaluation #
    ####################

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm
    sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm

    # svm evaluation
    if x_test is not None:
        logger.info("Start classification")

        time_classification_start = time.process_time()
        x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)
        time_classification_stop = time.process_time()

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]

        delta_time_classification = time_classification_stop - time_classification_start
    else:
        accuracy_nystrom_svm = None
        delta_time_classification = None

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm,
        "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform,
        "nystrom_svm_accuracy": accuracy_nystrom_svm,
        "nystrom_svm_time": delta_time_classification
    }

    resprinter.add(nystrom_results)