def generate_kernelized_tsne_mapping_function(parameters=settings.parameters,
                                              regenerate_parameters_cache=False
                                              ):
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    kernelized_tsne_parameters_cache = load_all_kernelized_tsne_embedders(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    def kernel_tsne_mapping(x, k=1):
        '''
        Kernelized t-SNE mapping. Computed from scratch, so all training data is used at once.
        '''
        # Going with the reliable (if not the fastest) option.

        cache = kernelized_tsne_parameters_cache["%.2f" % k]

        y = np.zeros((x.shape[0], Y_mnist.shape[1]))
        for i in range(len(x)):
            square_distances = np.sum((X_mnist - x[i, :])**2, axis=1).reshape(
                (1, -1))
            kernel_values = np.exp(-square_distances / (2 * cache['sigma']**2))
            kernel_values = kernel_values / np.sum(kernel_values)
            y[i, :] = kernel_values.dot(cache['coefs']).reshape(
                (-1, Y_mnist.shape[1]))
        return y

    return kernel_tsne_mapping
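

# Minimal usage sketch (hypothetical helper, toy input; k=0.25 assumes that multiplier is in the cache):
def example_kernel_tsne_usage(parameters=settings.parameters):
    """Show how the mapping returned above could be applied to new samples."""
    kernel_tsne_mapping = generate_kernelized_tsne_mapping_function(
        parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    # Hypothetical new high-dimensional samples with the same feature dimension as X_mnist
    new_x = np.random.rand(5, X_mnist.shape[1])
    # One 2D embedding per row; k selects which cached sigma multiplier / coefficient set is used
    return kernel_tsne_mapping(new_x, k=0.25)
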
def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    res['letter_samples'] = letter_samples
    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    res['nearest_neighbors_y_dist'] = np.min(D_Y, axis=1)  # Actually, whatever axis
    return res
def load_all_kernelized_tsne_embedders(parameters=settings.parameters,
                                       regenerate_parameters_cache=False):
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)

    # Implementing carefully. Not the fastest, but the most reliable way.
    kernelized_tsne_parameters_cache = dict()
    cache_filename = generate_cache_filename(parameters=parameters)

    if not regenerate_parameters_cache and os.path.isfile(cache_filename):
        with open(cache_filename, 'rb') as f:
            kernelized_tsne_parameters_cache = pickle.load(f)
    else:
        D = distance.squareform(distance.pdist(X_mnist))

        step = 0.01
        choice_K = np.arange(step, 2 + step, step)  # Candidate sigma multipliers k: 0.01, 0.02, ..., 2.00

        np.fill_diagonal(D, np.inf)
        closest_neighbor_dist = np.min(D, axis=1).reshape((1, -1))
        np.fill_diagonal(D, 0)

        # Sigma is a multiple of the closest-NN distance
        for k in choice_K:
            key = "%.2f" % k
            if key not in kernelized_tsne_parameters_cache or regenerate_parameters_cache:
                kernelized_tsne_parameters_cache[key] = dict()
                # Creating matrix to get coefficients using SLE
                sigma_matrix = k * np.repeat(
                    closest_neighbor_dist, X_mnist.shape[0], axis=0)

                kernel_matrix = np.exp(-D**2 / (2 * sigma_matrix**2))
                kernel_matrix = kernel_matrix / np.sum(
                    kernel_matrix, axis=1).reshape(
                        (-1, 1))  # Normalizing  by rows

                coefs = np.linalg.inv(kernel_matrix).dot(Y_mnist)
                kernelized_tsne_parameters_cache[key]['coefs'] = coefs
                kernelized_tsne_parameters_cache[key]['sigma'] = sigma_matrix[
                    0, :]
                logging.info("Got coefs for coefficient %f", k)
        with open(cache_filename, 'wb') as f:
            pickle.dump(kernelized_tsne_parameters_cache, f)
    return kernelized_tsne_parameters_cache
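

# Minimal illustrative sketch (toy data, hypothetical helper) of what the cache above stores:
# for each sigma multiplier k, coefficients A solving the linear system K A = Y, where K is the
# row-normalized Gaussian kernel matrix of the training set. Training points are then mapped
# back exactly, and a new point is embedded as its normalized kernel row times A.
def _kernel_tsne_toy_sketch():
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)  # toy "high-dimensional" training data
    Y = rng.rand(20, 2)  # toy "embedding" the coefficients must reproduce
    D = distance.squareform(distance.pdist(X))
    np.fill_diagonal(D, np.inf)
    sigma = 1.0 * np.min(D, axis=1)  # per-point sigma = k * closest-NN distance, with k = 1
    np.fill_diagonal(D, 0)
    K = np.exp(-D**2 / (2 * sigma[np.newaxis, :]**2))
    K = K / K.sum(axis=1, keepdims=True)  # normalize by rows
    A = np.linalg.solve(K, Y)  # same result as np.linalg.inv(K).dot(Y) above, but more stable
    assert np.allclose(K.dot(A), Y)  # training points are reproduced exactly
    x_new = rng.rand(5)  # toy out-of-sample point
    k_row = np.exp(-np.sum((X - x_new)**2, axis=1) / (2 * sigma**2))
    k_row = k_row / k_row.sum()
    return k_row.dot(A)  # its interpolated 2D embedding
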
def get_common_info(parameters):
    res = {}
    res['dTSNE_mnist'] = generate_data.load_dtsne_mnist(parameters=parameters)
    res['Y_mnist'] = generate_data.load_y_mnist(parameters=parameters)
    res['X_mnist'] = generate_data.load_x_mnist(parameters=parameters)
    res['labels_mnist'] = generate_data.load_labels_mnist(
        parameters=parameters)
    res['picked_neighbors'] = generate_data.load_picked_neighbors(
        parameters=parameters)
    res['picked_neighbors_labels'] = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    res['accuracy_nn'] = parameters.get("accuracy_nn",
                                        settings.parameters["accuracy_nn"])
    D_Y = distance.squareform(distance.pdist(res['Y_mnist']))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    res['nearest_neighbors_y_dist'] = np.min(D_Y,
                                             axis=1)  # Actually, whatever axis
    return res
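

# Minimal sketch (toy numbers, hypothetical helper) of how 'nearest_neighbors_y_dist' is used:
# later code scores an embedded test point by where its own distance to the nearest embedded
# training point falls within that distribution, via scipy.stats.percentileofscore.
def _distance_percentile_toy_sketch():
    nearest_neighbors_y_dist = np.array([0.1, 0.2, 0.3, 0.4, 0.5])
    test_point_nn_dist = 0.35  # hypothetical distance from a new embedded point to its nearest training point
    # Returns 60.0: three of the five training nearest-neighbor distances lie below 0.35
    return stats.percentileofscore(nearest_neighbors_y_dist, test_point_nn_dist)
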
def main(parameters=settings.parameters, regenerate=False):
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)

    result = dict()

    output_file = \
        cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(output_prefix, parameters)

    if os.path.isfile(output_file) and not regenerate:
        with open(output_file, "rb") as f:
            result = pickle.load(f)
            logging.info("Previous result loaded")
    else:
        logging.info("No previous result or regeneration requested")

    for fname_prefix in original_files_prefixes:
        cluster_results_file = \
            cluster_lion_RBF_IDW_commons.generate_cluster_results_filename(fname_prefix, parameters)
        logging.info("Processing file: %s", cluster_results_file)
        with open(cluster_results_file, 'rb') as f:
            res = pickle.load(f)
            for i in res.keys():
                logging.info("Processing method: %s", i)
                if i not in result or regenerate:

                    precision = calc_precision(res[i]["EmbeddedPoints"],
                                               X_mnist, Y_mnist,
                                               picked_neighbors, precision_nn)
                    logging.info("%s precision: %f (accuracy was %f)", i,
                                 precision, res[i]["Accuracy"])
                    result[i] = precision

                    with open(output_file, "wb") as f:
                        pickle.dump(result, f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn", settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn", settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== Helper: nearest neighbors in the embedding
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y) ** 2, axis=1)
        return np.argsort(y_distances)[:n]

    kernelized_results_file = exp_cluster_attr_test_kernelized.generate_cluster_results_filename(parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_tsne_method_results, kernelized_detailed_tsne_accuracy, \
        kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time, kernelized_detailed_tsne_method_list = pickle.load(f)
    ind = [4, 24, 49]
    kernelized_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] + kernelized_detailed_tsne_method_list[i][-8:]
        for i in ind]
    kernelized_method_results = [kernelized_detailed_tsne_method_results[i] for i in ind]

    kernelized_accuracy = np.zeros((len(kernelized_method_list),))
    kernelized_precision = np.zeros((len(kernelized_method_list),))
    kernelized_per_item_time = np.zeros((len(kernelized_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    kernelized_nearest_neighbors_percentiles_matrix = np.zeros((len(picked_neighbors), len(kernelized_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_method_results[j][i, :]
            kernelized_dist = np.min(np.sqrt(np.sum((Y_mnist - y) ** 2, axis=1)))
            kernelized_nearest_neighbors_percentiles_matrix[i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                                                            kernelized_dist)
    kernelized_distance_percentiles = np.mean(kernelized_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_method_list)):
        logging.info("%s %f", kernelized_method_list[j], kernelized_distance_percentiles[j])

    # ============================== Accuracy and precision
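    # Per-sample precision below: the overlap between the sample's precision_nn nearest neighbors
    # in the original space X and its precision_nn nearest neighbors in the embedding Y, divided
    # by precision_nn. Per-sample accuracy: the fraction of its accuracy_nn nearest embedded
    # training points whose label matches the expected label. Both are averaged over all
    # picked_neighbors for each method.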
    for j in range(len(kernelized_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors),))
        per_sample_precision = np.zeros((len(picked_neighbors),))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]

            y = kernelized_method_results[j][i,:]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x, X_mnist, n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y, Y_mnist, n=precision_nn)
            matching_indices = len([k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(kernelized_method_results[j][i,:], Y_mnist, n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(obtained_labels==expected_label) / len(obtained_labels)
        kernelized_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_precision[j] = np.mean(per_sample_precision)
        kernelized_per_item_time[j] = kernelized_detailed_tsne_time[j] / len(picked_neighbors)
        logging.info("%s :\t%f\t%f", kernelized_method_list[j], kernelized_precision[j],
                     kernelized_accuracy[j])

    kernelized_kl = np.zeros((len(kernelized_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_kernelized_performance_file = generate_kernelized_kl_temp_filename(parameters)
    if os.path.isfile(kl_kernelized_performance_file):
        with open(kl_kernelized_performance_file, 'rb') as f:
            kernelized_kl, processed_indices = pickle.load(f)

    # ============================== KL divergence
    # Computing the KL divergence increase for all 1000 samples is very slow; most of the time goes into building the P-matrix.
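    # For each sample: append it to X, recompute the P-matrix (and sigmas) for the extended
    # dataset at the same perplexity and cache it to disk; then, for every method, append that
    # method's embedding of the sample to Y and measure the KL divergence of the extended
    # embedding. The per-method averages of these KL values are taken at the end.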
    per_sample_KL = np.zeros((len(picked_neighbors),))
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape((1, -1))), axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate((Y_mnist, kernelized_method_results[j][i, :].reshape((1, -1))), axis=0)
            kernelized_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_performance_file, 'wb') as f:
            pickle.dump((kernelized_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_kl = np.mean(kernelized_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_accuracy, kernelized_precision,
                     kernelized_avg_kl, kernelized_per_item_time, kernelized_distance_percentiles),f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)

    # =================== ACCURACY
    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_cluster_attr_test_GD.generate_cluster_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (picked_neighbors_y_gd_transformed,
         picked_neighbors_y_gd_variance_recalc_transformed,
         picked_neighbors_y_gd_transformed_random,
         picked_neighbors_y_gd_variance_recalc_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed_random,
         picked_neighbors_y_gd_early_exagg_transformed,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_method_results = [
        picked_neighbors_y_gd_transformed,
        picked_neighbors_y_gd_transformed_random,
        picked_neighbors_y_gd_variance_recalc_transformed,
        picked_neighbors_y_gd_variance_recalc_transformed_random,
        picked_neighbors_y_gd_early_exagg_transformed,
        picked_neighbors_y_gd_early_exagg_transformed_random,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed,
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_cluster_attr_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed, \
        picked_neighbors_y_time_gd_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_early_exagg_transformed, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(picked_neighbors_y_time_gd_transformed),
        np.mean(picked_neighbors_y_time_gd_transformed_random),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed),
        np.mean(picked_neighbors_y_time_gd_variance_recalc_transformed_random),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed),
        np.mean(picked_neighbors_y_time_gd_early_exagg_transformed_random),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed
        ),
        np.mean(
            picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random
        ),
    ]

    gd_accuracy = np.zeros((len(gd_method_list),))
    gd_precision = np.zeros((len(gd_method_list),))

    # ============================== Distance percentiles
    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis
    gd_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(gd_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(gd_method_list)):
            y = gd_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    gd_distance_percentiles = np.mean(gd_nearest_neighbors_percentiles_matrix,
                                      axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s :\t%f", gd_method_list[j], gd_distance_percentiles[j])

    # ============================== Accuracy and precision
    for j in range(len(gd_method_results)):
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            nn_indices = get_nearest_neighbors_in_y(gd_method_results[j][i, :],
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            x = picked_neighbors[i, :]
            y = gd_method_results[j][i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [i for i in nn_x_indices if i in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

        gd_accuracy[j] = np.mean(per_sample_accuracy)
        gd_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f", gd_method_list[j], gd_precision[j],
                     gd_accuracy[j])

    gd_kl = np.zeros((len(gd_method_list), len(picked_neighbors)))

    processed_indices = list()

    kl_gd_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_performance_file):
        with open(kl_gd_performance_file, 'rb') as f:
            gd_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow; most of the time goes into building the P-matrix.
    per_sample_KL = np.zeros((len(picked_neighbors), ))
    for i in range(len(picked_neighbors)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set
            | settings.x_neighbors_selection_parameter_set, parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, picked_neighbors[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_method_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_method_results[j][i, :].reshape((1, -1))), axis=0)
            gd_kl[j,
                  i], _ = lion_tsne.kl_divergence_and_gradient(p_matrix=new_P,
                                                               y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_performance_file, 'wb') as f:
            pickle.dump((gd_kl, processed_indices), f)
    # This should be fast
    gd_avg_kl = np.mean(gd_kl, axis=1)

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_accuracy, gd_precision, gd_time,
                     gd_avg_kl, gd_distance_percentiles), f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_samples, _, _ = generate_data.load_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    # ============== KL Divergence
    gd_method_list = [
        r'Closest $Y_{init}$', r'Random $Y_{init}$',
        r'Closest $Y_{init}$; new $\sigma$',
        r'Random $Y_{init}$; new $\sigma$', r'Closest $Y_{init}$; EE',
        r'Random $Y_{init}$; EE', r'Closest $Y_{init}$; new $\sigma$; EE',
        r'Random $Y_{init}$; new $\sigma$; EE'
    ]

    gd_results_file = exp_letter_test_GD.generate_letter_results_filename(
        parameters=parameters)
    with open(gd_results_file, 'rb') as f:
        (letters_y_gd_transformed, letters_y_gd_variance_recalc_transformed,
         letters_y_gd_transformed_random,
         letters_y_gd_variance_recalc_transformed_random,
         letters_y_gd_early_exagg_transformed_random,
         letters_y_gd_early_exagg_transformed,
         letters_y_gd_variance_recalc_early_exagg_transformed_random,
         picked_random_starting_positions,
         letters_y_gd_variance_recalc_early_exagg_transformed,
         covered_samples) = pickle.load(f)

    gd_letters_results = [
        letters_y_gd_transformed,
        letters_y_gd_transformed_random,
        letters_y_gd_variance_recalc_transformed,
        letters_y_gd_variance_recalc_transformed_random,
        letters_y_gd_early_exagg_transformed,
        letters_y_gd_early_exagg_transformed_random,
        letters_y_gd_variance_recalc_early_exagg_transformed,
        letters_y_gd_variance_recalc_early_exagg_transformed_random,
    ]

    input_time_file = exp_letter_test_GD.generate_time_results_filename(
        parameters)
    with open(input_time_file, 'rb') as f:
        letters_y_time_gd_transformed, letters_y_time_gd_variance_recalc_transformed, \
        letters_y_time_gd_transformed_random, \
        letters_y_time_gd_variance_recalc_transformed_random, \
        letters_y_time_gd_early_exagg_transformed_random, \
        letters_y_time_gd_early_exagg_transformed, \
        letters_y_time_gd_variance_recalc_early_exagg_transformed_random, \
        letters_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples = pickle.load(f)

    gd_time = [
        np.mean(letters_y_time_gd_transformed),
        np.mean(letters_y_time_gd_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_transformed),
        np.mean(letters_y_time_gd_variance_recalc_transformed_random),
        np.mean(letters_y_time_gd_early_exagg_transformed),
        np.mean(letters_y_time_gd_early_exagg_transformed_random),
        np.mean(letters_y_time_gd_variance_recalc_early_exagg_transformed),
        np.mean(
            letters_y_time_gd_variance_recalc_early_exagg_transformed_random),
    ]

    gd_letters_kl = np.zeros((len(gd_method_list), len(letter_samples)))

    processed_indices = list()

    kl_gd_letters_performance_file = generate_gd_kl_temp_filename(parameters)
    if os.path.isfile(kl_gd_letters_performance_file):
        with open(kl_gd_letters_performance_file, 'rb') as f:
            gd_letters_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow; most of the time goes into building the P-matrix.
    per_sample_KL = np.zeros((len(letter_samples), ))
    for i in range(len(letter_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(gd_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, gd_letters_results[j][i, :].reshape((1, -1))),
                axis=0)
            gd_letters_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_gd_letters_performance_file, 'wb') as f:
            pickle.dump((gd_letters_kl, processed_indices), f)
    # This should be fast
    gd_avg_letters_kl = np.mean(gd_letters_kl, axis=1)

    # ============== Distance percentiles
    gd_letters_percentiles_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    gd_letters_distance_matrix = np.zeros(
        (len(letter_samples), len(gd_method_list)))
    for i in range(len(letter_samples)):
        for j in range(len(gd_method_list)):
            y = gd_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            gd_letters_distance_matrix[i, j] = nn_dist
            gd_letters_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    gd_letters_distance_percentiles = np.mean(gd_letters_percentiles_matrix,
                                              axis=0)
    gd_letters_distances = np.mean(gd_letters_distance_matrix, axis=0)
    for j in range(len(gd_method_list)):
        logging.info("%s: %f, %f", gd_method_list[j], gd_letters_distances[j],
                     gd_letters_distance_percentiles[j])

    output_file = generate_gd_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((gd_method_list, gd_time, gd_avg_letters_kl,
                     gd_letters_distance_percentiles), f)
def generate_idw_power_performance(*,
                                   regenerate=False,
                                   recursive_regenerate=False,
                                   parameters=settings.parameters):
    global_idw_power_performance = dict()  # Start from scratch
    global_idw_power_performance_abs = dict()  # Start from scratch
    global_idw_accuracy = dict()
    global_idw_precision = dict()

    start_time = datetime.datetime.now()
    logging.info("IDW power experiment started: %s", start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            (global_idw_power_performance, global_idw_power_performance_abs,
             global_idw_accuracy, global_idw_precision) = pickle.load(f)
    else:
        logging.info("Regeneration requested")

    for p in idw_power_options:
        if p in global_idw_power_performance:
            logging.info("Loaded p %f", p)
            continue

        logging.info("Processing p %f", p)

        interpolator = dTSNE_mnist.generate_embedding_function(
            embedding_function_type='weighted-inverse-distance',
            function_kwargs={'power': p})

        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))

        for i in range(len(picked_neighbors)):
            expected_label = picked_neighbor_labels[i]
            result = interpolator(picked_neighbors[i], verbose=0)
            nn_indices = get_nearest_neighbors_in_y(result,
                                                    Y_mnist,
                                                    n=accuracy_nn)
            obtained_labels = labels_mnist[nn_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)

            y = result
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)
        cur_acc = np.mean(per_sample_accuracy)
        cur_prec = np.mean(per_sample_precision)

        y_sum_square_dist = 0.0
        y_sum_abs_dist = 0.0
        y_abs_dist = 0.0
        y_count = 0.0
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :].copy()
            # distances[i] = np.inf #Not interested in distance to itself
            # Step 1. Find nearest neighbors in the neighborhood.
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            num_neighbors = len(neighbor_indices)
            weights = 1 / distances[neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])
            y_sum_square_dist += np.sum((cur_y_result - Y_mnist[i, :])**2)
            y_sum_abs_dist += np.sqrt(np.sum((cur_y_result - Y_mnist[i, :])**2))
            y_count += 1.0

        global_idw_power_performance[p] = y_sum_square_dist / y_count
        global_idw_power_performance_abs[p] = y_sum_abs_dist / y_count
        global_idw_accuracy[p] = cur_acc
        global_idw_precision[p] = cur_prec

        # Just in case it will become unstable due to too few neighbors
        # lion_power_plot_data[(p, perc)]['PowerSquareDistSum'] = y_sum_square_dist
        # lion_power_plot_data[(p, perc)]['PowerSquareDistCount'] = y_count

        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_power_performance,
                         global_idw_power_performance_abs, global_idw_accuracy,
                         global_idw_precision), f)

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_power_performance_abs
            if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_power_performance[closest_power[0]])
    idw_optimal_power = x_global[np.argmin(y)]

    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, y, idw_optimal_power), f)
    logging.info("IDW optimal power: %f", idw_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("IDW power experiment ended: %s", end_time)
    logging.info("IDW power experiment duration: %s", end_time - start_time)
import numpy as np

import cluster_lion_RBF_IDW_commons
import exp_cluster_attr_test_IDW_RBF
import exp_cluster_attr_test_LION
import exp_cluster_postprocess_Kernelized
import exp_lion_power_performance
import exp_cluster_postprocess_GD
import exp_cluster_postprocess_RBF_IDW_LION
import exp_cluster_attr_test_IDW_higher
import logging
import pickle

import generate_data
import settings

logging.basicConfig(level=logging.INFO)

parameters = settings.parameters
Y_mnist = generate_data.load_y_mnist(parameters=parameters)
picked_indices = generate_data.load_nearest_training_indices(
    parameters=parameters)
picked_indices_y_mnist = Y_mnist[picked_indices, :]
dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
baseline_accuracy = generate_data.get_baseline_accuracy(parameters=parameters)

lion_power_plot_data_file = exp_lion_power_performance.generate_lion_power_plot_filename(
    parameters=parameters)

baseline_precision = generate_data.load_baseline_precision(
    parameters=parameters)

with open(lion_power_plot_data_file, 'rb') as f:
    _, _, lion_optimal_power = pickle.load(f)
def main(parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    outlier_samples, _ = generate_data.load_outliers(parameters=parameters)

    nn_results_file = exp_outlier_test_NN.generate_outlier_results_filename(
        parameters)
    with open(nn_results_file, 'rb') as f:
        nn_outliers_results, nn_models_orig, nn_method_list = pickle.load(f)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    # ================ KL DIVERGENCE ===================
    nn_outliers_kl = np.zeros((len(nn_method_list), len(outlier_samples)))

    processed_indices = list()

    kl_nn_outliers_performance_file = generate_nn_kl_temp_filename(parameters)
    if os.path.isfile(kl_nn_outliers_performance_file):
        with open(kl_nn_outliers_performance_file, 'rb') as f:
            nn_outliers_kl, processed_indices = pickle.load(f)

    # Computing the KL divergence increase for all 1000 samples is very slow; most of the time goes into building the P-matrix.
    per_sample_KL = np.zeros((len(outlier_samples), ))
    for i in range(len(outlier_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.outlier_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, outlier_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(nn_outliers_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (nn_models_orig[j], nn_outliers_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            nn_outliers_kl[j, i], _ = lion_tsne.kl_divergence_and_gradient(
                p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_nn_outliers_performance_file, 'wb') as f:
            pickle.dump((nn_outliers_kl, processed_indices), f)
    # This should be fast
    nn_avg_outliers_kl = np.mean(nn_outliers_kl, axis=1)

    # ================ DISTANCE MATRICES ===================
    nn_outliers_percentiles_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    nn_outliers_distance_matrix = np.zeros(
        (len(outlier_samples), len(nn_method_list)))
    for i in range(len(outlier_samples)):
        for j in range(len(nn_method_list)):
            y = nn_outliers_results[j][i, :]
            nn_dist = np.min(
                np.sqrt(np.sum((nn_models_orig[j] - y)**2, axis=1)))
            nn_outliers_distance_matrix[i, j] = nn_dist
            nn_outliers_percentiles_matrix[i, j] = stats.percentileofscore(
                nearest_neighbors_y_dist, nn_dist)
    nn_outliers_distance_percentiles = np.mean(nn_outliers_percentiles_matrix,
                                               axis=0)
    nn_outliers_distances = np.mean(nn_outliers_distance_matrix, axis=0)
    for j in range(len(nn_method_list)):
        print(nn_method_list[j], nn_outliers_distances[j],
              nn_outliers_distance_percentiles[j])

    output_file = generate_nn_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((nn_method_list, nn_avg_outliers_kl,
                     nn_outliers_distance_percentiles), f)
def main(parameters=settings.parameters, only_time=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)

    # Doing this from scratch takes a really long time. If possible, save results and pre-load them.

    covered_samples = list()

    first_sample_inc = 0  # Change only if it is one of "Other notebooks just for parallelization"
    last_sample_exclusive = len(letter_A_samples)
    output_file = generate_letter_A_results_filename(parameters)
    output_time_file = generate_time_results_filename(parameters)

    letter_As_y_gd_transformed = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_variance_recalc_transformed = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_transformed_random = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_variance_recalc_transformed_random = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))

    letter_As_y_gd_early_exagg_transformed_random = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_early_exagg_transformed = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))
    letter_As_y_gd_variance_recalc_early_exagg_transformed = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))

    letter_As_random_starting_positions = np.zeros(
        (len(letter_A_samples), Y_mnist.shape[1]))

    letter_As_y_time_gd_transformed = np.zeros((len(letter_A_samples), ))
    letter_As_y_time_gd_variance_recalc_transformed = np.zeros(
        (len(letter_A_samples), ))
    letter_As_y_time_gd_transformed_random = np.zeros(
        (len(letter_A_samples), ))
    letter_As_y_time_gd_variance_recalc_transformed_random = np.zeros(
        (len(letter_A_samples), ))

    letter_As_y_time_gd_early_exagg_transformed_random = np.zeros(
        (len(letter_A_samples), ))
    letter_As_y_time_gd_early_exagg_transformed = np.zeros(
        (len(letter_A_samples), ))
    letter_As_y_time_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
        (len(letter_A_samples), ))
    letter_As_y_time_gd_variance_recalc_early_exagg_transformed = np.zeros(
        (len(letter_A_samples), ))
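
    # Eight gradient-descent variants are timed below: y initialized at the closest training
    # embedding (y='closest') vs. at a random position (y=y_start); sigmas kept vs. recalculated
    # (keep_sigmas=False); and early exaggeration explicitly disabled
    # (optimizer_kwargs={'early_exaggeration': None}) vs. left at the transform's default.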

    for i in range(first_sample_inc, last_sample_exclusive):
        # We reset the random seed every time, keyed to the ABSOLUTE sample number. Otherwise,
        # if partial results are loaded from file, the random sequence would shift depending on
        # which parts were loaded, and reproducibility would be lost. E.g. with seed(0) set once
        # before the loop and a fresh start, sample 0 gets sequence [abc] and sample 1 gets [def];
        # but if sample 0 had already been loaded from file, sample 1 would get [abc] instead.
        # Reproducibility should not depend on which parts were loaded.
        np.random.seed(i)
        logging.info(" ====================== Sample %d \n\n", i)
        if i in covered_samples:
            logging.info("Already loaded.")
        else:
            letter_A = letter_A_samples[i].reshape((1, -1))

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_transformed[i, :] = dTSNE_mnist.transform(
                letter_A,
                y='closest',
                verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_transformed[i] = (
                embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time: %f s", letter_As_y_time_gd_transformed[i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_variance_recalc_transformed[
                i, :] = dTSNE_mnist.transform(
                    letter_A,
                    keep_sigmas=False,
                    y='closest',
                    verbose=2,
                    optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_variance_recalc_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (VR): %f s",
                         letter_As_y_time_gd_variance_recalc_transformed[i])

            # Pick random starting positions anywhere in the embedding's range, not necessarily near the center.
            y_start = np.array([[
                np.random.uniform(np.min(Y_mnist[:, 0]), np.max(Y_mnist[:,
                                                                        0])),
                np.random.uniform(np.min(Y_mnist[:, 1]), np.max(Y_mnist[:, 1]))
            ]])

            letter_As_random_starting_positions[i, :] = y_start

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_transformed_random[i, :] = dTSNE_mnist.transform(
                letter_A,
                y=y_start,  # y='random',
                verbose=2,
                optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (random): %f s",
                         letter_As_y_time_gd_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_variance_recalc_transformed_random[
                i, :] = dTSNE_mnist.transform(
                    letter_A,
                    keep_sigmas=False,
                    y=y_start,
                    # y='random',
                    verbose=2,
                    optimizer_kwargs={'early_exaggeration': None})
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_variance_recalc_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info(
                "Time (VR, random): %f s",
                letter_As_y_time_gd_variance_recalc_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_early_exagg_transformed_random[
                i, :] = dTSNE_mnist.transform(
                    letter_A,
                    y=y_start,
                    # y='random',
                    verbose=2)
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_early_exagg_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (EE, random): %f s",
                         letter_As_y_time_gd_early_exagg_transformed_random[i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_early_exagg_transformed[
                i, :] = dTSNE_mnist.transform(letter_A, y='closest', verbose=2)
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_early_exagg_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info("Time (EE): %f s",
                         letter_As_y_time_gd_early_exagg_transformed[i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_variance_recalc_early_exagg_transformed_random[
                i, :] = dTSNE_mnist.transform(letter_A,
                                              y=y_start,
                                              keep_sigmas=False,
                                              verbose=2)
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_variance_recalc_early_exagg_transformed_random[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info(
                "Time (VR,EE,random): %f s",
                letter_As_y_time_gd_variance_recalc_early_exagg_transformed_random[
                    i])

            embedder_start_time = datetime.datetime.now()
            letter_As_y_gd_variance_recalc_early_exagg_transformed[
                i, :] = dTSNE_mnist.transform(letter_A,
                                              keep_sigmas=False,
                                              y='closest',
                                              verbose=2)
            embedder_end_time = datetime.datetime.now()
            letter_As_y_time_gd_variance_recalc_early_exagg_transformed[i] = \
                (embedder_end_time - embedder_start_time).total_seconds()
            logging.info(
                "Time (VR,EE): %f s",
                letter_As_y_time_gd_variance_recalc_early_exagg_transformed[i])

        covered_samples.append(i)
        logging.info("Saving...")
        # Gradient descent results take a long while. Let's cache.
        if not only_time:
            with open(output_file, 'wb') as f:
                pickle.dump((
                    letter_As_y_gd_transformed,
                    letter_As_y_gd_variance_recalc_transformed,
                    letter_As_y_gd_transformed_random,
                    letter_As_y_gd_variance_recalc_transformed_random,
                    letter_As_y_gd_early_exagg_transformed_random,
                    letter_As_y_gd_early_exagg_transformed,
                    letter_As_y_gd_variance_recalc_early_exagg_transformed_random,
                    letter_As_random_starting_positions,
                    letter_As_y_gd_variance_recalc_early_exagg_transformed,
                    covered_samples), f)
        with open(output_time_file, 'wb') as f:
            pickle.dump((
                letter_As_y_time_gd_transformed,
                letter_As_y_time_gd_variance_recalc_transformed,
                letter_As_y_time_gd_transformed_random,
                letter_As_y_time_gd_variance_recalc_transformed_random,
                letter_As_y_time_gd_early_exagg_transformed_random,
                letter_As_y_time_gd_early_exagg_transformed,
                letter_As_y_time_gd_variance_recalc_early_exagg_transformed_random,
                letter_As_y_time_gd_variance_recalc_early_exagg_transformed,
                covered_samples), f)
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION power experiment started: %s", start_time)

    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    lion_power_performance_data_file = generate_lion_power_performance_filename(
        parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)

    lion_power_performance_data = dict()  # Start from scratch

    X_mnist = generate_data.load_x_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    labels_mnist = generate_data.load_labels_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    dTSNE_mnist = generate_data.load_dtsne_mnist(
        parameters=settings.parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=settings.parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=settings.parameters)

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)

    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)

    for perc in lion_percentile_options:
        for p in lion_power_options:
            logging.info("Processing percentile and power: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()

            if 'Accuracy' not in lion_power_performance_data[key]:
                logging.info(
                    "Accuracy not found for power %f percentile %d. \tCalculating...",
                    p, perc)
                interpolator = dTSNE_mnist.generate_lion_tsne_embedder(
                    verbose=0,
                    random_state=0,
                    function_kwargs={
                        'radius_x_percentile': perc,
                        'power': p
                    })

                per_sample_accuracy = np.zeros((len(picked_neighbors), ))
                per_sample_precision = np.zeros((len(picked_neighbors), ))

                for i in range(len(picked_neighbors)):
                    # if i%100==0:
                    #    print("\tPower: ",p,"Processing:",i)
                    expected_label = picked_neighbor_labels[i]
                    result = interpolator(picked_neighbors[i], verbose=0)
                    nn_indices = get_nearest_neighbors_in_y(result,
                                                            Y_mnist,
                                                            n=accuracy_nn)
                    obtained_labels = labels_mnist[nn_indices]
                    per_sample_accuracy[i] = sum(
                        obtained_labels == expected_label) / len(
                            obtained_labels)

                    y = result
                    x = picked_neighbors[i, :]
                    nn_x_indices = get_nearest_neighbors_in_y(x,
                                                              X_mnist,
                                                              n=precision_nn)
                    nn_y_indices = get_nearest_neighbors_in_y(y,
                                                              Y_mnist,
                                                              n=precision_nn)
                    matching_indices = len(
                        [k for k in nn_x_indices if k in nn_y_indices])
                    per_sample_precision[i] = (matching_indices / precision_nn)

                cur_acc = np.mean(per_sample_accuracy)
                cur_prec = np.mean(per_sample_precision)
                # print('================= ',p,perc, cur_acc)
                lion_power_performance_data[key]['Accuracy'] = cur_acc
                lion_power_performance_data[key]['Precision'] = cur_prec
                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Accuracy FOUND for power %f percentile %d. Using loaded.",
                    p, perc)

            if 'PowerSquareDist' not in lion_power_performance_data[
                    key] or regenerate:
                logging.info(
                    "Power performance not found for power %f percentile %d.\tCalculating...",
                    p, perc)

                y_sum_square_dist = 0.0
                y_sum_abs_dist = 0.0
                y_count = 0.0
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[
                        i] = np.inf  # Not interested in distance to itself
                    # Step 1. Find nearest neighbors in the neighborhood.
                    neighbor_indices = np.where(distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        # We are good
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])
                        # Squared and absolute (Euclidean) distance between the
                        # re-embedded point and its original embedding position
                        y_sum_square_dist += np.sum((cur_y_result -
                                                     Y_mnist[i, :])**2)
                        y_sum_abs_dist += np.sqrt(
                            np.sum((cur_y_result - Y_mnist[i, :])**2))
                        y_count += 1.0
                new_dict = dict()
                new_dict['PowerSquareDist'] = y_sum_square_dist / y_count
                new_dict['PowerAbsDist'] = y_sum_abs_dist / y_count
                # Keep the raw sums as well, in case the averages become unstable when too few points have enough neighbors
                new_dict['PowerSquareDistSum'] = y_sum_square_dist
                new_dict['PowerSquareDistCount'] = y_count
                for ndk in new_dict.keys():
                    lion_power_performance_data[key][ndk] = new_dict[ndk]

                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.", p,
                    perc)

            logging.info("%s %s", key, lion_power_performance_data[key])

    lion_optimal_power = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            # print(cur_power, perc, lion_power_plot_data[key])
            y.append(lion_power_performance_data[key]['PowerSquareDist'])
        lion_power_plot_y[perc] = y
        lion_optimal_power[perc] = lion_power_options[np.argmin(y)]

    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power), f)
    logging.info("LION optimal power: %s", lion_optimal_power)

    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)
def main(*, regenerate=False, parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION debug letter experiment started: %s", start_time)

    common_info = get_common_info(parameters)
    results = dict()
    embedders = generate_all_embedders(common_info['dTSNE_mnist'])

    for embedder_name in embedders.keys():
        process_single_embedder(embedder=embedders[embedder_name],
                                embedder_name=embedder_name,
                                results=results,
                                regenerate=regenerate,
                                common_info=common_info,
                                parameters=parameters)

    end_time = datetime.datetime.now()
    logging.info("letter experiment ended: %s", end_time)
    logging.info("letter experiment duration: %s", end_time-start_time)

    _, _, lion_optimal_power = exp_lion_power_performance.load_lion_power_plot()
    lion_method_list = ["LION; $r_x$ at %dth perc.; $p$=%.1f" % (i, lion_optimal_power[i])
                        for i in sorted(lion_optimal_power)]

    lion90_name = [i for i in results.keys() if i.startswith('LION-90')][0]
    letters_y_lion90 = results[lion90_name]['EmbeddedPoints']
    lion95_name = [i for i in results.keys() if i.startswith('LION-95')][0]
    letters_y_lion95 = results[lion95_name]['EmbeddedPoints']
    lion99_name = [i for i in results.keys() if i.startswith('LION-99')][0]
    letters_y_lion99 = results[lion99_name]['EmbeddedPoints']
    lion100_name = [i for i in results.keys() if i.startswith('LION-100')][0]
    letters_y_lion100 = results[lion100_name]['EmbeddedPoints']

    cur_shown_letter_indices_begin = 0
    cur_shown_letter_indices_end = 20

    #for k in ['LION-90-16.4']:  # embedders.keys():
    #    print(k ,embedders[k](common_info['letter_samples']
    #                                [cur_shown_letter_indices_begin:cur_shown_letter_indices_end]))

    embedding_function = common_info['dTSNE_mnist'].generate_lion_tsne_embedder(
        function_kwargs={'radius_x_percentile': 90, 'power': 16.4}, random_state=90, verbose=2)
    #print(embedding_function(
    #    common_info['letter_samples'][cur_shown_letter_indices_begin:cur_shown_letter_indices_end], verbose=2))
    #print("\n\n\n")
    #print(letters_y_lion90[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, :])


    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    point_size_gray = 10
    point_size_interest = 15

    plt.figure(dpi=300)
    plt.gcf().set_size_inches(6.8, 6.8)

    font_properties = FontProperties()
    font_properties.set_family('serif')
    font_properties.set_name('Times New Roman')
    font_properties.set_size(8)

    plt.scatter(Y_mnist[:, 0], Y_mnist[:, 1], c='gray', zorder=1, label=None,
                marker='.', s=point_size_gray)
    h1 = plt.scatter(letters_y_lion90[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 0],
                     letters_y_lion90[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 1],
                     c='red', zorder=1, label=None, marker='.', s=point_size_interest)
    h2 = plt.scatter(letters_y_lion95[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 0],
                     letters_y_lion95[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 1],
                     c='blue', zorder=1, label=None, marker='.', s=point_size_interest)
    h3 = plt.scatter(letters_y_lion99[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 0],
                     letters_y_lion99[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 1],
                     c='green', zorder=1, label=None, marker='.', s=point_size_interest)
    h4 = plt.scatter(letters_y_lion100[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 0],
                     letters_y_lion100[cur_shown_letter_indices_begin:cur_shown_letter_indices_end, 1],
                     c='purple', zorder=1, label=None, marker='.', s=point_size_interest)
    plt.legend([h1, h2, h3, h4], lion_method_list, ncol=1, prop=font_properties,
               borderpad=0.1, handlelength=2, columnspacing=0, loc=1,
               handletextpad=-0.7, frameon=True)
    plt.show()
def train_or_load_models(regenerate_model1=False, regenerate_model2=False, regenerate_model3=False,
         parameters=settings.parameters):
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    keras_random_seed = parameters.get("keras_random_seed", settings.parameters["keras_random_seed"])

    # Reproducibility: parallel threads can introduce uncontrolled randomness.
    # The models here are small, so single-threaded execution is fast enough.
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    session = tf.Session(config=session_conf)
    tf.keras.backend.set_session(session)

    model1_weights_file_prefix = '../results/model1'
    model1_json_file_prefix = '../results/model1'
    model2_weights_file_prefix = '../results/model2'
    model2_json_file_prefix = '../results/model2'
    model3_weights_file_prefix = '../results/model3'
    model3_json_file_prefix = '../results/model3'

    model1_weights_file = model1_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model1_json_file = model1_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')
    model2_weights_file = model2_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model2_json_file = model2_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')
    model3_weights_file = model3_weights_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                      postfix='.hd5')
    model3_json_file = model3_json_file_prefix + generate_data.combine_prefixes(nn_model_prefixes, parameters,
                                                                                postfix='.json')

    if not os.path.isfile(model1_weights_file) or regenerate_model1:
        # 2 hidden layers, 250 nodes per layer, ReLU activation, dropout regularization with rate 0.25.

        set_all_random_seeds(keras_random_seed)
        model1 = keras.models.Sequential()
        model1.add(keras.layers.Dense(250, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model1.add(keras.layers.Dropout(0.25))
        model1.add(keras.layers.Dense(250, activation='relu', kernel_initializer='normal'))
        model1.add(keras.layers.Dropout(0.25))
        model1.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model1.compile(loss='mean_squared_error', optimizer='adam')
        model1.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model1_json_file, "w") as f:
            f.write(model1.to_json())
        model1.save_weights(model1_weights_file)
    else:
        with open(model1_json_file, "r") as f:
            model1 = keras.models.model_from_json(f.read())
        model1.load_weights(model1_weights_file)
        model1.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn1_mnist = model1.predict(X_mnist)

    if not os.path.isfile(model2_weights_file) or regenerate_model2:
        # 2 hidden layers, 500 nodes per layer, ReLU activation, dropout regularization with rate 0.5.
        set_all_random_seeds(keras_random_seed)
        model2 = keras.models.Sequential()
        model2.add(keras.layers.Dense(500, activation='relu', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model2.add(keras.layers.Dropout(0.5))
        model2.add(keras.layers.Dense(500, activation='relu', kernel_initializer='normal'))
        model2.add(keras.layers.Dropout(0.5))
        model2.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model2.compile(loss='mean_squared_error', optimizer='adam')
        model2.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model2_json_file, "w") as f:
            f.write(model2.to_json())
        model2.save_weights(model2_weights_file)
    else:
        with open(model2_json_file, "r") as f:
            model2 = keras.models.model_from_json(f.read())
        model2.load_weights(model2_weights_file)
        model2.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn2_mnist = model2.predict(X_mnist)

    if not os.path.isfile(model3_weights_file) or regenerate_model3:
        # 1 hidden layer, 500 nodes, tanh activation, no dropout.
        set_all_random_seeds(keras_random_seed)
        model3 = keras.models.Sequential()
        model3.add(keras.layers.Dense(500, activation='tanh', kernel_initializer='normal', input_dim=X_mnist.shape[1]))
        model3.add(keras.layers.Dense(Y_mnist.shape[1], kernel_initializer='normal'))
        model3.compile(loss='mean_squared_error', optimizer='adam')
        model3.fit(X_mnist, Y_mnist,
                   epochs=5000,
                   verbose=1,
                   validation_data=(X_mnist, Y_mnist))
        with open(model3_json_file, "w") as f:
            f.write(model3.to_json())
        model3.save_weights(model3_weights_file)
    else:
        with open(model3_json_file, "r") as f:
            model3 = keras.models.model_from_json(f.read())
        model3.load_weights(model3_weights_file)
        model3.compile(loss='mean_squared_error', optimizer='adam')

    Y_nn3_mnist = model3.predict(X_mnist)
    return {"models" : (model1, model2, model3), "Y_predicted" : (Y_nn1_mnist, Y_nn2_mnist, Y_nn3_mnist)}
def main(regenerate, only_time, parameters=settings.parameters):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(parameters=parameters)

    # Doing this from scratch takes a really long time. If possible, save results and pre-load them.

    # These are consequences of parallelization
    # input_files = ['gd_results' + str(100 * i) + '_' + str(100 * i + 100) + '.p' for i in range(10)]
    output_file = generate_cluster_results_filename(parameters)
    output_time_file = generate_time_results_filename(parameters)

    first_sample_inc = 0  # Change only in the parallelized copies of this notebook
    last_sample_exclusive = len(picked_neighbors)

    if os.path.isfile(output_file) and not regenerate:
        logging.info("Found previous partially completed test. Starting from there.")
        with open(output_file, 'rb') as f:
            (picked_neighbors_y_gd_transformed, picked_neighbors_y_gd_variance_recalc_transformed,
             picked_neighbors_y_gd_transformed_random,
             picked_neighbors_y_gd_variance_recalc_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed_random,
             picked_neighbors_y_gd_early_exagg_transformed,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
             picked_random_starting_positions,
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed, covered_samples) = pickle.load(f)
        with open(output_time_file, 'rb') as f:
            (picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed,
             picked_neighbors_y_time_gd_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_early_exagg_transformed,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples) = pickle.load(f)
    else:
        logging.info("No previous partially completed test, or regeneration requested. Starting from scratch.")
        covered_samples = list()

        # Let's build all possible combinations. Later we'll decide what to plot
        picked_neighbors_y_gd_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

        picked_neighbors_y_gd_early_exagg_transformed_random = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_early_exagg_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random = np.zeros(
             (len(picked_neighbors), Y_mnist.shape[1]))
        picked_neighbors_y_gd_variance_recalc_early_exagg_transformed = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

        picked_neighbors_y_time_gd_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_transformed_random = np.zeros((len(picked_neighbors), ))

        picked_neighbors_y_time_gd_early_exagg_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_early_exagg_transformed = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random = np.zeros((len(picked_neighbors), ))
        picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed = np.zeros((len(picked_neighbors), ))

        picked_random_starting_positions = np.zeros((len(picked_neighbors), Y_mnist.shape[1]))

    for i in range(first_sample_inc, last_sample_exclusive):
         np.random.seed(i)
         # Reset the random seed for every sample, keyed to the ABSOLUTE sample number.
         # If we seeded once before the loop, the random sequence consumed by each sample
         # would depend on how many earlier samples were loaded from the cache rather than
         # recomputed, and reproducibility would be lost across partial restarts.
         # Per-sample seeding keeps each sample's randomness independent of what was loaded.
         logging.info(" ====================== Sample %d\n\n", i)
         if i in covered_samples and not regenerate:
             logging.info("Already loaded.")
         else:
             neighbor = picked_neighbors[i].reshape((1, -1))

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_transformed[i, :] = dTSNE_mnist.transform(neighbor, y='closest',
                                                                             verbose=2,
                                                                             optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_transformed[i] = (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time: %f s", picked_neighbors_y_time_gd_transformed[i])
             

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_transformed[i, :] = dTSNE_mnist.transform(neighbor, keep_sigmas=False,
                      y='closest',
                      verbose=2, optimizer_kwargs={'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR): %f s", picked_neighbors_y_time_gd_variance_recalc_transformed[i])

             # Pick a random starting position anywhere in the embedding, not necessarily near the center.
             y_start = np.array([[
                 np.random.uniform(np.min(Y_mnist[:, 0]), np.max(Y_mnist[:, 0])),
                 np.random.uniform(np.min(Y_mnist[:, 1]), np.max(Y_mnist[:, 1]))
             ]])

             picked_random_starting_positions[i, :] = y_start

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_transformed_random[i, :] = dTSNE_mnist.transform(neighbor, y=y_start,  # y='random',
                    verbose=2, optimizer_kwargs={
                     'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (random): %f s", picked_neighbors_y_time_gd_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_transformed_random[i, :] = dTSNE_mnist.transform(neighbor,
                    keep_sigmas=False,
                    y=y_start,  # y='random',
                    verbose=2,
                    optimizer_kwargs={
                        'early_exaggeration': None})
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR,random): %f s", picked_neighbors_y_time_gd_variance_recalc_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_early_exagg_transformed_random[i, :] = dTSNE_mnist.transform(neighbor, y=y_start,
                                                                                                # y='random',
                                                                                                verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_early_exagg_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (EE,random): %f s", picked_neighbors_y_time_gd_early_exagg_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_early_exagg_transformed[i, :] = dTSNE_mnist.transform(neighbor, y='closest', verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_early_exagg_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (EE): %f s", picked_neighbors_y_time_gd_early_exagg_transformed[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random[i, :] = \
                 dTSNE_mnist.transform(neighbor,
                                        y=y_start,
                                        keep_sigmas=False,
                                        verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR, EE, random): %f s",
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random[i])

             embedder_start_time = datetime.datetime.now()
             picked_neighbors_y_gd_variance_recalc_early_exagg_transformed[i, :] = dTSNE_mnist.transform(
                 neighbor, keep_sigmas=False, y='closest', verbose=2)
             embedder_end_time = datetime.datetime.now()
             picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i] = \
                 (embedder_end_time - embedder_start_time).total_seconds()
             logging.info("Time (VR, EE): %f s",
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed[i])

             covered_samples.append(i)
         # Re-saving even if it is a loaded sample
         logging.info("Saving...")
         # Gradient descent results take a long while. Let's cache.
         if not only_time:
            with open(output_file, 'wb') as f:
                pickle.dump((picked_neighbors_y_gd_transformed, picked_neighbors_y_gd_variance_recalc_transformed,
                          picked_neighbors_y_gd_transformed_random, picked_neighbors_y_gd_variance_recalc_transformed_random,
                          picked_neighbors_y_gd_early_exagg_transformed_random,
                          picked_neighbors_y_gd_early_exagg_transformed,
                          picked_neighbors_y_gd_variance_recalc_early_exagg_transformed_random,
                          picked_random_starting_positions,
                          picked_neighbors_y_gd_variance_recalc_early_exagg_transformed, covered_samples), f)
         with open(output_time_file, 'wb') as f:
             pickle.dump((picked_neighbors_y_time_gd_transformed, picked_neighbors_y_time_gd_variance_recalc_transformed,
                          picked_neighbors_y_time_gd_transformed_random,
                          picked_neighbors_y_time_gd_variance_recalc_transformed_random,
                          picked_neighbors_y_time_gd_early_exagg_transformed_random,
                          picked_neighbors_y_time_gd_early_exagg_transformed,
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed_random,
                          picked_neighbors_y_time_gd_variance_recalc_early_exagg_transformed, covered_samples), f)
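# --- Hedged usage sketch (not in the original source): summarizing the timing
# --- cache written by main() above.  Assumes generate_time_results_filename()
# --- and the same 9-tuple layout that main() pickles into output_time_file.
def summarize_gd_times(parameters=settings.parameters):
    import pickle
    import numpy as np
    with open(generate_time_results_filename(parameters), 'rb') as f:
        times = pickle.load(f)
    covered_samples = times[-1]
    labels = ["GD", "GD (VR)", "GD (random)", "GD (VR, random)",
              "GD (EE, random)", "GD (EE)", "GD (VR, EE, random)", "GD (VR, EE)"]
    for label, t in zip(labels, times[:-1]):
        # Only samples listed in covered_samples have been timed; the rest are still zeros.
        print("%s: mean %.3f s over %d samples" %
              (label, np.mean(t[covered_samples]), len(covered_samples)))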
def main(parameters=settings.parameters, regenerate_parameters_cache=False):
    step = 0.01
    choice_K = np.arange(step, 2 + step, step)  # Let's try those K.

    logging.info("Started loading.")
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)
    picked_neighbors = generate_data.load_picked_neighbors(
        parameters=parameters)
    picked_neighbor_labels = generate_data.load_picked_neighbors_labels(
        parameters=parameters)
    accuracy_nn = parameters.get("accuracy_nn",
                                 settings.parameters["accuracy_nn"])
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])
    labels_mnist = generate_data.load_labels_mnist(parameters=parameters)
    baseline_accuracy = generate_data.get_baseline_accuracy(
        parameters=parameters)
    logging.info("Loaded everything.")

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    def get_nearest_neighbors_in_y(y, Y_mnist, n=10):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        return np.argsort(y_distances)[:n]

    # Implementing carefully. Not the fastest, but the most reliable way.

    kernel_tsne_mapping = kernelized_tsne.generate_kernelized_tsne_mapping_function(
        parameters=parameters,
        regenerate_parameters_cache=regenerate_parameters_cache)

    kernelized_detailed_tsne_method_list = [
        "Kernelized tSNE; K=%.2f" % (k) for k in choice_K
    ]
    kernelized_detailed_tsne_method_results = list()

    kernelized_detailed_tsne_accuracy = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_precision = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))
    kernelized_detailed_tsne_time = np.zeros(
        (len(kernelized_detailed_tsne_method_list), ))

    for j in range(len(choice_K)):
        k = choice_K[j]
        logging.info("%f", k)

        embedder_start_time = datetime.datetime.now()
        kernelized_detailed_tsne_method_results.append(
            kernel_tsne_mapping(picked_neighbors, k=k))
        embedder_end_time = datetime.datetime.now()
        kernelized_detailed_tsne_time[j] = (
            embedder_end_time - embedder_start_time).total_seconds()
        logging.info("%f complete", k)
        #kernelized_detailed_tsne_method_results = [kernel_tsne_mapping(picked_neighbors, k=k) for k in choice_K]

        logging.info("%s", kernelized_detailed_tsne_method_list[j])
        per_sample_accuracy = np.zeros((len(picked_neighbors), ))
        per_sample_precision = np.zeros((len(picked_neighbors), ))
        for i in range(len(picked_neighbors)):
            if i % 200 == 0:
                logging.info("%d", i)
            expected_label = picked_neighbor_labels[i]
            y = kernelized_detailed_tsne_method_results[j][i, :]
            x = picked_neighbors[i, :]
            nn_x_indices = get_nearest_neighbors_in_y(x,
                                                      X_mnist,
                                                      n=precision_nn)
            nn_y_indices = get_nearest_neighbors_in_y(y,
                                                      Y_mnist,
                                                      n=precision_nn)
            matching_indices = len(
                [k for k in nn_x_indices if k in nn_y_indices])
            per_sample_precision[i] = (matching_indices / precision_nn)

            kernelized_indices = get_nearest_neighbors_in_y(
                kernelized_detailed_tsne_method_results[j][i, :],
                Y_mnist,
                n=accuracy_nn)
            obtained_labels = labels_mnist[kernelized_indices]
            per_sample_accuracy[i] = sum(
                obtained_labels == expected_label) / len(obtained_labels)
        kernelized_detailed_tsne_accuracy[j] = np.mean(per_sample_accuracy)
        kernelized_detailed_tsne_precision[j] = np.mean(per_sample_precision)
        logging.info("%s :\t%f\t%f\t%f s",
                     kernelized_detailed_tsne_method_list[j],
                     kernelized_detailed_tsne_precision[j],
                     kernelized_detailed_tsne_accuracy[j],
                     kernelized_detailed_tsne_time[j])

    # Accuracy-vs-power plot
    legend_list = list()
    f, ax = plt.subplots()
    f.set_size_inches(6, 3)
    x = [k for k in choice_K]  # Ensuring order
    y = kernelized_detailed_tsne_accuracy
    # plt.title("IDW - Accuracy vs Power") # We'd better use figure caption
    # ax.legend([h1,h2,h3,h4,h5,h6], ["Closest Training Set Image"]+idw_method_list)
    plt.plot(x, y, c='blue')
    h = plt.axhline(y=baseline_accuracy, c='black', linestyle='--')
    plt.legend([h], ["Baseline Accuracy (%.4f)" % baseline_accuracy])
    plt.xlabel("Kernelized tSNE: K parameter")
    plt.ylabel("10-NN Accuracy")
    plt.ylim([0, 1])
    plt.xlim([0, 2])
    f.tight_layout()
    plt.savefig("../figures/kernelized-tsne-K-vs-accuracy.png")

    ind = [4, 24, 49]
    kernelized_tsne_method_list = [
        kernelized_detailed_tsne_method_list[i][:10] +
        kernelized_detailed_tsne_method_list[i][-8:] for i in ind
    ]
    kernelized_tsne_method_results = [
        kernelized_detailed_tsne_method_results[i] for i in ind
    ]

    kernelized_tsne_nearest_neighbors_percentiles_matrix = np.zeros(
        (len(picked_neighbors), len(kernelized_tsne_method_list)))
    for i in range(len(picked_neighbors)):
        for j in range(len(kernelized_tsne_method_list)):
            y = kernelized_tsne_method_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_tsne_nearest_neighbors_percentiles_matrix[
                i, j] = stats.percentileofscore(nearest_neighbors_y_dist,
                                                nn_dist)
    kernelized_tsne_distance_percentiles = np.mean(
        kernelized_tsne_nearest_neighbors_percentiles_matrix, axis=0)
    for j in range(len(kernelized_tsne_method_list)):
        print(kernelized_tsne_method_list[j],
              kernelized_tsne_distance_percentiles[j])

    output_file = generate_cluster_results_filename(parameters)
    with open(output_file, 'wb') as f:
        pickle.dump(
            (kernelized_detailed_tsne_method_results,
             kernelized_detailed_tsne_accuracy,
             kernelized_detailed_tsne_precision, kernelized_detailed_tsne_time,
             kernelized_detailed_tsne_method_list), f)
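# --- Hedged usage sketch (not in the original source): reading the results
# --- pickled by main() above and reporting which K gave the best 10-NN
# --- accuracy.  Assumes the same generate_cluster_results_filename() helper
# --- used in this main().
def best_kernelized_k(parameters=settings.parameters):
    import pickle
    import numpy as np
    with open(generate_cluster_results_filename(parameters), 'rb') as f:
        (_, accuracy, precision, time_taken, method_list) = pickle.load(f)
    best = int(np.argmax(accuracy))
    print("Best: %s (accuracy %.4f, precision %.4f, %.2f s)" %
          (method_list[best], accuracy[best], precision[best], time_taken[best]))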
def main(*, regenerate=False, parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("IDW/RBF/LION letter_A experiment started: %s", start_time)

    common_info = get_common_info(parameters)
    results = dict()
    embedders = generate_all_embedders(common_info['dTSNE_mnist'])

    for embedder_name in embedders.keys():
        process_single_embedder(embedder=embedders[embedder_name],
                                embedder_name=embedder_name,
                                results=results,
                                regenerate=regenerate,
                                common_info=common_info,
                                parameters=parameters)

    end_time = datetime.datetime.now()
    logging.info("letter_A experiment ended: %s", end_time)
    logging.info("letter_A experiment duration: %s", end_time - start_time)

    _, _, lion_optimal_power = exp_lion_power_performance.load_lion_power_plot(
    )
    lion_method_list = [
        "LION; $r_x$ at %dth perc.; $p$=%.1f" % (i, lion_optimal_power[i])
        for i in sorted(lion_optimal_power)
    ]

    lion90_name = [i for i in results.keys() if i.startswith('LION-90')][0]
    letter_As_y_lion90 = results[lion90_name]['EmbeddedPoints']
    lion95_name = [i for i in results.keys() if i.startswith('LION-95')][0]
    letter_As_y_lion95 = results[lion95_name]['EmbeddedPoints']
    lion99_name = [i for i in results.keys() if i.startswith('LION-99')][0]
    letter_As_y_lion99 = results[lion99_name]['EmbeddedPoints']
    lion100_name = [i for i in results.keys() if i.startswith('LION-100')][0]
    letter_As_y_lion100 = results[lion100_name]['EmbeddedPoints']

    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    point_size_gray = 10
    cur_shown_letter_A_indices = 100
    point_size_interest = 15

    plt.figure(dpi=300)
    plt.gcf().set_size_inches(6.8, 6.8)

    font_properties = FontProperties()
    font_properties.set_family('serif')
    font_properties.set_name('Times New Roman')
    font_properties.set_size(8)

    plt.scatter(Y_mnist[:, 0],
                Y_mnist[:, 1],
                c='gray',
                zorder=1,
                label=None,
                marker='.',
                s=point_size_gray)
    h1 = plt.scatter(letter_As_y_lion90[:cur_shown_letter_A_indices, 0],
                     letter_As_y_lion90[:cur_shown_letter_A_indices, 1],
                     c='red',
                     zorder=1,
                     label=None,
                     marker='.',
                     s=point_size_interest)
    h2 = plt.scatter(letter_As_y_lion95[:cur_shown_letter_A_indices, 0],
                     letter_As_y_lion95[:cur_shown_letter_A_indices, 1],
                     c='blue',
                     zorder=1,
                     label=None,
                     marker='.',
                     s=point_size_interest)
    h3 = plt.scatter(letter_As_y_lion99[:cur_shown_letter_A_indices, 0],
                     letter_As_y_lion99[:cur_shown_letter_A_indices, 1],
                     c='green',
                     zorder=1,
                     label=None,
                     marker='.',
                     s=point_size_interest)
    h4 = plt.scatter(letter_As_y_lion100[:cur_shown_letter_A_indices, 0],
                     letter_As_y_lion100[:cur_shown_letter_A_indices, 1],
                     c='purple',
                     zorder=1,
                     label=None,
                     marker='.',
                     s=point_size_interest)
    plt.legend([h1, h2, h3, h4],
               lion_method_list,
               ncol=1,
               prop=font_properties,
               borderpad=0.1,
               handlelength=2,
               columnspacing=0,
               loc=1,
               handletextpad=-0.7,
               frameon=True)
    plt.show()
import matplotlib.pyplot as plt
import generate_data
from matplotlib.font_manager import FontProperties

labels_mnist = generate_data.load_labels_mnist()
Y_mnist = generate_data.load_y_mnist()

plt.figure(dpi=300)
font_properties = FontProperties()
font_properties.set_family('serif')
font_properties.set_name('Times New Roman')
font_properties.set_size(9)

plt.xlim([-180, 180])
plt.ylim([-150, 170])

plt.gcf().set_size_inches(
    2.5, 2.1)  # Set the plot size so it just fits the paper margins
legend_list = list()
for l in sorted(set(labels_mnist)):  # sort after dedup so legend entries follow label order
    plt.scatter(Y_mnist[labels_mnist == l, 0],
                Y_mnist[labels_mnist == l, 1],
                marker='.',
                s=5)
    legend_list.append(str(l))
#plt.title("MNIST Dataset - TSNE visualization")
#plt.tight_layout()

l = plt.legend(legend_list,
               bbox_to_anchor=(0.99, 1.025),
               markerscale=8,
               prop=font_properties)  # closing arguments assumed; the original call was cut off here
def generate_idw_power_performance(*,
                                   regenerate=False,
                                   recursive_regenerate=False,
                                   parameters=settings.parameters):
    global_idw_precision_by_y = dict()
    global_idw_precision_by_x = dict()

    start_time = datetime.datetime.now()
    logging.info("IDW internal precision power experiment started: %s",
                 start_time)
    idw_power_performance_file = generate_idw_power_filename(parameters)
    idw_power_plot_file = generate_idw_power_plot_filename(parameters)

    X_mnist = generate_data.load_x_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in idw_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)

    if os.path.isfile(idw_power_performance_file) and not regenerate:
        with open(idw_power_performance_file, 'rb') as f:
            global_idw_precision_by_x, global_idw_precision_by_y = pickle.load(
                f)
    else:
        logging.info("No cached results found, or regeneration requested. Starting from scratch.")

    for p in idw_power_options:
        if p in global_idw_precision_by_x:
            logging.info("Loaded p %f %f %f", p, global_idw_precision_by_x[p],
                         global_idw_precision_by_y[p])
            continue

        logging.info("Processing p %f", p)

        per_sample_precision_x = list()
        per_sample_precision_y = list()
        for i in range(len(X_mnist)):
            distances = distance_matrix[i, :].copy()
            # distances[i] = np.inf #Not interested in distance to itself
            # Global IDW: every other training point acts as a neighbor
            neighbor_indices = list(range(X_mnist.shape[0]))
            neighbor_indices.remove(i)
            weights = 1 / distances[neighbor_indices]**p
            weights = weights / np.sum(weights)
            cur_y_result = weights.dot(Y_mnist[neighbor_indices, :])

            nn_xreal_indices = get_nearest_neighbors(X_mnist[i, :],
                                                     X_mnist,
                                                     n=precision_nn,
                                                     exclude_index=i)
            nn_yreal_indices = get_nearest_neighbors(Y_mnist[i, :],
                                                     Y_mnist,
                                                     n=precision_nn,
                                                     exclude_index=i)
            nn_yembedded_indices = get_nearest_neighbors(cur_y_result,
                                                         Y_mnist,
                                                         n=precision_nn,
                                                         exclude_index=i)
            matching_indices_xreal_yembedded = len(
                [j for j in nn_xreal_indices if j in nn_yembedded_indices])
            matching_indices_yreal_yembedded = len(
                [j for j in nn_yreal_indices if j in nn_yembedded_indices])
            per_sample_precision_x.append(matching_indices_xreal_yembedded /
                                          precision_nn)
            per_sample_precision_y.append(matching_indices_yreal_yembedded /
                                          precision_nn)

        global_idw_precision_by_x[p] = np.mean(per_sample_precision_x)
        global_idw_precision_by_y[p] = np.mean(per_sample_precision_y)

        # Just in case it will become unstable due to too few neighbors
        # lion_power_plot_data[(p, perc)]['PowerSquareDistSum'] = y_sum_square_dist
        # lion_power_plot_data[(p, perc)]['PowerSquareDistCount'] = y_count

        with open(idw_power_performance_file, 'wb') as f:
            pickle.dump((global_idw_precision_by_x, global_idw_precision_by_y),
                        f)

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_precision_by_x if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_precision_by_x[closest_power[0]])
    idw_optimal_power_precision_by_x = x_global[np.argmax(y)]
    precision_plot_by_x = y

    EPS = 1e-5
    y = list()
    x_global = list()
    for cur_power in idw_power_options:
        closest_power = [
            i for i in global_idw_precision_by_y if np.abs(i - cur_power) < EPS
        ]
        if len(closest_power) > 0:
            x_global.append(cur_power)
            y.append(global_idw_precision_by_y[closest_power[0]])
    idw_optimal_power_precision_by_y = x_global[np.argmax(y)]
    precision_plot_by_y = y

    with open(idw_power_plot_file, 'wb') as f:
        pickle.dump((x_global, precision_plot_by_x, precision_plot_by_y,
                     idw_optimal_power_precision_by_x,
                     idw_optimal_power_precision_by_y), f)
    logging.info("IDW optimal power (precision by X): %f",
                 idw_optimal_power_precision_by_x)
    logging.info("IDW optimal power (precision by Y): %f",
                 idw_optimal_power_precision_by_y)

    end_time = datetime.datetime.now()
    logging.info("IDW internal precision power experiment ended: %s", end_time)
    logging.info("IDW internal precision power experiment duration: %s",
                 end_time - start_time)
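# --- Hedged usage sketch (not in the original source): plotting the IDW
# --- precision-vs-power curves cached above.  Assumes the same
# --- generate_idw_power_plot_filename() helper and that matplotlib.pyplot is
# --- available as plt, as elsewhere in this file.
def plot_idw_power_precision(parameters=settings.parameters):
    import pickle
    with open(generate_idw_power_plot_filename(parameters), 'rb') as f:
        (x_global, precision_plot_by_x, precision_plot_by_y,
         idw_optimal_power_precision_by_x,
         idw_optimal_power_precision_by_y) = pickle.load(f)
    plt.plot(x_global, precision_plot_by_x,
             label="Precision by X (optimal p=%.2f)" % idw_optimal_power_precision_by_x)
    plt.plot(x_global, precision_plot_by_y,
             label="Precision by Y (optimal p=%.2f)" % idw_optimal_power_precision_by_y)
    plt.xlabel("IDW power")
    plt.ylabel("Internal precision")
    plt.legend()
    plt.show()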
def main(parameters=settings.parameters, regenerate=False):
    dTSNE_mnist = generate_data.load_dtsne_mnist(parameters=parameters)
    Y_mnist = generate_data.load_y_mnist(parameters=parameters)
    letter_A_samples, _ = generate_data.load_A_letters(parameters=parameters)
    X_mnist = generate_data.load_x_mnist(parameters=parameters)

    D_Y = distance.squareform(distance.pdist(Y_mnist))
    # Now find distance to closest neighbor
    np.fill_diagonal(D_Y, np.inf)  # ... but not to itself
    nearest_neighbors_y_dist = np.min(D_Y, axis=1)  # Actually, whatever axis

    kernelized_results_file = exp_letter_A_test_kernelized.generate_letter_A_results_filename(
        parameters)
    with open(kernelized_results_file, 'rb') as f:
        kernelized_detailed_method_results, kernelized_detailed_tsne_time, kernelized_detailed_method_list = pickle.load(
            f)
    ind = [4, 24, 49]

    kernelized_method_list = [
        kernelized_detailed_method_list[i][:10] +
        kernelized_detailed_method_list[i][-8:] for i in ind
    ]
    kernelized_letters_results = [
        kernelized_detailed_method_results[i] for i in ind
    ]

    # =========== DISTANCE PERCENTILES ==========
    kernelized_letters_percentiles_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    kernelized_letters_distance_matrix = np.zeros(
        (len(letter_A_samples), len(kernelized_method_list)))
    for i in range(len(letter_A_samples)):
        for j in range(len(kernelized_method_list)):
            y = kernelized_letters_results[j][i, :]
            nn_dist = np.min(np.sqrt(np.sum((Y_mnist - y)**2, axis=1)))
            kernelized_letters_distance_matrix[i, j] = nn_dist
            kernelized_letters_percentiles_matrix[i,
                                                  j] = stats.percentileofscore(
                                                      nearest_neighbors_y_dist,
                                                      nn_dist)
    kernelized_letters_distance_percentiles = np.mean(
        kernelized_letters_percentiles_matrix, axis=0)
    kernelized_letters_distances = np.mean(kernelized_letters_distance_matrix,
                                           axis=0)
    kernelized_per_item_time = kernelized_detailed_tsne_time / len(
        letter_A_samples)
    for j in range(len(kernelized_method_list)):
        logging.info("%s: %f, %f", kernelized_method_list[j],
                     kernelized_letters_distances[j],
                     kernelized_letters_distance_percentiles[j])

    kernelized_letters_kl = np.zeros(
        (len(kernelized_method_list), len(letter_A_samples)))
    processed_indices = list()

    kl_kernelized_tsne_letters_performance_file = generate_kernelized_kl_temp_filename(
        parameters)
    if os.path.isfile(
            kl_kernelized_tsne_letters_performance_file) and not regenerate:
        with open(kl_kernelized_tsne_letters_performance_file, 'rb') as f:
            kernelized_letters_kl, processed_indices = pickle.load(f)

    # =========== KL DIVERGENCE ==========
    # Computing the KL divergence increase for all 1000 samples is very slow; most of the time goes into building the P-matrix.
    per_sample_KL = np.zeros((len(letter_A_samples), ))
    for i in range(len(letter_A_samples)):
        if i in processed_indices:
            logging.info("Sample %d already processed. Results loaded.", i)
            continue
        logging.info("Processing sample %d", i)
        distance_matrix_dir = distance_matrix_dir_prefix + generate_data.combine_prefixes(
            settings.tsne_parameter_set | settings.letter_A_parameter_set,
            parameters, os.sep)
        distance_matrix_file = distance_matrix_dir + 'item' + str(i) + '.p'  # One cached P-matrix per sample i
        # Make sure you can load them one-by-one.
        if os.path.isfile(distance_matrix_file):
            logging.info("\tP-matrix file found. Loading.")
            with open(distance_matrix_file, 'rb') as f:
                new_P, _ = pickle.load(f)
        else:
            logging.info("\tP-matrix file not found. Creating and saving.")
            new_X = np.concatenate((X_mnist, letter_A_samples[i, :].reshape(
                (1, -1))),
                                   axis=0)
            new_D = distance.squareform(distance.pdist(new_X))
            new_P, new_sigmas = lion_tsne.get_p_and_sigma(
                distance_matrix=new_D, perplexity=dTSNE_mnist.perplexity)
            with open(distance_matrix_file, 'wb') as f:
                pickle.dump((new_P, new_sigmas), f)
        # The P-matrix is shared across all methods.
        for j in range(len(kernelized_letters_results)):
            # Single file with p matrix
            new_Y = np.concatenate(
                (Y_mnist, kernelized_letters_results[j][i, :].reshape(
                    (1, -1))),
                axis=0)
            kernelized_letters_kl[j,
                                  i], _ = lion_tsne.kl_divergence_and_gradient(
                                      p_matrix=new_P, y=new_Y)
        processed_indices.append(i)
        with open(kl_kernelized_tsne_letters_performance_file, 'wb') as f:
            pickle.dump((kernelized_letters_kl, processed_indices), f)
    # This should be fast
    kernelized_avg_letters_kl = np.mean(kernelized_letters_kl, axis=1)

    output_file = generate_kernelized_postprocess_filename(parameters)
    with open(output_file, "wb") as f:
        pickle.dump((kernelized_method_list, kernelized_avg_letters_kl,
                     kernelized_per_item_time,
                     kernelized_letters_distance_percentiles), f)
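# --- Hedged usage sketch (not in the original source): reading back the
# --- post-processing summary pickled by main() above and printing one line per
# --- kernelized t-SNE variant.  Assumes generate_kernelized_postprocess_filename().
def print_kernelized_letter_A_summary(parameters=settings.parameters):
    import pickle
    with open(generate_kernelized_postprocess_filename(parameters), 'rb') as f:
        (method_list, avg_letters_kl, per_item_time,
         distance_percentiles) = pickle.load(f)
    for j in range(len(method_list)):
        print("%s: KL %.4f, %.3f s/item, NN-distance percentile %.1f" %
              (method_list[j], avg_letters_kl[j], per_item_time[j],
               distance_percentiles[j]))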
def generate_lion_power_performance(*,
                                    regenerate=False,
                                    recursive_regenerate=False,
                                    parameters=settings.parameters):
    start_time = datetime.datetime.now()
    logging.info("LION power internal precision experiment started: %s",
                 start_time)

    precision_nn = parameters.get("precision_nn",
                                  settings.parameters["precision_nn"])

    lion_power_performance_data_file = generate_lion_power_performance_filename(
        parameters)
    lion_power_plot_data_file = generate_lion_power_plot_filename(parameters)

    lion_power_performance_data = dict()  # Start from scratch

    X_mnist = generate_data.load_x_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)
    Y_mnist = generate_data.load_y_mnist(
        parameters=settings.parameters,
        regenerate=recursive_regenerate,
        recursive_regenerate=recursive_regenerate)

    def get_nearest_neighbors(y, Y_mnist, n, exclude_index):
        y_distances = np.sum((Y_mnist - y)**2, axis=1)
        y_distances[exclude_index] = np.inf
        return np.argsort(y_distances)[:n]

    distance_matrix = distance.squareform(distance.pdist(X_mnist))
    np.fill_diagonal(distance_matrix,
                     np.inf)  # We are not interested in distance to itself
    nn_x_distance = np.min(distance_matrix, axis=1)  # Any axis will do
    radius_x = dict()
    for p in lion_percentile_options:
        radius_x[p] = np.percentile(nn_x_distance, p)
    logging.info("Radius X: %s", radius_x)

    if os.path.isfile(lion_power_performance_data_file) and not regenerate:
        with open(lion_power_performance_data_file, 'rb') as f:
            lion_power_performance_data = pickle.load(f)

    for perc in lion_percentile_options:
        for p in lion_power_options:
            logging.info("Processing percentile and power: %f, %d", p, perc)
            key = str(perc) + ";" + "%.3f" % (p)
            logging.info("Key: %s", key)
            if key not in lion_power_performance_data:
                lion_power_performance_data[key] = dict()

            if 'InternalPrecisionByX' not in lion_power_performance_data[
                    key] or regenerate:
                logging.info(
                    "Power performance not found for power %f percentile %d.\tCalculating...",
                    p, perc)

                per_sample_precision_x = list()
                per_sample_precision_y = list()
                for i in range(len(X_mnist)):
                    distances = distance_matrix[i, :].copy()
                    distances[
                        i] = np.inf  # Not interested in distance to itself
                    # Step 1. Find nearest neighbors in the neighborhood.
                    neighbor_indices = np.where(distances <= radius_x[perc])[0]
                    num_neighbors = len(neighbor_indices)
                    if num_neighbors >= 2:  # Below 2? Cannot interpolate
                        # We are good
                        weights = 1 / distances[neighbor_indices]**p
                        weights = weights / np.sum(weights)
                        cur_y_result = weights.dot(
                            Y_mnist[neighbor_indices, :])

                        nn_xreal_indices = get_nearest_neighbors(
                            X_mnist[i, :],
                            X_mnist,
                            n=precision_nn,
                            exclude_index=i)
                        nn_yreal_indices = get_nearest_neighbors(
                            Y_mnist[i, :],
                            Y_mnist,
                            n=precision_nn,
                            exclude_index=i)
                        nn_yembedded_indices = get_nearest_neighbors(
                            cur_y_result,
                            Y_mnist,
                            n=precision_nn,
                            exclude_index=i)
                        matching_indices_xreal_yembedded = len([
                            j for j in nn_xreal_indices
                            if j in nn_yembedded_indices
                        ])
                        matching_indices_yreal_yembedded = len([
                            j for j in nn_yreal_indices
                            if j in nn_yembedded_indices
                        ])
                        per_sample_precision_x.append(
                            matching_indices_xreal_yembedded / precision_nn)
                        per_sample_precision_y.append(
                            matching_indices_yreal_yembedded / precision_nn)

                new_dict = dict()
                new_dict['InternalPrecisionByX'] = np.mean(
                    per_sample_precision_x)
                new_dict['InternalPrecisionByY'] = np.mean(
                    per_sample_precision_y)

                for ndk in new_dict.keys():
                    lion_power_performance_data[key][ndk] = new_dict[ndk]

                with open(lion_power_performance_data_file, 'wb') as f:
                    pickle.dump(lion_power_performance_data, f)
            else:
                logging.info(
                    "Power FOUND for power %f percentile %d. Using loaded.", p,
                    perc)

            logging.info("%s %s", key, lion_power_performance_data[key])

    lion_optimal_power_x = dict()
    lion_power_plot_x = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            # print(cur_power, perc, lion_power_plot_data[key])
            y.append(lion_power_performance_data[key]['InternalPrecisionByX'])
        lion_power_plot_x[perc] = y
        lion_optimal_power_x[perc] = lion_power_options[np.argmax(y)]

    lion_optimal_power_y = dict()
    lion_power_plot_y = dict()
    for perc in lion_percentile_options:
        y = list()
        for cur_power in lion_power_options:
            key = str(perc) + ";%.3f" % (cur_power)
            # print(cur_power, perc, lion_power_plot_data[key])
            y.append(lion_power_performance_data[key]['InternalPrecisionByY'])
        lion_power_plot_y[perc] = y
        lion_optimal_power_y[perc] = lion_power_options[np.argmax(y)]

    with open(lion_power_plot_data_file, 'wb') as f:
        pickle.dump(
            (lion_power_options, lion_power_plot_y, lion_optimal_power_y,
             lion_power_plot_x, lion_optimal_power_x), f)
    logging.info("LION optimal power X: %s", lion_optimal_power_x)
    logging.info("LION optimal power Y: %s", lion_optimal_power_y)

    end_time = datetime.datetime.now()
    logging.info("LION power experiment ended: %s", end_time)
    logging.info("LION power experiment duration: %s", end_time - start_time)