예제 #1
0
def generate_y_mnist(*,
                     parameters=settings.parameters,
                     recursive_regenerate=False):
    """
    Fit LION-tSNE on X_mnist, then save the embedding and the fitted model.

    PREDECESSOR: X_mnist

    :param parameters: dictionary. Keys read directly by this function (each
        one falls back to the value stored in settings.parameters when absent):
        "tsne_random_seed": passed to the fit as random_seed
        "tsne_perplexity": perplexity of the LionTSNE model
        "tsne_momentum": passed to the optimizer as 'momentum'
        "tsne_n_iters": passed to the optimizer as 'n_iter'
        "tsne_early_exaggeration_iters": passed to the optimizer as
        'early_exaggeration_iters'
        The same dictionary is forwarded unchanged to load_x_mnist and to
        save_and_report, so it may also carry whatever keys those expect.
    :param recursive_regenerate: Regenerate predecessors as well (it is passed
        to load_x_mnist both as regenerate and as recursive_regenerate).
    :return: nothing; the results are handed to save_and_report.
    """
    import lion_tsne

    def _lookup(key):
        # Caller-supplied value wins; otherwise use the project-wide default.
        return parameters.get(key, settings.parameters[key])

    seed = _lookup("tsne_random_seed")
    perplexity = _lookup("tsne_perplexity")
    optimizer_kwargs = {
        'momentum': _lookup("tsne_momentum"),
        'n_iter': _lookup("tsne_n_iters"),
        'early_exaggeration_iters': _lookup("tsne_early_exaggeration_iters"),
    }

    # Obtain the predecessor data, regenerating it when requested.
    X_mnist = load_x_mnist(parameters=parameters,
                           regenerate=recursive_regenerate,
                           recursive_regenerate=recursive_regenerate)

    model = lion_tsne.LionTSNE(perplexity=perplexity)
    embedding = model.fit(X_mnist,
                          optimizer_kwargs=optimizer_kwargs,
                          random_seed=seed,
                          verbose=2)

    # Store the embedding first, then the fitted model object.
    save_and_report(get_y_mnist_filename, parameters, embedding)
    save_and_report(get_dtsne_mnist_filename, parameters, model)
예제 #2
0
import os
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# `sys` is required for the path tweak below but is not among the imports
# above, so it is imported here to avoid a NameError.
import sys

# Importing from parent directory: append the directory two levels above this
# file's own directory to the module search path before importing lion_tsne.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))))
import lion_tsne

if __name__ == "__main__":
    # Embed the iris dataset with LION-tSNE and scatter-plot it by class.
    data = load_iris()
    X = data.data
    labels = data.target

    dTSNE = lion_tsne.LionTSNE(perplexity=20)
    # Small dataset. Iterations are very fast, we can afford more
    y = dTSNE.fit(X,
                  verbose=2,
                  optimizer_kwargs={
                      'momentum': 0.8,
                      'n_iter': 3000
                  },
                  random_seed=1)

    plt.gcf().set_size_inches(10, 10)
    legend_list = []
    # Deduplicate first, then sort: the original sorted and then built a set,
    # which throws the ordering away. Sorting last makes the plotting order
    # (and so the legend order) deterministic.
    for label in sorted(set(labels)):
        mask = labels == label
        plt.scatter(y[mask, 0], y[mask, 1])
        legend_list.append(str(data.target_names[label]))
    plt.legend(legend_list)
def load_mnist_data(prepare_cluster_attribution_test=True):
    """
    Loads a subset of MNIST dataset and a result of its tSNE processing.

    The data is unpickled from the module-level ``mnist_file``. Duplicate raw
    images are dropped, the raw images are projected with the stored PCA
    object, and the stored embedding is incorporated into a fresh LionTSNE
    model (perplexity 30). The minimum pairwise distance after PCA is printed
    next to the module-level ``expected_min_dist`` as a sanity check.

    :param prepare_cluster_attribution_test: Will prepare additional data for cluster attribution test

    :return: a tuple with several values
      [0] - X_mnist, 2500x30, first index - MNIST image, second index - first 30 PCA components
      [1] - Y_mnist, 2500x2, first index - MNIST image (same as above), second index - tSNE results
      [2] - X_mnist_raw, 2500x784, first index - MNIST image, second index - unrolled 28x28 grayscale image (values
            between 0 and 1)
      [3] - labels_mnist, 2500-long array of labels for images
      [4] - dTSNE_mnist, LION-tSNE incorporation of the data. Actually, you can generate other embeddings from that
            object as well.
      [5] - mnist_chosen_indices, 2500-long array, which elements of MNIST dataset correspond to 2500-long subset
    If prepare_cluster_attribution_test:
      [6] - X_mnist_unchosen_raw, raw images that are not part of the chosen subset
      [7] - X_mnist_unchosen_pca, the same images after the PCA transform
      [8] - labels_mnist_unchosen, labels of the same images
    """

    print("Loading from file...")

    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # mnist_file must come from a trusted source.
    with open(mnist_file, 'rb') as f:
        (X_mnist_raw, P_mnist, sigma_mnist, Y_mnist, labels_mnist_onehot,
         mnist_pca, all_mnist_trained_images, all_mnist_labels,
         mnist_chosen_indices) = pickle.load(f)

    labels_mnist = np.argmax(labels_mnist_onehot, axis=1)

    # Drop duplicate images: view every row as one opaque (void) item so that
    # np.unique compares whole rows, and keep the first index of each.
    temp = np.ascontiguousarray(X_mnist_raw).view(
        np.dtype((np.void, X_mnist_raw.dtype.itemsize * X_mnist_raw.shape[1])))
    _, un_idx = np.unique(temp, return_index=True)
    # NOTE(review): only the three arrays below are re-indexed; Y_mnist,
    # P_mnist and sigma_mnist are used exactly as loaded - presumably the
    # pickle was produced with the same row order. Confirm.
    X_mnist_raw = X_mnist_raw[un_idx, :]
    labels_mnist = labels_mnist[un_idx]
    mnist_chosen_indices = mnist_chosen_indices[un_idx]

    X_mnist = mnist_pca.transform(X_mnist_raw)
    dTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
    dTSNE_mnist.incorporate(x=X_mnist,
                            y=Y_mnist,
                            p_matrix=P_mnist,
                            sigma=sigma_mnist)

    D = distance.pdist(X_mnist)
    min_dist = np.min(D)
    print("After PCA - minimum distance between samples is ", min_dist,
          "\nExpected: ", expected_min_dist, "\nDifference: ",
          min_dist - expected_min_dist)

    return_tuple = X_mnist, Y_mnist, X_mnist_raw, labels_mnist, dTSNE_mnist, mnist_chosen_indices

    if prepare_cluster_attribution_test:
        print("Generating data for cluster attribution test")
        # Sorted indices of all samples that were not chosen. setdiff1d gives
        # the same indices as testing every i against the array, without
        # scanning the chosen indices once per sample.
        ind_unchosen = np.setdiff1d(np.arange(len(all_mnist_labels)),
                                    mnist_chosen_indices)
        np.random.seed(
            10
        )  # Any seed, just don't use it again for selecting indices from same dataset

        X_mnist_unchosen_raw = all_mnist_trained_images[ind_unchosen]
        # Labels must come from the label array; the original indexed the
        # image array here, so argmax returned a pixel position, not a class.
        # Assumes all_mnist_labels is one-hot like labels_mnist_onehot - TODO confirm.
        labels_mnist_unchosen = all_mnist_labels[ind_unchosen]
        labels_mnist_unchosen = np.argmax(labels_mnist_unchosen, axis=1)
        X_mnist_unchosen_pca = mnist_pca.transform(X_mnist_unchosen_raw)
        return_tuple += X_mnist_unchosen_raw, X_mnist_unchosen_pca, labels_mnist_unchosen

    return return_tuple
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
import lion_tsne

# This is a 1-D illustration of the "pull to the center" effect. The plot will
# appear in the discussion section.
# Four 1-D input points (as a column) and the 1-D embedding value paired with
# each of them.
x_1d = np.array([[10], [20], [30], [40]])
y_1d = np.array([[10], [40], [1], [50]])
simple_example_model = lion_tsne.LionTSNE(perplexity=2)
# The x -> y pairs are handed to the model via incorporate(); fit() is not
# called in this snippet.
simple_example_model.incorporate(x_1d, y_1d)

# Query grid: 0 up to (not including) 50 in steps of 0.1, shaped as a column.
x = np.arange(0, 50, 0.1).reshape((-1, 1))

# Power values to compare, each with a colour and a legend label.
# NOTE(review): the colours, legend_by_p, legend_list and font_properties are
# not used in the part of the loop visible here - presumably they are consumed
# further down; confirm.
powers_and_colors = {0.2: 'blue', 2: 'red', 20: 'green'}
legend_by_p = {0.2: r'Low $p$', 2: r'Medium $p$', 20: r'High $p$'}

# New figure at 300 dpi, 3.3 x 2 inches.
plt.figure(dpi=300)
plt.gcf().set_size_inches(3.3, 2)

# Font settings: serif family, Times New Roman, size 8.
font_properties = FontProperties()
font_properties.set_family('serif')
font_properties.set_name('Times New Roman')
font_properties.set_size(8)

legend_list = list()
# For every power value, build a weighted-inverse-distance embedding function
# from the model, evaluate it over the grid and plot the result.
for p in powers_and_colors:
    interpolator = simple_example_model.generate_embedding_function(
        embedding_function_type='weighted-inverse-distance',
        function_kwargs={'power': p})
    y_weighted = interpolator(x)
    plt.plot(x, y_weighted)