def generate_y_mnist(*, parameters=settings.parameters, recursive_regenerate=False):
    """
    Fits a LION-tSNE embedding on X_mnist and saves both the 2D embedding
    (Y_mnist) and the fitted model (dTSNE_mnist) to disk via save_and_report.

    PREDECESSOR: X_mnist

    :param parameters: dictionary. Can contain those values:
        "num_images_raw": Number of the images to select from MNIST. Later only
            non-similar images will be kept.
        "selection_random_seed": Random seed for selecting random images from entire MNIST.
        "num_pca_dimensions": number of kept dimensions after PCA decomposition.
        "pca_random_seed": random seed for PCA calculation.
        "tsne_random_seed": random seed for tSNE algorithm
        "tsne_perplexity": perplexity for tSNE algorithm
        "tsne_momentum": gradient-descent momentum passed to the tSNE optimizer
        "tsne_n_iters": number of tSNE optimizer iterations
        "tsne_early_exaggeration_iters": number of early-exaggeration iterations
    :param recursive_regenerate: Regenerate predecessors as well
    """
    import lion_tsne

    # Each hyperparameter falls back to the project-wide default in settings
    # when not supplied in the `parameters` dict.
    tsne_random_seed = parameters.get("tsne_random_seed", settings.parameters["tsne_random_seed"])
    tsne_perplexity = parameters.get("tsne_perplexity", settings.parameters["tsne_perplexity"])
    tsne_momentum = parameters.get("tsne_momentum", settings.parameters["tsne_momentum"])
    tsne_n_iters = parameters.get("tsne_n_iters", settings.parameters["tsne_n_iters"])
    tsne_early_exaggeration_iters = parameters.get(
        "tsne_early_exaggeration_iters", settings.parameters["tsne_early_exaggeration_iters"])
    # Load (and optionally regenerate) the PCA-reduced MNIST predecessor data.
    X_mnist = load_x_mnist(parameters=parameters, regenerate=recursive_regenerate,
                           recursive_regenerate=recursive_regenerate)
    dTSNE_mnist = lion_tsne.LionTSNE(perplexity=tsne_perplexity)
    Y_mnist = dTSNE_mnist.fit(X_mnist, optimizer_kwargs={
        'momentum': tsne_momentum,
        'n_iter': tsne_n_iters,
        'early_exaggeration_iters': tsne_early_exaggeration_iters
    }, random_seed=tsne_random_seed, verbose=2)
    # Persist both the embedding and the fitted model under parameter-derived filenames.
    save_and_report(get_y_mnist_filename, parameters, Y_mnist)
    save_and_report(get_dtsne_mnist_filename, parameters, dTSNE_mnist)
import os
import sys

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Importing from parent directory.
# BUG FIX: `sys` was used below without being imported, which made the script
# fail with NameError before anything ran.
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))))
import lion_tsne

if __name__ == "__main__":
    # Small sanity-check demo: embed the 4-D Iris dataset into 2-D with
    # LION-tSNE and scatter-plot the result colored by class.
    data = load_iris()
    X = data.data
    labels = data.target
    dTSNE = lion_tsne.LionTSNE(perplexity=20)
    # Small dataset. Iterations are very fast, we can afford more
    y = dTSNE.fit(X,
                  verbose=2,
                  optimizer_kwargs={
                      'momentum': 0.8,
                      'n_iter': 3000
                  },
                  random_seed=1)
    plt.gcf().set_size_inches(10, 10)
    legend_list = list()
    # BUG FIX: original used set(sorted(labels)), which sorts first and then
    # throws the order away by building a set; sorted(set(labels)) yields the
    # unique class labels in ascending order as intended.
    for l in sorted(set(labels)):
        plt.scatter(y[labels == l, 0], y[labels == l, 1])
        legend_list.append(str(data.target_names[l]))
    plt.legend(legend_list)
def load_mnist_data(prepare_cluster_attribution_test=True):
    """
    Loads a subset of MNIST dataset and a result of its tSNE processing.

    :param prepare_cluster_attribution_test: Will prepare additional data for
        cluster attribution test
    :return: a tuple with several values
        [0] - X_mnist, 2500x30, first index - MNIST image, second index - first
              30 PCA components
        [1] - Y_mnist, 2500x2, first index - MNIST image (same as above), second
              index - tSNE results
        [2] - X_mnist_raw, 2500x784, first index - MNIST image, second index -
              unrolled 28x28 grayscale image (values between 0 and 1)
        [3] - labels_mnist, 2500-long array of labels for images
        [4] - dTSNE_mnist, LION-tSNE incorporation of the data. Actually, you
              can generate other embeddings from that object as well.
        [5] - mnist_chosen_indices, 2500-long array, which elements of MNIST
              dataset correspond to 2500-long subset
        If prepare_cluster_attribution_test:
        [6] - X_mnist_unchosen_raw
        [7] - X_mnist_unchosen_pca
        [8] - labels_mnist_unchosen
    """
    print("Loading from file...")
    with open(mnist_file, 'rb') as f:
        X_mnist_raw, P_mnist, sigma_mnist, Y_mnist, labels_mnist_onehot, mnist_pca, \
            all_mnist_trained_images, all_mnist_labels, mnist_chosen_indices = pickle.load(f)
    labels_mnist = np.argmax(labels_mnist_onehot, axis=1)
    # Deduplicate images: view each 784-value row as a single opaque (void)
    # scalar so np.unique can find unique rows; keep only the first occurrence.
    temp = np.ascontiguousarray(X_mnist_raw).view(
        np.dtype((np.void, X_mnist_raw.dtype.itemsize * X_mnist_raw.shape[1])))
    _, un_idx = np.unique(temp, return_index=True)
    X_mnist_raw = X_mnist_raw[un_idx, :]
    labels_mnist = labels_mnist[un_idx]
    mnist_chosen_indices = mnist_chosen_indices[un_idx]
    X_mnist = mnist_pca.transform(X_mnist_raw)
    # Rebuild the LION-tSNE model from the pre-computed embedding and
    # affinities rather than re-fitting tSNE from scratch.
    dTSNE_mnist = lion_tsne.LionTSNE(perplexity=30)
    dTSNE_mnist.incorporate(x=X_mnist, y=Y_mnist, p_matrix=P_mnist, sigma=sigma_mnist)
    # Sanity check: report the smallest pairwise distance after PCA against the
    # expected value (expected_min_dist is a module-level constant).
    D = distance.pdist(X_mnist)
    min_dist = np.min(D)
    print("After PCA - minimum distance between samples is ", min_dist,
          "\nExpected: ", expected_min_dist,
          "\nDifference: ", min_dist - expected_min_dist)
    return_tuple = X_mnist, Y_mnist, X_mnist_raw, labels_mnist, \
        dTSNE_mnist, mnist_chosen_indices
    if prepare_cluster_attribution_test:
        print("Generating data for cluster attribution test")
        # Set membership makes the filter O(1) per index instead of O(n).
        chosen_index_set = set(mnist_chosen_indices)
        ind_unchosen = [
            i for i in range(len(all_mnist_labels))
            if i not in chosen_index_set
        ]
        np.random.seed(
            10
        )  # Any seed, just don't use it again for selecting indices from same dataset
        X_mnist_unchosen_raw = all_mnist_trained_images[ind_unchosen]
        # BUG FIX: labels must be sliced from all_mnist_labels; the original
        # sliced all_mnist_trained_images (copy-paste of the line above), so
        # argmax returned pixel positions instead of class labels.
        # Assumes all_mnist_labels is one-hot encoded (argmax recovers the
        # class index) — consistent with labels_mnist_onehot above.
        labels_mnist_unchosen = all_mnist_labels[ind_unchosen]
        labels_mnist_unchosen = np.argmax(labels_mnist_unchosen, axis=1)
        X_mnist_unchosen_pca = mnist_pca.transform(X_mnist_unchosen_raw)
        return_tuple += X_mnist_unchosen_raw, X_mnist_unchosen_pca, labels_mnist_unchosen
    return return_tuple
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
import lion_tsne

# This is 1-D illustration of "pull to the center" effect. Plot will appear in
# the discussion section.
# Four 1-D training points and their (deliberately scattered) 1-D embeddings.
x_1d = np.array([[10], [20], [30], [40]])
y_1d = np.array([[10], [40], [1], [50]])
# Incorporate the known (x, y) pairs without fitting: we only need the
# interpolation machinery, not a tSNE optimization.
simple_example_model = lion_tsne.LionTSNE(perplexity=2)
simple_example_model.incorporate(x_1d, y_1d)
# Dense grid over [0, 50) at which the interpolated embedding is evaluated.
x = np.arange(0, 50, 0.1).reshape((-1, 1))
# Powers p of the inverse-distance weighting to compare, with plot colors and
# legend labels keyed by the same p values.
powers_and_colors = {0.2: 'blue', 2: 'red', 20: 'green'}
legend_by_p = {0.2: r'Low $p$', 2: r'Medium $p$', 20: r'High $p$'}
# Small, print-quality figure (sized for a two-column paper layout).
plt.figure(dpi=300)
plt.gcf().set_size_inches(3.3, 2)
font_properties = FontProperties()
font_properties.set_family('serif')
font_properties.set_name('Times New Roman')
font_properties.set_size(8)
legend_list = list()
for p in powers_and_colors:
    # One curve per power: interpolate y over the grid using
    # inverse-distance weighting with exponent p.
    interpolator = simple_example_model.generate_embedding_function(
        embedding_function_type='weighted-inverse-distance',
        function_kwargs={'power': p})
    y_weighted = interpolator(x)
    plt.plot(x, y_weighted)