def __tsne_test_helper(self, data, n_com):
    # Metric given as a string name
    tsne_def = TSNE(n_components=n_com, metric="euclidean")
    tsne_def = tsne_def(data)

    # Metric given as an Orange distance
    tsne_euc = TSNE(n_components=n_com, metric=Euclidean)
    tsne_euc = tsne_euc(data)

    # Precomputed distance matrix
    tsne_pre = TSNE(n_components=n_com, metric="precomputed")
    tsne_pre = tsne_pre(Euclidean(data))

    # All three variants should produce an embedding of the same shape
    self.assertEqual((data.X.shape[0], n_com), tsne_def.embedding_.shape)
    self.assertEqual((data.X.shape[0], n_com), tsne_euc.embedding_.shape)
    self.assertEqual((data.X.shape[0], n_com), tsne_pre.embedding_.shape)
def _reduce_dimensions(data, method="MDS", use_cosine=False):
    """
    Reduce the dimensionality of the data to 2D.

    Parameters
    ----------
    data : Orange.data.Table
        The image embeddings (vectors of length 2048).
    method : str
        The method to use (default MDS).
    use_cosine : bool
        Precompute cosine distances and pass them to MDS.

    Returns
    -------
    array-like
        The data, reduced to 2 dimensions.
    """
    if method == "MDS":
        if use_cosine:
            mds = MDS(n_init=1, dissimilarity="precomputed")
            dist_matrix = Cosine(data)
            return mds(dist_matrix).embedding_
        else:
            mds = MDS(n_init=1, init_type="PCA")
            return mds(data).embedding_
    elif method == "PCA":
        pca = PCA(n_components=2)
        return pca(data)(data)
    elif method == "TSNE":
        tsne = TSNE(init="pca")
        return tsne(data).embedding_
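# A hypothetical usage sketch for _reduce_dimensions above, not part of the
# original module: any Orange Table stands in for the 2048-dimensional image
# embeddings, and the asserted shape assumes the TSNE branch returns a plain
# (n_samples, 2) array, as the docstring promises.
from Orange.data import Table

embeddings = Table("iris")  # stand-in for real image embeddings
points = _reduce_dimensions(embeddings, method="TSNE")
assert points.shape == (len(embeddings), 2)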
def test_transform(self):
    # Set perplexity to avoid warnings
    tsne = TSNE(perplexity=10)
    model = tsne(self.iris[::2])
    new_embedding = model(self.iris[1::2])

    # The new embedding should not contain NaNs
    self.assertFalse(np.any(np.isnan(new_embedding.X)))
def compute_tsne(data, perplexity, iter, init):
    # Above 10,000 samples, switch to the faster approximate settings
    negative_gradient_method = 'fft' if len(data.X) > 10000 else 'bh'
    neighbor_method = 'approx' if len(data.X) > 10000 else 'exact'
    tsne = TSNE(
        perplexity=perplexity,
        n_iter=iter,
        initialization=init,
        theta=.8,
        early_exaggeration_iter=0,
        negative_gradient_method=negative_gradient_method,
        neighbors=neighbor_method,
        random_state=0,
    )
    return tsne(data)
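# A rough sketch of how compute_tsne might be invoked (the table and the
# argument values are assumptions, not taken from the original code). With
# fewer than 10,000 rows it follows the exact/Barnes-Hut path above; larger
# inputs switch to approximate neighbors and FFT-based negative gradients.
from Orange.data import Table

data = Table("iris")
model = compute_tsne(data, perplexity=30, iter=500, init="pca")
print(model.embedding_.shape)  # (n_samples, 2)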
def test_continue_optimization_inplace(self):
    tsne = TSNE(n_iter=100)
    model = tsne(self.iris)
    new_model = model.optimize(100, inplace=True)

    # If we do things inplace, then the instances should be the same
    self.assertIs(model, new_model)
    self.assertIs(model.embedding, new_model.embedding)
    self.assertIs(model.embedding_, new_model.embedding_)

    # The embeddings in the table should match the embedding object
    np.testing.assert_equal(new_model.embedding.X, new_model.embedding_)
def test_fit(self):
    n_components = 2
    tsne = TSNE(n_components=n_components)
    model = tsne(self.iris)

    # The embedding should have the correct number of dimensions
    self.assertEqual(model.embedding.X.shape,
                     (self.iris.X.shape[0], n_components))

    # The embedding should not contain NaNs
    self.assertFalse(np.any(np.isnan(model.embedding.X)))

    # The embeddings in the table should match the embedding object
    np.testing.assert_equal(model.embedding.X, model.embedding_)
def __start(self):
    self.pca_preprocessing()

    # We call PCA through fastTSNE because it involves scaling. Instead of
    # worrying about this ourselves, we'll let the library worry for us.
    initialization = TSNE.default_initialization(
        self.pca_data.X, n_components=2, random_state=0)

    # Compute perplexity settings for multiscale
    n_samples = self.pca_data.X.shape[0]
    if self.multiscale:
        perplexity = min((n_samples - 1) / 3, 50), min((n_samples - 1) / 3, 500)
    else:
        perplexity = self.perplexity

    # Determine whether to use settings for large data sets
    if n_samples > 10_000:
        neighbor_method, gradient_method = "approx", "fft"
    else:
        neighbor_method, gradient_method = "exact", "bh"

    # Set number of iterations to 0 - these will be run subsequently
    self.projection = TSNE(
        n_components=2, perplexity=perplexity, multiscale=self.multiscale,
        early_exaggeration_iter=0, n_iter=0, initialization=initialization,
        exaggeration=self.exaggeration, neighbors=neighbor_method,
        negative_gradient_method=gradient_method, random_state=0,
    )(self.pca_data)

    self.tsne_runner = TSNERunner(self.projection, step_size=50)
    self.tsne_iterator = self.tsne_runner.run_optimization()
    self.__set_update_loop(self.tsne_iterator)
    self.progressBarInit(processEvents=None)
def test_continue_optimization(self):
    tsne = TSNE(n_iter=100)
    model = tsne(self.iris)
    new_model = model.optimize(100, inplace=False)

    # If we don't do things inplace, then the instances should be different
    self.assertIsNot(model, new_model)
    self.assertIsNot(model.embedding, new_model.embedding)
    self.assertIsNot(model.embedding_, new_model.embedding_)
    self.assertFalse(
        np.allclose(model.embedding.X, new_model.embedding.X),
        'Embedding should change after further optimization.',
    )

    # The embeddings in the table should match the embedding object
    np.testing.assert_equal(new_model.embedding.X, new_model.embedding_)
def test_pickle(self):
    for neighbors in ("exact", "approx"):
        tsne = TSNE(early_exaggeration_iter=0, n_iter=10, perplexity=30,
                    neighbors=neighbors, random_state=0)
        model = tsne(self.iris[::2])
        loaded_model = pickle.loads(pickle.dumps(model))

        new_embedding = loaded_model(self.iris[1::2]).X
        knn = KNeighborsClassifier(n_neighbors=5)
        knn.fit(new_embedding, self.iris[1::2].Y)
        predicted = knn.predict(new_embedding)
        self.assertTrue(
            accuracy_score(predicted, self.iris[1::2].Y) > 0.95,
            msg=f"Pickling failed with `neighbors={neighbors}`",
        )
def test_fft_correctness(self):
    knn = KNeighborsClassifier(n_neighbors=5)

    # Set iterations to 0 so we check that the initialization is fairly random
    tsne = TSNE(early_exaggeration_iter=0, n_iter=0, perplexity=30,
                negative_gradient_method='fft', initialization='random',
                random_state=0)
    model = tsne(self.iris)

    # Evaluate KNN on the random initialization
    knn.fit(model.embedding_, self.iris.Y)
    predicted = knn.predict(model.embedding_)
    self.assertTrue(accuracy_score(predicted, self.iris.Y) < 0.6)

    # 100 iterations should be enough for iris
    model.optimize(n_iter=100, inplace=True)

    # Evaluate KNN on the tSNE embedding
    knn.fit(model.embedding_, self.iris.Y)
    predicted = knn.predict(model.embedding_)
    self.assertTrue(accuracy_score(predicted, self.iris.Y) > 0.95)
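# The pickling and FFT tests above share one evaluation pattern: judge an
# embedding by how well a small KNN classifier recovers the known classes
# from it. A minimal helper capturing that idea (hypothetical, not part of
# the original test suite):
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

def knn_accuracy(embedding, labels, n_neighbors=5):
    # Fit and evaluate on the same points; a good embedding keeps classes
    # locally separated, so the score should be high.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(embedding, labels)
    return accuracy_score(knn.predict(embedding), labels)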
def __start(self):
    self.pca_preprocessing()
    self.needs_to_draw = True

    # We call PCA through fastTSNE because it involves scaling. Instead of
    # worrying about this ourselves, we'll let the library worry for us.
    initialization = TSNE.default_initialization(
        self.pca_data.X, n_components=2, random_state=0)

    # Compute perplexity settings for multiscale
    n_samples = self.pca_data.X.shape[0]
    if self.multiscale:
        perplexity = min((n_samples - 1) / 3, 50), min((n_samples - 1) / 3, 500)
    else:
        perplexity = self.perplexity

    # Determine whether to use settings for large data sets
    if n_samples > 10_000:
        neighbor_method, gradient_method = "approx", "fft"
    else:
        neighbor_method, gradient_method = "exact", "bh"

    # Set number of iterations to 0 - these will be run subsequently
    self.projection = TSNE(
        n_components=2,
        perplexity=perplexity,
        multiscale=self.multiscale,
        early_exaggeration_iter=0,
        n_iter=0,
        initialization=initialization,
        exaggeration=self.exaggeration,
        neighbors=neighbor_method,
        negative_gradient_method=gradient_method,
        random_state=0,
        theta=0.8,
    )(self.pca_data)

    self.tsne_runner = TSNERunner(
        self.projection, step_size=20, exaggeration=self.exaggeration
    )
    self.tsne_iterator = self.tsne_runner.run_optimization()
    self.__set_update_loop(self.tsne_iterator)
    self.progressBarInit(processEvents=None)
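# Standalone sketch of the multiscale perplexity rule used by both __start
# variants above (illustrative only, not part of the widget): each target
# perplexity is additionally capped at (n_samples - 1) / 3, since a
# perplexity cannot meaningfully exceed about a third of the available
# neighbors.
def multiscale_perplexities(n_samples):
    return min((n_samples - 1) / 3, 50), min((n_samples - 1) / 3, 500)

assert multiscale_perplexities(91) == (30, 30)        # small data: both capped
assert multiscale_perplexities(100_000) == (50, 500)  # large data: targets kept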
def test_multiscale(self):
    tsne = TSNE(perplexity=(10, 10), multiscale=True)
    model = tsne(self.iris[::2])
    embedding = model(self.iris[1::2])
    self.assertFalse(np.any(np.isnan(embedding.X)))