def test_barnes_hut_angle(): # When Barnes-Hut's angle=0 this corresponds to the exact method. angle = 0.0 perplexity = 10 n_samples = 100 for n_components in [2, 3]: n_features = 5 degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) distances = random_state.randn(n_samples, n_features) distances = distances.astype(np.float32) distances = distances.dot(distances.T) np.fill_diagonal(distances, 0.0) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, False) kl, gradex = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) k = n_samples - 1 bt = BallTree(distances) distances_nn, neighbors_nn = bt.query(distances, k=k + 1) neighbors_nn = neighbors_nn[:, 1:] Pbh = _joint_probabilities_nn(distances, neighbors_nn, perplexity, False) kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, verbose=False) assert_array_almost_equal(Pbh, P, decimal=5) assert_array_almost_equal(gradex, gradbh, decimal=5)
def test_binary_perplexity_stability(): # Binary perplexity search should be stable. # The binary_search_perplexity had a bug wherein the P array # was uninitialized, leading to sporadically failing tests. k = 10 n_samples = 100 random_state = check_random_state(0) distances = random_state.randn(n_samples, 2).astype(np.float32) # Distances shouldn't be negative distances = np.abs(distances.dot(distances.T)) np.fill_diagonal(distances, 0.0) last_P = None neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64) for _ in range(100): P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(), 3, verbose=0) P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0) # Convert the sparse matrix to a dense one for testing P1 = P1.toarray() if last_P is None: last_P = P last_P1 = P1 else: assert_array_almost_equal(P, last_P, decimal=4) assert_array_almost_equal(P1, last_P1, decimal=4)
def compute_P(X, perplexity): """ utils function to calculate P matrix in high dim /opt/anaconda3/lib/python3.6/site-packages/sklearn/manifold/t_sne.py TODO: try to cache """ n_samples = X.shape[0] # Compute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. k = min(n_samples - 1, int(3. * perplexity + 1)) # Find the nearest neighbors for every point knn = NearestNeighbors(algorithm='auto', n_neighbors=k) knn.fit(X) distances_nn, neighbors_nn = knn.kneighbors(None, n_neighbors=k) # Free the memory used by the ball_tree del knn # knn return the euclidean distance but we need it squared # to be consistent with the 'exact' method. Note that the # the method was derived using the euclidean method as in the # input space. Not sure of the implication of using a different # metric. distances_nn **= 2 # compute the joint probability distribution for the input space P = _joint_probabilities_nn( distances_nn, neighbors_nn, perplexity, verbose=0) # return P.todense() return np.maximum(P.todense(), MACHINE_EPSILON)
def test_barnes_hut_angle(): # When Barnes-Hut's angle=0 this corresponds to the exact method. angle = 0.0 perplexity = 10 n_samples = 100 for n_components in [2, 3]: n_features = 5 degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) data = random_state.randn(n_samples, n_features) distances = pairwise_distances(data) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) n_neighbors = n_samples - 1 distances_csr = NearestNeighbors().fit(data).kneighbors_graph( n_neighbors=n_neighbors, mode='distance') P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0) kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, verbose=0) P = squareform(P) P_bh = P_bh.toarray() assert_array_almost_equal(P_bh, P, decimal=5) assert_almost_equal(kl_exact, kl_bh, decimal=3)
def test_binary_perplexity_stability(): # Binary perplexity search should be stable. # The binary_search_perplexity had a bug wherein the P array # was uninitialized, leading to sporadically failing tests. n_neighbors = 10 n_samples = 100 random_state = check_random_state(0) data = random_state.randn(n_samples, 5) nn = NearestNeighbors().fit(data) distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors, mode='distance') distances = distance_graph.data.astype(np.float32, copy=False) distances = distances.reshape(n_samples, n_neighbors) last_P = None desired_perplexity = 3 for _ in range(100): P = _binary_search_perplexity(distances.copy(), desired_perplexity, verbose=0) P1 = _joint_probabilities_nn(distance_graph, desired_perplexity, verbose=0) # Convert the sparse matrix to a dense one for testing P1 = P1.toarray() if last_P is None: last_P = P last_P1 = P1 else: assert_array_almost_equal(P, last_P, decimal=4) assert_array_almost_equal(P1, last_P1, decimal=4)
def test_barnes_hut_angle(): # When Barnes-Hut's angle=0 this corresponds to the exact method. angle = 0.0 perplexity = 10 n_samples = 100 for n_components in [2, 3]: n_features = 5 degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) distances = random_state.randn(n_samples, n_features) distances = distances.astype(np.float32) distances = abs(distances.dot(distances.T)) np.fill_diagonal(distances, 0.0) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) k = n_samples - 1 bt = BallTree(distances) distances_nn, neighbors_nn = bt.query(distances, k=k + 1) neighbors_nn = neighbors_nn[:, 1:] distances_nn = np.array( [distances[i, neighbors_nn[i]] for i in range(n_samples)]) assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\ abs(distances[0, neighbors_nn[0]] - distances_nn[0]) P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn, perplexity, verbose=0) kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, verbose=0) P = squareform(P) P_bh = P_bh.toarray() assert_array_almost_equal(P_bh, P, decimal=5) assert_almost_equal(kl_exact, kl_bh, decimal=3)
def test_barnes_hut_angle(): # When Barnes-Hut's angle=0 this corresponds to the exact method. angle = 0.0 perplexity = 10 n_samples = 100 for n_components in [2, 3]: n_features = 5 degrees_of_freedom = float(n_components - 1.0) random_state = check_random_state(0) distances = random_state.randn(n_samples, n_features) distances = distances.astype(np.float32) distances = abs(distances.dot(distances.T)) np.fill_diagonal(distances, 0.0) params = random_state.randn(n_samples, n_components) P = _joint_probabilities(distances, perplexity, verbose=0) kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components) k = n_samples - 1 bt = BallTree(distances) distances_nn, neighbors_nn = bt.query(distances, k=k + 1) neighbors_nn = neighbors_nn[:, 1:] distances_nn = np.array([distances[i, neighbors_nn[i]] for i in range(n_samples)]) assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\ abs(distances[0, neighbors_nn[0]] - distances_nn[0]) P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn, perplexity, verbose=0) kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom, n_samples, n_components, angle=angle, skip_num_points=0, verbose=0) P = squareform(P) P_bh = P_bh.toarray() assert_array_almost_equal(P_bh, P, decimal=5) assert_almost_equal(kl_exact, kl_bh, decimal=3)
def _fit(self, X, skip_num_points=0): if self.method not in ['barnes_hut', 'exact']: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.metric == "precomputed": if isinstance(self.init, string_types) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " "used with metric=\"precomputed\".") if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") if np.any(X < 0): raise ValueError("All distances should be positive, the " "precomputed distances given as X is not " "correct") if self.method == 'barnes_hut' and sp.issparse(X): raise TypeError('A sparse matrix was passed, but dense ' 'data is required for method="barnes_hut". Use ' 'X.toarray() to convert to a dense numpy array if ' 'the array is small enough for it to fit in ' 'memory. Otherwise consider dimensionality ' 'reduction techniques (e.g. TruncatedSVD)') else: X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float32, np.float64]) if self.method == 'barnes_hut' and self.n_components > 3: raise ValueError("'n_components' should be inferior to 4 for the " "barnes_hut algorithm as it relies on " "quad-tree or oct-tree.") random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: raise ValueError( "early_exaggeration must be at least 1, but is {}".format( self.early_exaggeration)) if self.n_iter < 250: raise ValueError("n_iter should be at least 250") n_samples = X.shape[0] neighbors_nn = None if self.method == "exact": if self.metric == "precomputed": distances = X else: if self.verbose: print("[t-SNE] Computing pairwise distances...") if self.metric == "euclidean": distances = pairwise_distances( X, metric=self.metric, squared=True, **self.metric_params) # <=ADDED else: distances = pairwise_distances( X, metric=self.metric, **self.metric_params) # <=ADDED if np.any(distances < 0): raise ValueError("All distances should be positive, the " "metric given is not correct") P = _joint_probabilities(distances, self.perplexity, self.verbose) assert np.all(np.isfinite(P)), "All probabilities should be finite" assert np.all(P >= 0), "All probabilities should be non-negative" assert np.all(P <= 1), ("All probabilities should be less " "or then equal to one") else: k = min(n_samples - 1, int(3. * self.perplexity + 1)) if self.verbose: print("[t-SNE] Computing {} nearest neighbors...".format(k)) knn = NearestNeighbors(algorithm='auto', n_neighbors=k, metric=self.metric, metric_params=self.metric_params) # <=ADDED t0 = time() knn.fit(X) duration = time() - t0 if self.verbose: print("[t-SNE] Indexed {} samples in {:.3f}s...".format( n_samples, duration)) t0 = time() distances_nn, neighbors_nn = knn.kneighbors(None, n_neighbors=k) duration = time() - t0 if self.verbose: print( "[t-SNE] Computed neighbors for {} samples in {:.3f}s...". format(n_samples, duration)) del knn if self.metric == "euclidean": distances_nn **= 2 P = _joint_probabilities_nn(distances_nn, neighbors_nn, self.perplexity, self.verbose) if isinstance(self.init, np.ndarray): X_embedded = self.init elif self.init == 'pca': pca = PCA(n_components=self.n_components, svd_solver='randomized', random_state=random_state) X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) elif self.init == 'random': X_embedded = 1e-4 * random_state.randn( n_samples, self.n_components).astype(np.float32) else: raise ValueError("'init' must be 'pca', 'random', or " "a numpy array") degrees_of_freedom = max(self.n_components - 1.0, 1) return self._tsne(P, degrees_of_freedom, n_samples, X_embedded=X_embedded, neighbors=neighbors_nn, skip_num_points=skip_num_points)
def compute_joint_probabilities(X, perplexity=30, metric='euclidean', method='exact', adj=None, verbose=0): """ Computes the joint probability matrix P from a feature matrix X of size n x f Adapted from sklearn.manifold.t_sne """ # Compute pairwise distances if verbose > 0: print('Computing pairwise distances...') if method == 'exact': if metric == 'precomputed': D = X elif metric == 'euclidean': D = pairwise_distances(X, metric=metric, squared=True) elif metric == 'cosine': D = pairwise_distances(X, metric=metric) elif metric == 'shortest_path': assert adj is not None D = get_shortest_path_matrix(adj, verbose=verbose) P = _joint_probabilities(D, desired_perplexity=perplexity, verbose=verbose) assert np.all(np.isfinite(P)), "All probabilities should be finite" assert np.all(P >= 0), "All probabilities should be non-negative" assert np.all(P <= 1), ("All probabilities should be less " "or then equal to one") P = squareform(P) else: # Cpmpute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. n_samples = X.shape[0] k = min(n_samples - 1, int(3. * perplexity + 1)) # Find the nearest neighbors for every point knn = NearestNeighbors(algorithm='auto', n_neighbors=k, metric=metric) t0 = time() knn.fit(X) duration = time() - t0 if verbose: print("[t-SNE] Indexed {} samples in {:.3f}s...".format( n_samples, duration)) t0 = time() distances_nn, neighbors_nn = knn.kneighbors( None, n_neighbors=k) duration = time() - t0 if verbose: print("[t-SNE] Computed neighbors for {} samples in {:.3f}s..." .format(n_samples, duration)) # Free the memory used by the ball_tree del knn if metric == "euclidean": # knn return the euclidean distance but we need it squared # to be consistent with the 'exact' method. Note that the # the method was derived using the euclidean method as in the # input space. Not sure of the implication of using a different # metric. distances_nn **= 2 # compute the joint probability distribution for the input space P = _joint_probabilities_nn(distances_nn, neighbors_nn, perplexity, verbose) P = P.toarray() # Convert to torch tensor P = torch.from_numpy(P).type(dtypeFloat) return P
def _fit(self, X, skip_num_points=0): """Fit the model using X as training data. Note that sparse arrays can only be handled by method='exact'. It is recommended that you convert your sparse array to dense (e.g. `X.toarray()`) if it fits in memory, or otherwise using a dimensionality reduction technique (e.g. TruncatedSVD). Parameters ---------- X : array, shape (n_samples, n_features) or (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. Note that this when method='barnes_hut', X cannot be a sparse array and if need be will be converted to a 32 bit float array. Method='exact' allows sparse arrays and 64bit floating point inputs. skip_num_points : int (optional, default:0) This does not compute the gradient for points with indices below `skip_num_points`. This is useful when computing transforms of new data where you'd like to keep the old data fixed. """ if self.method not in ['barnes_hut', 'exact']: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.method == 'barnes_hut' and sp.issparse(X): raise TypeError('A sparse matrix was passed, but dense ' 'data is required for method="barnes_hut". Use ' 'X.toarray() to convert to a dense numpy array if ' 'the array is small enough for it to fit in ' 'memory. Otherwise consider dimensionality ' 'reduction techniques (e.g. TruncatedSVD)') else: X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64) random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: raise ValueError("early_exaggeration must be at least 1, but is " "%f" % self.early_exaggeration) if self.n_iter < 200: raise ValueError("n_iter should be at least 200") if self.metric == "precomputed": if isinstance(self.init, string_types) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be used " "with metric=\"precomputed\".") if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") distances = X else: if self.verbose: print("[t-SNE] Computing pairwise distances...") if self.metric == "euclidean": distances = pairwise_distances(X, metric=self.metric, squared=True) else: distances = pairwise_distances(X, metric=self.metric) if not np.all(distances >= 0): raise ValueError("All distances should be positive, either " "the metric or precomputed distances given " "as X are not correct") # Degrees of freedom of the Student's t-distribution. The suggestion # degrees_of_freedom = n_components - 1 comes from # "Learning a Parametric Embedding by Preserving Local Structure" # Laurens van der Maaten, 2009. degrees_of_freedom = max(self.n_components - 1.0, 1) n_samples = X.shape[0] # the number of nearest neighbors to find k = min(n_samples - 1, int(3. * self.perplexity + 1)) neighbors_nn = None if self.method == 'barnes_hut': if self.verbose: print("[t-SNE] Computing %i nearest neighbors..." % k) if self.metric == 'precomputed': # Use the precomputed distances to find # the k nearest neighbors and their distances neighbors_nn = np.argsort(distances, axis=1)[:, :k] elif self.rho >= 1: # Find the nearest neighbors for every point bt = BallTree(X) # LvdM uses 3 * perplexity as the number of neighbors # And we add one to not count the data point itself # In the event that we have very small # of points # set the neighbors to n - 1 distances_nn, neighbors_nn = bt.query(X, k=k + 1) neighbors_nn = neighbors_nn[:, 1:] elif self.rho < 1: # Use pyFLANN to find the nearest neighbors myflann = FLANN() testset = X params = myflann.build_index(testset, algorithm="autotuned", target_precision=self.rho, log_level='info') neighbors_nn, distances = myflann.nn_index( testset, k + 1, checks=params["checks"]) neighbors_nn = neighbors_nn[:, 1:] P = _joint_probabilities_nn(distances, neighbors_nn, self.perplexity, self.verbose) else: P = _joint_probabilities(distances, self.perplexity, self.verbose) assert np.all(np.isfinite(P)), "All probabilities should be finite" assert np.all(P >= 0), "All probabilities should be zero or positive" assert np.all(P <= 1), ("All probabilities should be less " "or then equal to one") if isinstance(self.init, np.ndarray): X_embedded = self.init elif self.init == 'pca': pca = PCA(n_components=self.n_components, svd_solver='randomized', random_state=random_state) X_embedded = pca.fit_transform(X) elif self.init == 'random': X_embedded = None else: raise ValueError("Unsupported initialization scheme: %s" % self.init) return self._tsne(P, degrees_of_freedom, n_samples, random_state, X_embedded=X_embedded, neighbors=neighbors_nn, skip_num_points=skip_num_points)