def test_binary_search_neighbors():
    # Binary perplexity search approximation.
    # Should be approximately equal to the slow method when we use
    # all points as neighbors.
    n_samples = 500
    desired_perplexity = 25.0
    random_state = check_random_state(0)
    distances = random_state.randn(n_samples, 2).astype(np.float32)
    # Distances shouldn't be negative
    distances = np.abs(distances.dot(distances.T))
    np.fill_diagonal(distances, 0.0)
    P1 = _binary_search_perplexity(distances, None, desired_perplexity,
                                   verbose=0)

    # Test that when we use all the neighbors the results are identical
    k = n_samples
    neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
    P2 = _binary_search_perplexity(distances, neighbors_nn,
                                   desired_perplexity, verbose=0)
    assert_array_almost_equal(P1, P2, decimal=4)

    # P1 never changes inside the loop below, so its descending ordering is
    # computed once here instead of once per iteration.
    P1_flat = P1.ravel()
    idx = np.argsort(P1_flat)[::-1]
    P1_descending = P1_flat[idx]

    # Test that the highest P_ij are the same when few neighbors are used
    for k in np.linspace(80, n_samples, 10):
        k = int(k)
        topn = k * 10  # check the top 10 * k entries out of k * k entries
        neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
        P2k = _binary_search_perplexity(distances, neighbors_nn,
                                        desired_perplexity, verbose=0)
        # Both arrays are read in the order of the largest P1 entries.
        P1top = P1_descending[:topn]
        P2top = P2k.ravel()[idx][:topn]
        assert_array_almost_equal(P1top, P2top, decimal=2)
def test_binary_perplexity_stability():
    # Repeated runs of the binary perplexity search must give the same
    # answer. A past bug left the P array uninitialized, which made tests
    # fail sporadically.
    n_samples = 100
    n_neighbors = 10
    desired_perplexity = 3

    rng = check_random_state(0)
    data = rng.randn(n_samples, 5)
    distance_graph = NearestNeighbors().fit(data).kneighbors_graph(
        n_neighbors=n_neighbors, mode='distance')
    distances = distance_graph.data.astype(np.float32, copy=False).reshape(
        n_samples, n_neighbors)

    # Results of the first run; every later run is compared against them.
    last_P = None
    last_P1 = None
    for _ in range(100):
        P = _binary_search_perplexity(distances.copy(), desired_perplexity,
                                      verbose=0)
        # The joint probabilities come back sparse; densify for comparison.
        P1 = _joint_probabilities_nn(distance_graph, desired_perplexity,
                                     verbose=0).toarray()
        if last_P is None:
            last_P, last_P1 = P, P1
            continue
        assert_array_almost_equal(P, last_P, decimal=4)
        assert_array_almost_equal(P1, last_P1, decimal=4)
def _joint_probabilities(distances, desired_perplexity, verbose):
    """Turn a distance array into normalized joint probabilities p_ij.

    Parameters
    ----------
    distances : array
        Distances between samples. Cast to float32 (no copy when the dtype
        already matches) and forwarded to the binary perplexity search.
        NOTE(review): the search result is transposed and then passed to
        ``squareform``, which suggests a square (n_samples, n_samples)
        input rather than a condensed one -- confirm against callers.

    desired_perplexity : float
        Perplexity the conditional distributions are fitted to.

    verbose : int
        Verbosity level, forwarded to the perplexity search.

    Returns
    -------
    P : array
        ``squareform`` of the symmetrized conditional probabilities,
        divided by their total sum and floored at MACHINE_EPSILON.
    """
    distances32 = distances.astype(np.float32, copy=False)
    # No neighbor restriction: None is passed as the neighbors argument.
    cond = _utils._binary_search_perplexity(
        distances32, None, desired_perplexity, verbose)

    # Symmetrize, then normalize by the (guarded) total mass.
    symmetric = cond + cond.T
    total = np.maximum(np.sum(symmetric), MACHINE_EPSILON)
    return np.maximum(squareform(symmetric) / total, MACHINE_EPSILON)
def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
    """Compute joint probabilities p_ij with a neighbor-restricted search.

    Same pipeline as ``_joint_probabilities``, except that a neighbor index
    array is forwarded to the binary perplexity search.

    Parameters
    ----------
    distances : array
        Distances between samples; cast to float32 before the search.
        NOTE(review): exact expected shape is not visible here -- confirm
        against the callers.

    neighbors : array
        Neighbor indices; cast to int64 before the search.

    desired_perplexity : float
        Perplexity the conditional distributions are fitted to.

    verbose : int
        Verbosity level, forwarded to the perplexity search.

    Returns
    -------
    P : array
        ``squareform`` of the symmetrized conditional probabilities,
        divided by their total sum and floored at MACHINE_EPSILON.
    """
    distances32 = astype(distances, np.float32, copy=False)
    neighbors64 = astype(neighbors, np.int64, copy=False)
    cond = _utils._binary_search_perplexity(distances32, neighbors64,
                                            desired_perplexity, verbose)
    m = "All probabilities should be finite"
    assert np.all(np.isfinite(cond)), m

    # Symmetrize and normalize by the (guarded) total mass.
    symmetric = cond + cond.T
    total = np.maximum(np.sum(symmetric), MACHINE_EPSILON)
    joint = np.maximum(squareform(symmetric) / total, MACHINE_EPSILON)
    assert np.all(np.abs(joint) <= 1.0)
    return joint
def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
    """Joint probabilities p_ij from distances and a neighbor index array.

    Parameters
    ----------
    distances : array
        Distances between samples, converted to float32 for the search.
        NOTE(review): expected shape is not established by this function --
        verify against the callers.

    neighbors : array
        Neighbor indices, converted to int64 for the search.

    desired_perplexity : float
        Target perplexity of the conditional distributions.

    verbose : int
        Verbosity level passed on to the perplexity search.

    Returns
    -------
    P : array
        Symmetrized conditional probabilities run through ``squareform``,
        normalized by their total sum and clipped from below at
        MACHINE_EPSILON.
    """
    # Conditional probabilities approximately matching the perplexity.
    conditional = _utils._binary_search_perplexity(
        astype(distances, np.float32, copy=False),
        astype(neighbors, np.int64, copy=False),
        desired_perplexity,
        verbose,
    )
    m = "All probabilities should be finite"
    assert np.all(np.isfinite(conditional)), m

    sym = conditional + conditional.T
    # Guard the normalizer so the division below never divides by zero.
    normalizer = np.maximum(np.sum(sym), MACHINE_EPSILON)
    result = squareform(sym) / normalizer
    result = np.maximum(result, MACHINE_EPSILON)
    assert np.all(np.abs(result) <= 1.0)
    return result
def _joint_probabilities(distances, desired_perplexity, verbose):
    """Compute normalized joint probabilities p_ij from distances.

    Parameters
    ----------
    distances : array
        Distances between samples; converted to float32 through the
        ``astype`` helper before the perplexity search.
        NOTE(review): the search result is transposed and fed to
        ``squareform``, which suggests a square matrix input -- confirm.

    desired_perplexity : float
        Target perplexity of the conditional distributions.

    verbose : int
        Verbosity level passed on to the perplexity search.

    Returns
    -------
    P : array
        ``squareform`` of the symmetrized conditional probabilities,
        divided by their total sum and floored at MACHINE_EPSILON.
    """
    as_float32 = astype(distances, np.float32, copy=False)
    # None as the second argument: no neighbor array is supplied.
    conditional = _utils._binary_search_perplexity(
        as_float32, None, desired_perplexity, verbose)

    sym = conditional + conditional.T
    normalizer = np.maximum(np.sum(sym), MACHINE_EPSILON)
    return np.maximum(squareform(sym) / normalizer, MACHINE_EPSILON)
def test_binary_perplexity_stability():
    # Repeated runs of the binary perplexity search must agree.
    # A past bug left the P array uninitialized, which made tests fail
    # sporadically.
    n_samples = 100
    k = 10
    rng = check_random_state(0)
    points = rng.randn(n_samples, 2).astype(np.float32)
    # Take absolute values so the "distances" are never negative.
    distances = np.abs(points.dot(points.T))
    np.fill_diagonal(distances, 0.0)
    neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)

    # Results of the first run; every later run is compared against them.
    last_P = None
    last_P1 = None
    for _ in range(100):
        P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(),
                                      3, verbose=0)
        # Convert the sparse matrix to a dense one for testing.
        P1 = _joint_probabilities_nn(distances, neighbors_nn, 3,
                                     verbose=0).toarray()
        if last_P is None:
            last_P, last_P1 = P, P1
            continue
        assert_array_almost_equal(P, last_P, decimal=4)
        assert_array_almost_equal(P1, last_P1, decimal=4)
def _joint_probabilities(distances, desired_perplexity, verbose=0):
    """Compute normalized joint probabilities from a distance array.

    The distances are copied to float32, run through the binary perplexity
    search (no neighbor array is passed), symmetrized, normalized by their
    total sum and floored at MACHINE_EPSILON_NP.

    NOTE(review): the transpose followed by ``squareform`` suggests a square
    distance matrix in and a condensed vector out -- confirm with callers.
    """
    # copy=True: always work on a private float32 copy of the input.
    distances32 = distances.astype(np.float32, copy=True)
    cond = _binary_search_perplexity(distances32, None, desired_perplexity,
                                     verbose)
    symmetric = cond + cond.T
    # Guard the normalizer against a zero total.
    total = np.maximum(np.sum(symmetric), MACHINE_EPSILON_NP)
    return np.maximum(squareform(symmetric) / total, MACHINE_EPSILON_NP)
def _joint_probabilities(distances, desired_perplexity, verbose):
    """Compute normalized joint probabilities from a distance array.

    The distances are cast to float32, handed to the binary perplexity
    search, and the resulting conditional probabilities are symmetrized,
    normalized by their total sum and floored at MACHINE_EPSILON.

    NOTE(review): the transpose followed by ``squareform`` suggests a square
    distance matrix in and a condensed vector out -- confirm with callers.
    """
    # Conditional probabilities that approximately match the perplexity.
    conditional = _utils._binary_search_perplexity(
        distances.astype(np.float32, copy=False),
        desired_perplexity,
        verbose,
    )
    sym = conditional + conditional.T
    normalizer = np.maximum(np.sum(sym), MACHINE_EPSILON)
    joint = squareform(sym) / normalizer
    return np.maximum(joint, MACHINE_EPSILON)
def test_binary_search_underflow():
    # Check that the binary search reaches the desired perplexity on a
    # harder input that produces numeric underflow in float precision
    # (see issue #19471 and PR #19472).
    desired_perplexity = 30.0
    rng = check_random_state(42)
    data = rng.randn(1, 90).astype(np.float32) + 100

    P = _binary_search_perplexity(data, desired_perplexity, verbose=0)

    # Perplexity of the first row, skipping its first entry; nansum ignores
    # NaN terms in the entropy sum.
    row = P[0, 1:]
    entropy = -np.nansum(row * np.log2(row))
    perplexity = 2 ** entropy
    assert_almost_equal(perplexity, desired_perplexity, decimal=3)
def test_binary_search_neighbors():
    # Binary perplexity search approximation.
    # Should be approximately equal to the slow method when we use
    # all points as neighbors.
    n_samples = 200
    desired_perplexity = 25.0
    random_state = check_random_state(0)
    data = random_state.randn(n_samples, 2).astype(np.float32, copy=False)
    distances = pairwise_distances(data)
    P1 = _binary_search_perplexity(distances, desired_perplexity, verbose=0)

    # Test that when we use all the neighbors the results are identical
    n_neighbors = n_samples - 1
    nn = NearestNeighbors().fit(data)
    distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
                                         mode='distance')
    distances_nn = distance_graph.data.astype(np.float32, copy=False)
    distances_nn = distances_nn.reshape(n_samples, n_neighbors)
    P2 = _binary_search_perplexity(distances_nn, desired_perplexity,
                                   verbose=0)

    # Pick, for every row, the dense P1 entries at that row's neighbor
    # columns so they line up with the neighbor-ordered P2.
    indptr = distance_graph.indptr
    P1_nn = np.array([
        P1[k, distance_graph.indices[indptr[k]:indptr[k + 1]]]
        for k in range(n_samples)
    ])
    assert_array_almost_equal(P1_nn, P2, decimal=4)

    # P1 does not change inside the loop, so sort it once up front.
    P1_flat = P1.ravel()
    P1_descending = P1_flat[np.argsort(P1_flat)[::-1]]

    # Test that the highest P_ij are the same when fewer neighbors are used
    for k in np.linspace(150, n_samples - 1, 5):
        k = int(k)
        topn = k * 10  # check the top 10 * k entries out of k * k entries
        distance_graph = nn.kneighbors_graph(n_neighbors=k, mode='distance')
        distances_nn = distance_graph.data.astype(np.float32, copy=False)
        distances_nn = distances_nn.reshape(n_samples, k)
        P2k = _binary_search_perplexity(distances_nn, desired_perplexity,
                                        verbose=0)

        P2k_flat = P2k.ravel()
        P1top = P1_descending[:topn]
        P2top = P2k_flat[np.argsort(P2k_flat)[::-1]][:topn]
        assert_array_almost_equal(P1top, P2top, decimal=2)
def _joint_probabilities_nn(distances, neighbors, desired_perplexity, verbose):
    """Compute sparse joint probabilities p_ij from nearest-neighbor data.

    Parameters
    ----------
    distances : array, shape (n_samples, k)
        Distances of each sample to its k nearest neighbors.

    neighbors : array, shape (n_samples, k)
        Indices of the k nearest neighbors of each sample.

    desired_perplexity : float
        Target perplexity of the conditional distributions.

    verbose : int
        Verbosity level; the elapsed time is printed when it is >= 2.

    Returns
    -------
    P : sparse matrix, shape (n_samples, n_samples)
        Symmetrized conditional probabilities divided by their total sum.
    """
    start = time()
    n_samples, k = neighbors.shape

    # Conditional probabilities approximately matching the perplexity.
    distances32 = distances.astype(np.float32, copy=False)
    neighbors64 = neighbors.astype(np.int64, copy=False)
    conditional = _utils._binary_search_perplexity(
        distances32, neighbors64, desired_perplexity, verbose)
    assert np.all(np.isfinite(conditional)), \
        "All probabilities should be finite"

    # Every row holds exactly k entries, so row boundaries fall on
    # multiples of k.
    row_bounds = range(0, n_samples * k + 1, k)
    P = csr_matrix(
        (conditional.ravel(), neighbors64.ravel(), row_bounds),
        shape=(n_samples, n_samples))
    P = P + P.T

    # Normalize by the total mass, guarded against zero.
    total = np.maximum(P.sum(), MACHINE_EPSILON)
    P /= total
    assert np.all(np.abs(P.data) <= 1.0)

    if verbose >= 2:
        duration = time() - start
        print("[t-SNE] Computed conditional probabilities in {:.3f}s"
              .format(duration))
    return P
def test_binary_search():
    # Test if the binary search finds Gaussians with desired perplexity.
    desired_perplexity = 25.0
    rng = check_random_state(0)
    data = rng.randn(50, 5)
    distances = pairwise_distances(data).astype(np.float32)

    P = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
    # Floor at double-precision eps so log() never sees a zero.
    P = np.maximum(P, np.finfo(np.double).eps)

    # Perplexity of a row is exp(entropy) with the natural logarithm.
    row_perplexities = [np.exp(-np.sum(row * np.log(row))) for row in P]
    mean_perplexity = np.mean(row_perplexities)
    assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3)
def _joint_probabilities_nn(distances, desired_perplexity, verbose):
    """Compute sparse joint probabilities p_ij from a neighbor distance graph.

    Parameters
    ----------
    distances : CSR sparse matrix, shape (n_samples, n_samples)
        Distances of each sample to its nearest neighbors. Its indices are
        sorted in place, and its data is reshaped to n_samples rows, so
        every row must store the same number of entries.

    desired_perplexity : float
        Target perplexity of the conditional distributions.

    verbose : int
        Verbosity level; the elapsed time is printed when it is >= 2.

    Returns
    -------
    P : sparse matrix, shape (n_samples, n_samples)
        Symmetrized conditional probabilities divided by their total sum.
    """
    start = time()

    # Note: this sorts the caller's matrix in place.
    distances.sort_indices()
    n_samples = distances.shape[0]
    per_row = distances.data.reshape(n_samples, -1).astype(np.float32,
                                                           copy=False)

    # Conditional probabilities approximately matching the perplexity.
    conditional = _utils._binary_search_perplexity(
        per_row, desired_perplexity, verbose)
    assert np.all(np.isfinite(conditional)), \
        "All probabilities should be finite"

    # Reuse the sparsity structure of the distance graph for P.
    P = csr_matrix(
        (conditional.ravel(), distances.indices, distances.indptr),
        shape=(n_samples, n_samples))
    P = P + P.T

    # Normalize by the total mass, guarded against zero.
    total = np.maximum(P.sum(), MACHINE_EPSILON)
    P /= total
    assert np.all(np.abs(P.data) <= 1.0)

    if verbose >= 2:
        duration = time() - start
        print("[t-SNE] Computed conditional probabilities in {:.3f}s"
              .format(duration))
    return P
def test_binary_search():
    """Check that the binary search reaches the desired perplexity."""
    desired_perplexity = 25.0
    rng = check_random_state(0)
    points = rng.randn(50, 2)
    # Pairwise inner products with a zeroed diagonal serve as the input.
    distances = points.dot(points.T)
    np.fill_diagonal(distances, 0.0)

    P = _binary_search_perplexity(distances, desired_perplexity, verbose=0)
    # Floor at double-precision eps so log() never sees a zero.
    P = np.maximum(P, np.finfo(np.double).eps)

    row_perplexities = [np.exp(-np.sum(row * np.log(row))) for row in P]
    mean_perplexity = np.mean(row_perplexities)
    assert_almost_equal(mean_perplexity, desired_perplexity, decimal=3)
def test_binary_search():
    """Verify the binary search hits the requested perplexity on average."""
    target = 25.0
    state = check_random_state(0)
    samples = state.randn(50, 2)
    # Pairwise inner products with a zeroed diagonal serve as the input.
    distances = samples.dot(samples.T)
    np.fill_diagonal(distances, 0.0)

    P = _binary_search_perplexity(distances, target, verbose=0)
    # Clip from below so the logarithm is always finite.
    P = np.maximum(P, np.finfo(np.double).eps)

    perplexities = []
    for row in P:
        entropy = -np.sum(row * np.log(row))
        perplexities.append(np.exp(entropy))
    mean_perplexity = np.mean(perplexities)
    assert_almost_equal(mean_perplexity, target, decimal=3)
def joint_probabilities_nn(D, target_PP):
    """Compute sparse joint probabilities from a sparse distance graph.

    Parameters
    ----------
    D : sparse matrix, shape (N, N)
        Distance graph. Its indices are sorted in place, and its data is
        reshaped to N rows, so every row must store the same number of
        entries.
    target_PP : float
        Perplexity passed to the binary perplexity search.

    Returns
    -------
    P : sparse matrix, shape (N, N)
        Symmetrized conditional probabilities divided by their total sum.
    """
    from sklearn.manifold._utils import _binary_search_perplexity
    from scipy.sparse import csr_matrix

    # Note: this sorts the caller's matrix in place.
    D.sort_indices()
    n_rows = D.shape[0]
    per_row = D.data.reshape(n_rows, -1).astype(np.float32, copy=False)

    # The third argument is the verbosity level (0).
    conditional = _binary_search_perplexity(per_row, target_PP, 0)
    assert np.all(np.isfinite(conditional))

    # Reuse the sparsity structure of D for the probability matrix.
    P = csr_matrix((conditional.ravel(), D.indices, D.indptr),
                   shape=(n_rows, n_rows))
    P = P + P.T

    # Normalize by the total mass, guarded against zero.
    total = np.maximum(P.sum(), np.finfo(np.double).eps)
    P /= total
    assert np.all(np.abs(P.data) <= 1.0)
    return P
def sample_weights_tsne_symmetric(data, perplexity, symmetric):
    """
    Compute sample-to-sample weights with the t-SNE perplexity procedure.

    Squared distances between every pair of samples are fed to the binary
    perplexity search; the result is optionally symmetrized and then
    normalized so that every row sums to 1.

    data: pandas.DataFrame
        data matrix with samples as columns
    perplexity: float
        binary search perplexity target
    symmetric: boolean
        whether or not to symmetrize the weights

    Returns a pandas.DataFrame indexed by the columns of `data` on both
    axes.
    """
    sample_names = data.columns
    values = data.values

    # Squared distance via |a|^2 + |b|^2 - 2 a.b, accumulated in place.
    sq_norms = np.sum(values ** 2, axis=0, keepdims=True)
    sq_dist = -2 * np.dot(values.T, values)
    sq_dist += sq_norms
    # Transposing lets the second add supply the other sample's norm.
    sq_dist = sq_dist.T
    sq_dist += sq_norms
    np.fill_diagonal(sq_dist, 0)
    sq_dist = sq_dist.astype('float32')

    # Run the t-SNE perplexity search (verbosity 0).
    weights = _binary_search_perplexity(sq_dist, perplexity, 0)

    if symmetric:
        # Average each weight with its mirror entry.
        weights = (weights + weights.T) / 2

    # Make the rows sum to 1.
    weights /= weights.sum(axis=1, keepdims=True)

    return pd.DataFrame(weights, index=sample_names, columns=sample_names)
method = 'barnes_hut' angle = 0.5 n_samples = X.shape[0] neighbors_nn = None k = min(n_samples - 1, int(3. * perplexity + 1)) knn = NearestNeighbors(algorithm='auto', n_neighbors=k, metric=metric) knn.fit(X) distances_nn, neighbors_nn = knn.kneighbors(None, n_neighbors=k) del knn distances_nn **= 2 # in the function _joint_probabilities_nn ----from here distances = distances_nn.astype(np.float32, copy=True) neighbors = neighbors_nn.astype(np.int64, copy=True) conditional_P = _utils._binary_search_perplexity(distances, neighbors, perplexity, verbose) P = csr_matrix( (conditional_P.ravel(), neighbors.ravel(), range(0, n_samples * k + 1, k)), shape=(n_samples, n_samples)) P = P + P.T MACHINE_EPSILON = np.finfo(np.double).eps sum_P = np.maximum(P.sum(), MACHINE_EPSILON) P /= sum_P # in the function _joint_probabilities_nn ----till here # simplified _binary_search_perplexity() from github ---- from here # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/manifold/_utils.pyx sqdistances = distances desired_entropy = np.log(perplexity) PERPLEXITY_TOLEARANCE = 1e-5 Pi = np.zeros(sqdistances.shape)