from os import path
from typing import List

import pandas as pd
import torch
from openTSNE import TSNEEmbedding, affinity, initialization


def tsne_emb(model_path: str, proc_dir: str, layer_name: str,
             perplexities: List[int], n_iter: int):
    # Load the checkpoint on CPU and pull out the named weight matrix
    model = torch.load(model_path, map_location=torch.device('cpu'))['model']
    w = model[layer_name].numpy()
    # Multiscale affinities mix several perplexities to capture both local
    # and global structure
    affinities_multiscale_mixture = affinity.Multiscale(
        w, perplexities=perplexities, metric="cosine", n_jobs=-1, random_state=3)
    init = initialization.pca(w, random_state=42)
    embedding = TSNEEmbedding(init, affinities_multiscale_mixture,
                              negative_gradient_method="fft", n_jobs=-1,
                              random_state=4, verbose=True)
    embedding = embedding.optimize(n_iter=n_iter, exaggeration=None,
                                   momentum=0.8, learning_rate="auto")
    # Persist the 2-D coordinates and the final KL divergence
    df = pd.DataFrame(embedding, columns=['x', 'y'])
    df.to_csv(path.join(proc_dir, 'tsne_emb.csv'), index=False)
    with open(path.join(proc_dir, 'kl_divergence'), 'w') as f:
        f.write(f'{embedding.kl_divergence:.4f}')

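# Hypothetical usage sketch (not from the original source): the checkpoint path,
# output directory, and layer name below are placeholders for illustration only.
if __name__ == '__main__':
    tsne_emb(
        model_path='checkpoints/model.pt',   # assumed checkpoint holding a 'model' state dict
        proc_dir='processed',                # assumed existing output directory
        layer_name='embedding.weight',       # assumed key of the weight matrix to embed
        perplexities=[30, 300],
        n_iter=500,
    )
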
def test_affinity(self):
    init = initialization.random(self.x, random_state=0)
    for aff in [
        affinity.PerplexityBasedNN(self.x, perplexity=30),
        affinity.Uniform(self.x, k_neighbors=30),
        affinity.FixedSigmaNN(self.x, sigma=1, k=30),
        affinity.Multiscale(self.x, perplexities=[10, 30]),
        affinity.MultiscaleMixture(self.x, perplexities=[10, 30]),
    ]:
        # Without initialization
        embedding = TSNE().fit(affinities=aff)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)
        new_embedding = embedding.prepare_partial(self.x)
        new_embedding.optimize(10, learning_rate=0.1, inplace=True)
        self.eval_embedding(new_embedding, self.y,
                            f"transform::{aff.__class__.__name__}")

        # With initialization
        embedding = TSNE().fit(affinities=aff, initialization=init)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)
        new_embedding = embedding.prepare_partial(self.x)
        new_embedding.optimize(10, learning_rate=0.1, inplace=True)
        self.eval_embedding(new_embedding, self.y,
                            f"transform::{aff.__class__.__name__}")

def test_affinity_with_precomputed_neighbors(self):
    nn = NearestNeighbors(n_neighbors=30)
    nn.fit(self.x)
    distances, neighbors = nn.kneighbors(n_neighbors=30)
    knn_index = nearest_neighbors.PrecomputedNeighbors(neighbors, distances)
    init = initialization.random(self.x, random_state=0)
    for aff in [
        affinity.PerplexityBasedNN(knn_index=knn_index, perplexity=30),
        affinity.Uniform(knn_index=knn_index, k_neighbors=30),
        affinity.FixedSigmaNN(knn_index=knn_index, sigma=1),
        affinity.Multiscale(knn_index=knn_index, perplexities=[10, 20]),
        affinity.MultiscaleMixture(knn_index=knn_index, perplexities=[10, 20]),
    ]:
        # Without initialization
        embedding = TSNE().fit(affinities=aff)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)

        # With initialization
        embedding = TSNE().fit(affinities=aff, initialization=init)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)

def test_affinity_with_precomputed_distances(self):
    d = squareform(pdist(self.x))
    knn_index = nearest_neighbors.PrecomputedDistanceMatrix(d, k=30)
    init = initialization.random(self.x, random_state=0)
    for aff in [
        affinity.PerplexityBasedNN(knn_index=knn_index, perplexity=30),
        affinity.Uniform(knn_index=knn_index, k_neighbors=30),
        affinity.FixedSigmaNN(knn_index=knn_index, sigma=1),
        affinity.Multiscale(knn_index=knn_index, perplexities=[10, 20]),
        affinity.MultiscaleMixture(knn_index=knn_index, perplexities=[10, 20]),
    ]:
        # Without initialization
        embedding = TSNE().fit(affinities=aff)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)

        # With initialization
        embedding = TSNE().fit(affinities=aff, initialization=init)
        self.eval_embedding(embedding, self.y, aff.__class__.__name__)

import logging
from typing import Callable, Union

import numpy as np
from openTSNE import TSNEEmbedding, affinity, initialization


def art_of_tsne(X: np.ndarray,
                metric: Union[str, Callable] = "euclidean",
                exaggeration: float = -1,
                perplexity: int = 30,
                n_jobs: int = -1) -> TSNEEmbedding:
    """
    Implementation of Dmitry Kobak and Philipp Berens' "The art of using t-SNE
    for single-cell transcriptomics", based on openTSNE.
    See https://doi.org/10.1038/s41467-019-13056-x | www.nature.com/naturecommunications

    Args:
        X             The data matrix of shape (n_cells, n_genes), i.e. (n_samples, n_features)
        metric        Any metric allowed by PyNNDescent (default: 'euclidean')
        exaggeration  The exaggeration to use for the embedding (-1 selects a size-dependent default)
        perplexity    The perplexity to use for the embedding
        n_jobs        Number of parallel jobs (default: -1, all cores)

    Returns:
        The embedding as an opentsne.TSNEEmbedding object (which can be cast to an np.ndarray)
    """
    n = X.shape[0]
    if n > 100_000:
        if exaggeration == -1:
            exaggeration = 1 + n / 333_333
        # Subsample, optimize, then add the remaining cells and optimize again,
        # using the exaggeration computed above
        logging.info(f"Creating subset of {n // 40} elements")
        # Subsample and run a regular art_of_tsne on the subset
        indices = np.random.permutation(n)
        reverse = np.argsort(indices)
        X_sample, X_rest = X[indices[:n // 40]], X[indices[n // 40:]]
        logging.info("Embedding subset")
        Z_sample = art_of_tsne(X_sample)
        logging.info(f"Preparing partial initial embedding of the {n - n // 40} remaining elements")
        if isinstance(Z_sample.affinities, affinity.Multiscale):
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexities=[1 / 3, 1 / 3])
        else:
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexity=1 / 3)
        logging.info("Combining the initial embeddings, and standardizing")
        init_full = np.vstack((Z_sample, rest_init))[reverse]
        init_full = init_full / (np.std(init_full[:, 0]) * 10000)
        logging.info("Creating perplexity-based affinities")
        affinities = affinity.PerplexityBasedNN(X, perplexity=perplexity, metric=metric,
                                                method="approx", n_jobs=n_jobs)
        logging.info("Creating TSNE embedding")
        Z = TSNEEmbedding(init_full, affinities,
                          negative_gradient_method="fft", n_jobs=n_jobs)
        logging.info("Optimizing, stage 1")
        Z.optimize(n_iter=250, inplace=True, exaggeration=12, momentum=0.5,
                   learning_rate=n / 12, n_jobs=n_jobs)
        logging.info("Optimizing, stage 2")
        Z.optimize(n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8,
                   learning_rate=n / 12, n_jobs=n_jobs)
    elif n > 3_000:
        if exaggeration == -1:
            exaggeration = 1
        # Use multiscale perplexity
        affinities_multiscale_mixture = affinity.Multiscale(X, perplexities=[perplexity, n / 100],
                                                            metric=metric, method="approx", n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init, affinities_multiscale_mixture,
                          negative_gradient_method="fft", n_jobs=n_jobs)
        Z.optimize(n_iter=250, inplace=True, exaggeration=12, momentum=0.5,
                   learning_rate=n / 12, n_jobs=n_jobs)
        Z.optimize(n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8,
                   learning_rate=n / 12, n_jobs=n_jobs)
    else:
        if exaggeration == -1:
            exaggeration = 1
        # Just a plain TSNE with high learning rate
        lr = max(200, n / 12)
        aff = affinity.PerplexityBasedNN(X, perplexity=perplexity, metric=metric,
                                         method="approx", n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init, aff, learning_rate=lr, n_jobs=n_jobs,
                          negative_gradient_method="fft")
        Z.optimize(250, exaggeration=12, momentum=0.5, inplace=True, n_jobs=n_jobs)
        Z.optimize(750, exaggeration=exaggeration, momentum=0.8, inplace=True, n_jobs=n_jobs)
    return Z

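# Hypothetical usage sketch (not part of the original source); the data here is
# random noise, purely to illustrate the call and the return type.
if __name__ == "__main__":
    X = np.random.RandomState(0).normal(size=(5_000, 50))
    Z = art_of_tsne(X)
    xy = np.asarray(Z)  # a TSNEEmbedding can be cast to a plain np.ndarray
    print(xy.shape)     # (5000, 2)
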
# `ctgs`, `fp` (an open file of per-contig profiles), and `cores` come from the
# surrounding script.
total = len(ctgs)
profile = np.ones((total, 256), dtype=np.float64)
for i, ctg in enumerate(ctgs):
    raw = fp[ctg][:]
    # Normalize each 256-bin profile so its values sum to 256
    norm = raw * 256.0 / raw.sum()
    profile[i, :] = norm

## 2. Run t-SNE
seed = 211
threads = int(cores)
affinities_multiscale_mixture = affinity.Multiscale(
    profile,
    perplexities=[30, 300],
    metric="cosine",
    n_jobs=threads,
    random_state=seed,
)
init = initialization.pca(profile, random_state=seed)
embedding = TSNEEmbedding(
    init,
    affinities_multiscale_mixture,
    negative_gradient_method="fft",
    n_jobs=threads,
)
# Early-exaggeration phase, then the longer fine-tuning phase
embedding1 = embedding.optimize(n_iter=250, exaggeration=6, momentum=0.5)
embedding2 = embedding1.optimize(n_iter=750, exaggeration=1, momentum=0.8)
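
# Hypothetical follow-up (not in the original script): materialize the final
# embedding as a plain array and key the 2-D coordinates by contig name.
coords = np.asarray(embedding2)  # shape (total, 2)
ctg_coords = {ctg: coords[i] for i, ctg in enumerate(ctgs)}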