def calculate_TSNE(self, indecies=None, sample=5000, n_jobs=1, multicore=False, **kwargs):
    """Calculate the TSNE and store it in the h5 object.

    Args:
        indecies (list): select a subset based on indecies
        sample (int): number of cells to downsample to; if None use them all
        n_jobs (int): number of cpus to use (if multicore is True)
        multicore (bool): use the MultiCoreTSNE package
        **kwargs: pass any other arguments to TSNE
    """
    # Guard: the backing h5 object must be writable before we cache results.
    if self.mode not in ['w', 'r+', 'a']:
        raise ValueError('cant write for readonly')
    dsdata = self.fractions.copy()
    if indecies is not None:
        dsdata = dsdata.loc[indecies]
    if sample is not None:
        # Never ask for more rows than exist.
        if dsdata.shape[0] < sample:
            sample = dsdata.shape[0]
        dsdata = dsdata.sample(n=sample)
    if self.verbose:
        sys.stderr.write("executing TSNE decomposition on " + str(dsdata.shape[0]) + " cells\n")
    if multicore:
        # Optional dependency — imported lazily so single-core use works without it.
        from MulticoreTSNE import MulticoreTSNE as mTSNE
        if self.verbose:
            sys.stderr.write("Using MulticoreTSNE\n")
        tsne = mTSNE(n_jobs=n_jobs, **kwargs).fit_transform(dsdata)
    else:
        if self.verbose:
            sys.stderr.write("Using sklearn TSNE\n")
        tsne = TSNE(**kwargs).fit_transform(dsdata)
    tsne = pd.DataFrame(tsne, columns=['x', 'y'])
    tsne.index = dsdata.index
    # Attach cluster metadata by joining on the shared db_id key.
    tsne = tsne.reset_index().merge(
        self.clusters.reset_index()[['cluster_id', 'k', 'db_id'] + self.groupby],
        on=['db_id'])
    tsne['cluster_name'] = tsne['cluster_id'].astype(str)
    self.tsne = tsne
def calculate_tsne(self, path, p, s_size=0, force_overwrite=False):
    """
    Function applied to RunStats object to calculate dimensionality
    reduction using TSNE.

    :param path: Results path
    :param p: perplexity value passed to TSNE
    :param s_size: sample size used to build the document-term matrix
    :param force_overwrite: (default: False) Overrides already existing results
    :return: tuple of (tsne_results ndarray, r_ind row index)
    """
    # r_ind is needed on every path, so the matrix is always built;
    # c_ind is unused here.
    m, c_ind, r_ind = self.dt_matrix(path, s_size)
    results_path = f"{path}/run_{self.pk}_s_{s_size}_p_{p}_results.npy"
    # Cache hit: only load from disk when we are not going to recompute
    # (the original loaded the file even when force_overwrite was True,
    # then threw the array away).
    if os.path.exists(results_path) and not force_overwrite:
        print("We've already calculated the tsne positions")
        return np.load(results_path), r_ind
    tsne = mTSNE(n_components=2, verbose=0, perplexity=p, n_jobs=4)
    tsne_results = tsne.fit_transform(m.toarray())
    np.save(results_path, tsne_results)
    return tsne_results, r_ind
def run_tsne(data, return_tsne=True, tsne_plot=True, n_jobs=1, n_components=2, perplexity=40, learning_rate=200):
    """Run multicore t-SNE on the embedding stored in an AnnData object.

    :param data: AnnData object produced by desc; must carry
        ``data.obsm['embedded']``
    :param return_tsne: if True, return the computed coordinates
    :param tsne_plot: if True, draw a scatter plot colored by ``data.obs['ident']``
    :param n_jobs: number of worker processes for MulticoreTSNE
    :param n_components: output dimensionality
    :param perplexity: t-SNE perplexity
    :param learning_rate: t-SNE learning rate
    :return: the t-SNE coordinates, or None when return_tsne is False
    """
    assert isinstance(
        data, AnnData), 'Required input should be an AnnData object from desc'
    # Bug fix: obsm is a mapping and is read below via data.obsm['embedded'],
    # so the guard must test key membership, not attribute presence.
    assert 'embedded' in data.obsm, 'The embedded matrix not found, run desc first'
    tsne = mTSNE(n_jobs=n_jobs, n_components=n_components,
                 perplexity=perplexity, learning_rate=learning_rate). \
        fit_transform(data.obsm['embedded'])
    data.obsm['tsne'] = tsne
    if tsne_plot:
        plt.scatter(tsne[:, 0], tsne[:, 1], s=0.1, c=data.obs['ident'])
        plt.title("tSNE plot")
        plt.xlabel("tSNE_1")
        plt.ylabel("tSNE_2")
    return tsne if return_tsne else None
# Sweep t-SNE over sample sizes and perplexities for one run, caching the
# matrix, the row index, and each embedding to disk, and skipping any
# perplexity whose plot already exists.
for s_size in [0]:
    print(s_size)
    m, c_ind, r_ind = get_matrix(run_id, s_size)
    print("got m")
    np.save(
        '../tsne_results/data/run_{}_s_{}_m.npy'.format(run_id, s_size), m)
    np.save(
        '../tsne_results/data/run_{}_s_{}_r_ind.npy'.format(
            run_id, s_size), r_ind)
    #for p in [40,50,60,70,90,150]:
    for p in [20, 50, 100, 200]:
        fname = "../tsne_results/plots/run_{}_s_{}_p_{}.png".format(
            run_id, s_size, p)
        print("Doing tsne with run {}, on {} docs, with {} perplexity".
              format(run_id, s_size, p))
        # Skip perplexities that already have a rendered plot on disk.
        if Path(fname).exists():
            print("EXISTS")
            continue
        print("multicore")
        start = time()
        model = mTSNE(n_components=2, verbose=0, perplexity=p, n_jobs=4)
        embedding = model.fit_transform(m.toarray())
        print("done in %0.3fs." % (time() - start))
        np.save(
            '../tsne_results/data/run_{}_s_{}_p{}.npy'.format(
                run_id, s_size, p), embedding)
        draw_simple(embedding, r_ind, fname)
# Reduce with supervised LDA, embed with multicore t-SNE, cluster with
# MiniBatchKMeans, then score cluster-to-label agreement on the test set.
ld = LDA(n_components=50)
train_pca = ld.fit_transform(x_train, y_train)
# Bug fix: transform the test set with the LDA fitted on train data.
# Refitting with fit_transform(x_test, y_test) leaked test labels and
# produced a projection inconsistent with the training one.
test_pca = ld.transform(x_test)

b_size = 10000

# NOTE(review): t-SNE has no out-of-sample transform, so the test set is
# necessarily embedded with a separate fit; train and test embeddings
# therefore live in unrelated coordinate systems.
tsne = mTSNE(n_components=2, perplexity=40, n_iter=300, n_jobs=-1)
train_tsne = tsne.fit_transform(train_pca)
test_tsne = tsne.fit_transform(test_pca)

kmeans = cluster.MiniBatchKMeans(n_clusters=10, max_iter=300,
                                 batch_size=b_size).fit(train_tsne)
# Bug fix: assign test points to the centroids learned on the training
# embedding. fit_predict would refit on test data and discard that model.
test_predict = kmeans.predict(test_tsne)

# Map each cluster id to the majority true label among its members.
labels = np.zeros_like(test_predict)
for i in range(10):
    mask = (test_predict == i)
    labels[mask] = mode(y_test[mask])[0]

test_acc = metrics.accuracy_score(y_test, labels)
print(test_acc)