Exemplo n.º 1
0
    def calculate_TSNE(self,
                       indecies=None,
                       sample=5000,
                       n_jobs=1,
                       multicore=False,
                       **kwargs):
        """
        Run a t-SNE decomposition of ``self.fractions`` and keep it on ``self.tsne``.

        The resulting table has the columns ``x`` and ``y`` from the
        decomposition, the cluster columns merged in from ``self.clusters``
        and a string-typed ``cluster_name`` column.

        Args:
            indecies (list): restrict the input rows to these index labels
            sample (int): maximum number of rows to randomly keep; None keeps all
            n_jobs (int): number of cpus, only used when multicore is True
            multicore (bool): use the MulticoreTSNE package instead of sklearn
            **kwargs: forwarded unchanged to the TSNE constructor

        Raises:
            ValueError: when ``self.mode`` is not one of 'w', 'r+' or 'a'
        """
        writable_modes = ['w', 'r+', 'a']
        if self.mode not in writable_modes:
            raise ValueError('cant write for readonly')

        cells = self.fractions.copy()
        if indecies is not None:
            cells = cells.loc[indecies]
        if sample is not None:
            # Clamp the request so we never ask for more rows than exist.
            sample = min(sample, cells.shape[0])
            cells = cells.sample(n=sample)

        if self.verbose:
            sys.stderr.write("executing TSNE decomposition on " +
                             str(cells.shape[0]) + " cells\n")

        if multicore:
            # Imported lazily so the package is only needed when requested.
            from MulticoreTSNE import MulticoreTSNE as mTSNE
            if self.verbose:
                sys.stderr.write("Using MulticoreTSNE\n")
            reducer = mTSNE(n_jobs=n_jobs, **kwargs)
        else:
            if self.verbose:
                sys.stderr.write("Using sklearn TSNE\n")
            reducer = TSNE(**kwargs)
        coords = reducer.fit_transform(cells)

        embedding = pd.DataFrame(coords, columns=['x', 'y'])
        embedding.index = cells.index
        # NOTE(review): the merge key implies the index resets into a
        # 'db_id' column -- confirm against how self.fractions is built.
        flat = embedding.reset_index()
        wanted = ['cluster_id', 'k', 'db_id'] + self.groupby
        cluster_info = self.clusters.reset_index()[wanted]
        merged = flat.merge(cluster_info, on=['db_id'])
        merged['cluster_name'] = merged['cluster_id'].astype(str)
        self.tsne = merged
Exemplo n.º 2
0
    def calculate_tsne(self, path, p, s_size=0, force_overwrite=False,
                       n_jobs=4):
        """
        Function applied to RunStats object to calculate dimensionality reduction using TSNE

        Results are cached as a ``.npy`` file under ``path``; the file name
        is built from ``self.pk``, ``s_size`` and ``p``. An existing file is
        returned as-is unless ``force_overwrite`` is set.

        :param path: Results path (directory holding the cached ``.npy`` file);
            also passed to ``self.dt_matrix``
        :param p: Perplexity passed to the TSNE estimator
        :param s_size: (default: 0) Passed to ``self.dt_matrix`` and used in
            the cache file name
        :param force_overwrite: (default: False) Overrides already existing results
        :param n_jobs: (default: 4) Number of workers passed to the TSNE estimator
        :return: Tuple ``(tsne_results, r_ind)`` where ``r_ind`` is the third
            value returned by ``self.dt_matrix``
        """
        # r_ind is part of the return value on both paths, so the matrix
        # lookup has to happen before the cache check.
        m, _, r_ind = self.dt_matrix(path, s_size)
        results_path = f"{path}/run_{self.pk}_s_{s_size}_p_{p}_results.npy"

        # Only read the cache when it is going to be returned. Loading it
        # before an overwrite is wasted I/O and would fail on a damaged file
        # that the caller is explicitly trying to replace.
        if not force_overwrite and os.path.exists(results_path):
            tsne_results = np.load(results_path)
            print("We've already calculated the tsne positions")
            return tsne_results, r_ind

        tsne = mTSNE(n_components=2, verbose=0, perplexity=p, n_jobs=n_jobs)
        # NOTE(review): m must expose .toarray() (e.g. a scipy sparse
        # matrix) -- confirm against dt_matrix.
        tsne_results = tsne.fit_transform(m.toarray())
        np.save(results_path, tsne_results)
        return tsne_results, r_ind
Exemplo n.º 3
0
def run_tsne(data,
             return_tsne=True,
             tsne_plot=True,
             n_jobs=1,
             n_components=2,
             perplexity=40,
             learning_rate=200):
    """
    Embed ``data.obsm['embedded']`` with t-SNE and store it in ``data.obsm['tsne']``.

    Args:
        data: AnnData object that already carries an 'embedded' matrix
        return_tsne (bool): return the embedding when True, otherwise None
        tsne_plot (bool): draw a scatter plot of the first two components,
            coloured by ``data.obs['ident']``
        n_jobs (int): forwarded to the t-SNE estimator
        n_components (int): forwarded to the t-SNE estimator
        perplexity: forwarded to the t-SNE estimator
        learning_rate: forwarded to the t-SNE estimator

    Returns:
        The embedding returned by ``fit_transform`` or None.

    Raises:
        AssertionError: when ``data`` is not an AnnData or has no 'embedded'
            attribute on ``data.obsm``
    """
    assert isinstance(data, AnnData), \
        'Required input should be an AnnData object from desc'
    assert hasattr(data.obsm, 'embedded'), \
        'The embedded matrix not found, run desc first'

    reducer = mTSNE(n_jobs=n_jobs,
                    n_components=n_components,
                    perplexity=perplexity,
                    learning_rate=learning_rate)
    coords = reducer.fit_transform(data.obsm['embedded'])
    data.obsm['tsne'] = coords

    if tsne_plot:
        first, second = coords[:, 0], coords[:, 1]
        plt.scatter(first, second, s=0.1, c=data.obs['ident'])
        plt.title("tSNE plot")
        plt.xlabel("tSNE_1")
        plt.ylabel("tSNE_2")

    if return_tsne:
        return coords
    return None
Exemplo n.º 4
0
    # For every sample size: dump the matrix and its row index, then run
    # t-SNE once per perplexity and save both the coordinates and a plot.
    for s_size in [0]:
        print(s_size)
        m, c_ind, r_ind = get_matrix(run_id, s_size)
        print("got m")

        matrix_file = '../tsne_results/data/run_{}_s_{}_m.npy'.format(
            run_id, s_size)
        np.save(matrix_file, m)
        row_index_file = '../tsne_results/data/run_{}_s_{}_r_ind.npy'.format(
            run_id, s_size)
        np.save(row_index_file, r_ind)

        # Alternative perplexity grid kept for reference: 40, 50, 60, 70, 90, 150
        for p in [20, 50, 100, 200]:
            fname = "../tsne_results/plots/run_{}_s_{}_p_{}.png".format(
                run_id, s_size, p)
            banner = "Doing tsne with run {}, on {} docs, with {} perplexity"
            print(banner.format(run_id, s_size, p))

            # An existing plot means this combination was already handled.
            if Path(fname).exists():
                print("EXISTS")
                continue

            print("multicore")
            started = time()
            tsne = mTSNE(n_components=2, verbose=0, perplexity=p, n_jobs=4)
            tsne_results = tsne.fit_transform(m.toarray())
            elapsed = time() - started
            print("done in %0.3fs." % elapsed)

            coords_file = '../tsne_results/data/run_{}_s_{}_p{}.npy'.format(
                run_id, s_size, p)
            np.save(coords_file, tsne_results)
            draw_simple(tsne_results, r_ind, fname)
Exemplo n.º 5
0
# test_pca = pca.fit_transform(x_test)

# Supervised projection. The projection is learned from the training split
# only and then applied to the test split: re-fitting on (x_test, y_test)
# would let the test labels shape the very features being evaluated.
# NOTE(review): if LDA is scikit-learn's LinearDiscriminantAnalysis,
# n_components is limited to min(n_classes - 1, n_features) -- confirm that
# 50 is valid for this label set.
ld = LDA(n_components=50)
train_pca = ld.fit_transform(x_train, y_train)
test_pca = ld.transform(x_test)

b_size = 10000
# inc_pca = decomp.IncrementalPCA(n_components=50, copy=False, batch_size=b_size)
# train_pca = inc_pca.fit_transform(x_train)
# test_pca = inc_pca.fit_transform(x_test)

# tsne = TSNE(n_components=2, perplexity=40, n_iter=300, random_state=42)
# train_tsne = tsne.fit_transform(train_pca)
# test_tsne = tsne.fit_transform(test_pca)

# Each split gets its own 2-D embedding from a separate fit_transform call.
tsne = mTSNE(n_components=2, perplexity=40, n_iter=300, n_jobs=-1)
train_tsne = tsne.fit_transform(train_pca)
test_tsne = tsne.fit_transform(test_pca)

# kmeans = cluster.KMeans(n_clusters=10, n_jobs=-1, algorithm="full")
# kmeans.fit(x_train)

kmeans = cluster.MiniBatchKMeans(n_clusters=10, max_iter=300, batch_size=b_size).fit(train_tsne)

# fit_predict re-fits the clusterer on the test embedding, so the fit on
# train_tsne above does not influence these assignments.
# NOTE(review): the two embeddings come from independent t-SNE fits, so the
# re-fit may be intentional -- confirm before switching to predict().
test_predict = kmeans.fit_predict(test_tsne)

# Give every cluster the most frequent true label among its members, then
# score the relabelled clusters against the ground truth.
labels = np.zeros_like(test_predict)
for i in range(10):
    mask = (test_predict == i)
    labels[mask] = mode(y_test[mask])[0]
test_acc = metrics.accuracy_score(y_test, labels)
print(test_acc)