Пример #1
0
    def __init__(self, data, k_min=4, clusters=5):
        """Cluster the columns of ``data`` and summarise every row per cluster.

        The columns of ``data`` are treated as samples.  They are clustered
        on their correlation matrix, each cluster is then widened with the
        nearest neighbours (in a 2-D t-SNE embedding of that matrix) of its
        members, and finally a per-row summary table is built for the
        samples of every cluster.

        Args:
            data (pandas.DataFrame): table whose columns are the samples.
            k_min (int): number of neighbours, besides the sample itself,
                requested for every sample.
            clusters (int): forwarded as the second argument of
                ``cluster_ordered_agglomerative``
                (presumably the number of clusters -- confirm in that helper).

        Sets:
            self._data: the input table.
            self.snames: table with the sample names and a random uuid each.
            self.clusters: dict mapping a cluster id to a set of samples.
            self.models: dict mapping a cluster id to a table with the
                columns ``min``, ``max``, ``mean``, ``std`` and ``values``.
        """
        self._data = data

        # Remember the sample names and tag each one with a random identifier.
        sample_table = pd.DataFrame({'samples': data.columns})
        sample_table['id'] = [
            str(uuid.uuid4()) for _ in range(sample_table.shape[0])
        ]
        self.snames = sample_table

        # Cluster the samples on their pairwise correlation.
        # NOTE(review): the helper is expected to return a table indexed by
        # sample with a 'cluster_id' column -- confirm against its definition.
        corr = data.corr()
        assignments = cluster_ordered_agglomerative(corr, clusters)

        # Embed the correlation matrix in two dimensions.
        embedding = pd.DataFrame(TSNE(2).fit_transform(corr))
        embedding.columns = ['x', 'y']
        embedding.index = corr.columns

        # Neighbour graph of the embedding as a dense sample-by-sample table.
        # k_min + 1 neighbours are requested; pairs of a sample with itself
        # are discarded further down.
        knn = NearestNeighbors(n_neighbors=k_min + 1).fit(embedding)
        adjacency = pd.DataFrame(knn.kneighbors_graph(embedding).toarray())
        adjacency.columns = corr.columns.copy()
        adjacency.columns.name = 'sample_2'
        adjacency.index = corr.columns.copy()
        adjacency.index.name = 'sample_1'

        # Long format: one row per (sample_2, sample_1) pair, keeping only
        # actual neighbour links between two different samples.
        edges = adjacency.unstack().reset_index().rename(columns={0: 'match'})
        keep = (edges['match'] != 0) & (edges['sample_1'] != edges['sample_2'])
        edges = edges[keep]

        # Start every cluster from its own samples, then add the neighbours
        # of each of those original samples.
        membership = {}
        for cluster_id in assignments['cluster_id'].unique():
            in_cluster = assignments['cluster_id'] == cluster_id
            members = set(assignments[in_cluster].index)
            # Iterate over a snapshot because the set grows inside the loop.
            for sample in list(members):
                neighbours = list(
                    edges.loc[edges['sample_1'] == sample, 'sample_2'])
                members |= set(neighbours)
            membership[cluster_id] = members
        self.clusters = membership

        stat_names = ['min', 'max', 'mean', 'std', 'values']

        def _row_profile(row):
            # Summary statistics of one row plus the raw values themselves.
            stats = [np.min(row), np.max(row), np.mean(row), np.std(row)]
            return pd.Series(
                OrderedDict(zip(stat_names, stats + [list(row)])))

        # One summary table per cluster, computed row by row (axis=1) over
        # the columns that belong to the cluster.
        self.models = {
            cluster_id: data[list(members)].apply(_row_profile, axis=1)
            for cluster_id, members in membership.items()
        }
Пример #2
0
    def calculate_TSNE(self,
                       indecies=None,
                       sample=5000,
                       n_jobs=1,
                       multicore=False,
                       **kwargs):
        """
        Compute a 2-D t-SNE embedding of ``self.fractions`` and keep it in
        ``self.tsne``.

        The embedding is joined on ``db_id`` with the cluster assignments
        from ``self.clusters`` and gains a string ``cluster_name`` column.

        Args:
            indecies (list): row labels to restrict the input to; None
                keeps every row
            sample (int): maximum number of rows to draw at random; None
                uses all rows without resampling
            n_jobs (int): number of cpus handed to MulticoreTSNE (only used
                when multicore is True)
            multicore (bool): use the MulticoreTSNE package instead of the
                sklearn TSNE
            **kwargs: forwarded to the TSNE constructor

        Raises:
            ValueError: if ``self.mode`` is not one of 'w', 'r+' or 'a'
        """
        if self.mode not in ('w', 'r+', 'a'):
            raise ValueError('cant write for readonly')

        # Work on a copy so that subsetting never touches self.fractions.
        cells = self.fractions.copy()
        if indecies is not None:
            cells = cells.loc[indecies]
        if sample is not None:
            # Never ask for more rows than are available.
            cells = cells.sample(n=min(sample, cells.shape[0]))

        if self.verbose:
            sys.stderr.write(
                f"executing TSNE decomposition on {cells.shape[0]} cells\n")

        if multicore:
            # Optional dependency: imported only when it is requested.
            from MulticoreTSNE import MulticoreTSNE as mTSNE
            if self.verbose:
                sys.stderr.write("Using MulticoreTSNE\n")
            embedding = mTSNE(n_jobs=n_jobs, **kwargs).fit_transform(cells)
        else:
            if self.verbose:
                sys.stderr.write("Using sklearn TSNE\n")
            embedding = TSNE(**kwargs).fit_transform(cells)

        coords = pd.DataFrame(embedding, columns=['x', 'y'])
        coords.index = cells.index

        # Attach the cluster annotation of every row through its db_id.
        # NOTE(review): relies on reset_index() exposing a 'db_id' column on
        # both tables -- confirm the index names of fractions and clusters.
        flat_coords = coords.reset_index()
        wanted = ['cluster_id', 'k', 'db_id'] + self.groupby
        annotations = self.clusters.reset_index()[wanted]
        merged = flat_coords.merge(annotations, on=['db_id'])
        merged['cluster_name'] = merged['cluster_id'].astype(str)
        self.tsne = merged
Пример #3
0
# t-SNE
# Optimise the perplexity with the k3n-error.
k3n_errors = []
for index, perplexity in enumerate(candidates_of_perplexity):
    # Progress indicator: current candidate / number of candidates.
    print(index + 1, '/', len(candidates_of_perplexity))

    model = TSNE(perplexity=perplexity, n_components=2, init='pca', random_state=10)
    t = model.fit_transform(autoscaled_x)

    # Autoscale the embedding (zero mean, unit sample standard deviation per
    # axis) before it is compared with autoscaled_x.
    t_center = t.mean(axis=0)
    t_spread = t.std(axis=0, ddof=1)
    scaled_t = (t - t_center) / t_spread

    # The k3n-error is evaluated in both directions and the two are summed.
    error_x_to_t = sample_functions.k3n_error(autoscaled_x, scaled_t, k_in_k3n_error)
    error_t_to_x = sample_functions.k3n_error(scaled_t, autoscaled_x, k_in_k3n_error)
    k3n_errors.append(error_x_to_t + error_t_to_x)

# Summed k3n-error for every perplexity candidate.
plt.rcParams['font.size'] = 18
plt.scatter(candidates_of_perplexity, k3n_errors, c='blue')
plt.xlabel("perplexity")
plt.ylabel("k3n-errors")
plt.show()

# The first candidate that reaches the smallest summed error wins.
is_minimum = np.asarray(k3n_errors) == np.min(k3n_errors)
optimal_perplexity = candidates_of_perplexity[np.flatnonzero(is_minimum)[0]]
print('\nk3n-error による perplexity の最適値 :', optimal_perplexity)

# t-SNE with the selected perplexity
t = TSNE(perplexity=optimal_perplexity, n_components=2, init='pca', random_state=10).fit_transform(autoscaled_x)
t = pd.DataFrame(t, index=x.index, columns=['t_1 (t-SNE)', 't_2 (t-SNE)'])
t.to_csv('tsne_t.csv')

# Scatter plot of t_1 against t_2 (every sample is drawn in blue).
plt.rcParams['font.size'] = 18
plt.scatter(t['t_1 (t-SNE)'], t['t_2 (t-SNE)'], color='blue')
plt.xlabel('t_1 (t-SNE)')
plt.ylabel('t_2 (t-SNE)')
plt.show()