def __init__(self, data, k_min=4, clusters=5):
    """Build per-cluster summary models from a samples-by-features matrix.

    Args:
        data (pd.DataFrame): numeric matrix whose columns are the samples.
        k_min (int): number of t-SNE nearest neighbors to pull into each
            cluster (the fit uses k_min + 1 because each point is its own
            nearest neighbor, filtered out below).
        clusters (int): number of agglomerative clusters to form.
    """
    self._data = data
    # Save the sample names, assigning each a unique id.
    snames = pd.DataFrame({'samples': data.columns})
    snames['id'] = [str(uuid.uuid4()) for _ in range(snames.shape[0])]
    self.snames = snames
    # Cluster the sample-correlation matrix.
    cd = data.corr()
    acd = cluster_ordered_agglomerative(cd, clusters)
    # Embed the correlation matrix in 2D with t-SNE so "nearest neighbor"
    # is defined in the embedded space.
    tcd = pd.DataFrame(TSNE(2).fit_transform(cd),
                       columns=['x', 'y'],
                       index=cd.columns)
    nns = pd.DataFrame(
        NearestNeighbors(n_neighbors=k_min + 1).fit(tcd).
        kneighbors_graph(tcd).toarray())
    nns.columns = cd.columns.copy()
    nns.columns.name = 'sample_2'
    nns.index = cd.columns.copy()
    nns.index.name = 'sample_1'
    # Long-format neighbor pairs; drop non-matches and self-pairs.
    nns = nns.unstack().reset_index().rename(columns={0: 'match'})
    nns = nns[nns['match'] != 0]
    nns = nns[nns['sample_1'] != nns['sample_2']]
    # For each cluster's samples, add in each member's nearest neighbors.
    # (renamed from `clusters`, which shadowed the constructor parameter)
    cluster_members = {}
    for cluster_id in acd['cluster_id'].unique():
        cluster_members[cluster_id] = set(
            acd[acd['cluster_id'] == cluster_id].index)
        # Iterate a snapshot so the |= below doesn't extend the iteration.
        for member in list(cluster_members[cluster_id]):
            ns = list(nns.loc[nns['sample_1'] == member, 'sample_2'])
            cluster_members[cluster_id] |= set(ns)
    self.clusters = cluster_members
    # Per-cluster model: summary stats (plus the raw values) for each
    # feature row, computed across the cluster's sample columns.
    models = {}
    for cluster_id in self.clusters:
        models[cluster_id] = data[list(cluster_members[cluster_id])].apply(
            lambda x: pd.Series(OrderedDict(zip(
                ['min', 'max', 'mean', 'std', 'values'],
                [np.min(x), np.max(x), np.mean(x), np.std(x)] + [list(x)]
            ))), axis=1)
    self.models = models
def calculate_TSNE(self, indecies=None, sample=5000, n_jobs=1,
                   multicore=False, **kwargs):
    """ Calculate the TSNE and store it in the h5 object

    Args:
        indecies (list): select a subset based on indecies
        sample (int): number of cells to downsample to if None use them all
        n_jobs (int): number of cpus to use (if multicore is True)
        multicore (bool): use the MultiCoreTSNE package
        **kwargs: pass any other arguments to TSNE

    Raises:
        ValueError: if the h5 object was opened read-only.
    """
    # Guard clause: storing the result requires a writable mode.
    if self.mode not in ('w', 'r+', 'a'):
        raise ValueError('cant write for readonly')
    dsdata = self.fractions.copy()
    if indecies is not None:
        dsdata = dsdata.loc[indecies]
    if sample is not None:
        # Never try to sample more rows than exist.
        if dsdata.shape[0] < sample:
            sample = dsdata.shape[0]
        dsdata = dsdata.sample(n=sample)
    if self.verbose:
        sys.stderr.write("executing TSNE decomposition on " +
                         str(dsdata.shape[0]) + " cells\n")
    if multicore:
        # Optional dependency; only imported when explicitly requested.
        from MulticoreTSNE import MulticoreTSNE as mTSNE
        if self.verbose:
            sys.stderr.write("Using MulticoreTSNE\n")
        tsne = mTSNE(n_jobs=n_jobs, **kwargs).fit_transform(dsdata)
    else:
        if self.verbose:
            sys.stderr.write("Using sklearn TSNE\n")
        tsne = TSNE(**kwargs).fit_transform(dsdata)
    tsne = pd.DataFrame(tsne, columns=['x', 'y'])
    tsne.index = dsdata.index
    # Attach cluster metadata by joining on db_id.
    tsne = tsne.reset_index().merge(
        self.clusters.reset_index()[['cluster_id', 'k', 'db_id'] + self.groupby],
        on=['db_id'])
    tsne['cluster_name'] = tsne['cluster_id'].astype(str)
    self.tsne = tsne
# t-SNE
# Optimize perplexity using the k3n-error metric.
k3n_errors = []
for index, perplexity in enumerate(candidates_of_perplexity):
    print(index + 1, '/', len(candidates_of_perplexity))
    t = TSNE(perplexity=perplexity, n_components=2, init='pca',
             random_state=10).fit_transform(autoscaled_x)
    # Autoscale the embedding (ddof=1: sample standard deviation).
    scaled_t = (t - t.mean(axis=0)) / t.std(axis=0, ddof=1)
    # k3n-error is summed in both directions (X -> Z and Z -> X).
    k3n_errors.append(
        sample_functions.k3n_error(autoscaled_x, scaled_t, k_in_k3n_error) +
        sample_functions.k3n_error(scaled_t, autoscaled_x, k_in_k3n_error))

# Plot k3n-error versus each candidate perplexity.
plt.rcParams['font.size'] = 18
plt.scatter(candidates_of_perplexity, k3n_errors, c='blue')
plt.xlabel("perplexity")
plt.ylabel("k3n-errors")
plt.show()
# np.argmin replaces the fragile list-vs-scalar `==` broadcast followed by
# np.where(...)[0][0]; both pick the first minimum, argmin does it directly.
optimal_perplexity = candidates_of_perplexity[np.argmin(k3n_errors)]
print('\nk3n-error による perplexity の最適値 :', optimal_perplexity)

# t-SNE with the optimal perplexity.
t = TSNE(perplexity=optimal_perplexity, n_components=2, init='pca',
         random_state=10).fit_transform(autoscaled_x)
t = pd.DataFrame(t, index=x.index, columns=['t_1 (t-SNE)', 't_2 (t-SNE)'])
t.to_csv('tsne_t.csv')

# Scatter plot of t_1 versus t_2.
plt.rcParams['font.size'] = 18
plt.scatter(t.iloc[:, 0], t.iloc[:, 1], color='blue')
plt.xlabel('t_1 (t-SNE)')
plt.ylabel('t_2 (t-SNE)')
plt.show()