Example #1
    def __init__(self, data, k_min=4, clusters=5):
        # this snippet relies on: pandas as pd, numpy as np, uuid,
        # collections.OrderedDict, sklearn.manifold.TSNE,
        # sklearn.neighbors.NearestNeighbors
        self._data = data

        # save the sample names and give each one a unique id
        snames = pd.DataFrame({'samples': data.columns})
        snames['id'] = [str(uuid.uuid4()) for x in range(0, snames.shape[0])]
        self.snames = snames

        # cluster the samples on their correlation matrix
        cd = data.corr()
        acd = cluster_ordered_agglomerative(cd, clusters)
        # embed the correlation matrix with t-SNE; each point's k nearest
        # neighbors in this embedding are added to its cluster below
        tcd = TSNE(2).fit_transform(cd)
        tcd = pd.DataFrame(tcd)
        tcd.columns = ['x', 'y']
        tcd.index = cd.columns
        nns = pd.DataFrame(
            NearestNeighbors(n_neighbors=k_min +
                             1).fit(tcd).kneighbors_graph(tcd).toarray())
        nns.columns = cd.columns.copy()
        nns.columns.name = 'sample_2'
        nns.index = cd.columns.copy()
        nns.index.name = 'sample_1'
        # reshape the neighbor matrix to long form, keep only neighbor pairs,
        # and drop self-matches
        nns = nns.unstack().reset_index().rename(columns={0: 'match'})
        nns = nns[nns['match'] != 0]
        nns = nns[nns['sample_1'] != nns['sample_2']]

        ## for each cluster, add in the nearest-neighbor samples of its members
        clusters = {}
        for cluster_id in acd['cluster_id'].unique():
            #print(cluster_id)
            clusters[cluster_id] = set(
                acd[acd['cluster_id'] == cluster_id].index)
            for member in list(clusters[cluster_id]):
                #print(member)
                ## add each point's nearest neighbors
                ns = list(nns.loc[nns['sample_1'] == member, 'sample_2'])
                clusters[cluster_id] |= set(ns)
        self.clusters = clusters
        # summarize each cluster: per-row min/max/mean/std across the
        # cluster's samples, plus the raw values
        models = {}
        for cluster_id in self.clusters:
            models[cluster_id] = data[list(clusters[cluster_id])].\
                apply(lambda x: pd.Series(OrderedDict(zip(
                    ['min', 'max', 'mean', 'std', 'values'],
                    [np.min(x), np.max(x), np.mean(x), np.std(x)] + [list(x)]
                ))), axis=1)
        self.models = models
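
# A minimal usage sketch for the constructor above. `SampleClusterModel` is a
# hypothetical name for the enclosing class, and cluster_ordered_agglomerative
# is assumed to return a DataFrame with a 'cluster_id' column indexed by sample.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
data = pd.DataFrame(rng.normal(size=(100, 50)),
                    columns=['sample_%d' % i for i in range(50)])

model = SampleClusterModel(data, k_min=4, clusters=5)
print(model.clusters)                    # {cluster_id: set of sample names}
first = next(iter(model.models))
print(model.models[first].head())        # per-row min/max/mean/std/values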
# scatter plot of the first and second principal components (samples colored by transition temperature)
plt.scatter(score.iloc[:, 0],
            score.iloc[:, 1],
            c=dataset.iloc[:, 86],
            cmap=plt.get_cmap('jet'))
plt.colorbar()
plt.xlabel('t_1 (PCA)')
plt.ylabel('t_2 (PCA)')
plt.show()
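
# The plot above and the t-SNE block below assume that `dataset`, `score`,
# `autoscaled_x`, and `perplexity` were prepared earlier. A minimal sketch of
# one way to set them up (file name and column positions are assumptions):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

dataset = pd.read_csv('dataset.csv', index_col=0)   # hypothetical file name
x = dataset.iloc[:, :86]                            # descriptors (assumed layout)
autoscaled_x = (x - x.mean()) / x.std()             # autoscaling
score = pd.DataFrame(PCA().fit_transform(autoscaled_x),
                     index=autoscaled_x.index)
perplexity = 30                                     # t-SNE hyperparameter (assumed)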

# t-SNE
# keep every 5th sample
selected_indexes = np.arange(0, autoscaled_x.shape[0], 5)
autoscaled_x = autoscaled_x.iloc[selected_indexes, :]
t = TSNE(perplexity=perplexity, n_components=2, init='pca',
         random_state=10).fit_transform(autoscaled_x)
t = pd.DataFrame(t,
                 index=autoscaled_x.index,
                 columns=['t_1 (t-SNE)', 't_2 (t-SNE)'])
t.to_csv('tsne_t.csv')
# scatter plot of t_1 and t_2 (samples colored by transition temperature)
plt.rcParams['font.size'] = 18
plt.scatter(t.iloc[:, 0],
            t.iloc[:, 1],
            c=dataset.iloc[selected_indexes, 86],
            cmap=plt.get_cmap('jet'))
plt.colorbar()
plt.xlabel('t_1 (t-SNE)')
plt.ylabel('t_2 (t-SNE)')
plt.show()
#train_topics=get_top_topics(nmf, spleen.index, n_top_words)
#print_top_words(nmf, spleen.index, n_top_words)
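
# `topic_matrix` used below is assumed to be the document-topic matrix from an
# NMF fit on the `spleen` table referenced in the commented-out lines above.
# A minimal sketch (the number of topics and the orientation of `spleen` are
# assumptions):
from sklearn.decomposition import NMF

nmf = NMF(n_components=20, random_state=0)
topic_matrix = nmf.fit_transform(spleen)  # may need spleen.T depending on layout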

##-----------------------------------------------------------------------------
##visualization
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import umap

pca = PCA(n_components=2)
embedding = pca.fit_transform(topic_matrix)
embedding = pd.DataFrame(embedding)
embedding.columns = ['PC1', 'PC2']
f = sns.lmplot(x='PC1', y='PC2', data=embedding,
               fit_reg=False, legend=False, scatter_kws={"s": 5})
f.savefig("pca", dpi=300)

embedding = TSNE(n_components=2).fit_transform(topic_matrix)
embedding = pd.DataFrame(embedding)
embedding.columns = ['tSNE1', 'tSNE2']
sns.lmplot(x='tSNE1', y='tSNE2', data=embedding,
           fit_reg=False, legend=False, scatter_kws={"s": 5, "color": "red"})
plt.savefig("tsne",dpi=300)

embedding = umap.UMAP(n_neighbors=5,
                      min_dist=0.3,
                      metric='correlation').fit_transform(topic_matrix)
embedding = pd.DataFrame(embedding)
embedding.columns = ['UMAP1', 'UMAP2']
sns.lmplot(x='UMAP1', y='UMAP2', data=embedding,
           fit_reg=False, legend=False, scatter_kws={"s": 5, "color": "green"})
plt.savefig("UMAP",dpi=300)

##-----------------------------------------------------------------------------
##clustering