def analyze_zN(z, outdir, vg, skip_umap=False):
    """Run PCA, k-means and (optionally) UMAP on ``z`` and write the results.

    Volumes are generated along the first two principal components and at the
    k-means cluster centers; labels, centers and summary plots are written
    under ``outdir``.

    Args:
        z: 2-D array-like of latent encodings; ``z.shape[1]`` is taken as the
            latent dimension.
        outdir: Directory that receives all outputs.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        skip_umap: If true, the UMAP embedding and its plots are skipped.
            UMAP is also skipped whenever the latent dimension is <= 2.
    """
    zdim = z.shape[1]
    # Single source of truth for "was a UMAP embedding computed?" so that the
    # plotting code below can never touch an undefined ``umap_emb``.
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)
    # Traverse each PC between the 5th and 95th percentile of the data.
    start, end = np.percentile(pc[:, 0], (5, 95))
    z_pc1 = analysis.get_pc_traj(pca, zdim, 10, 1, start, end)
    start, end = np.percentile(pc[:, 1], (5, 95))
    z_pc2 = analysis.get_pc_traj(pca, zdim, 10, 2, start, end)

    # kmeans clustering
    log('K-means clustering...')
    K = 20
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    kmeans_dir = f'{outdir}/kmeans20'
    # exist_ok avoids the check-then-create race and also creates a missing
    # parent directory instead of failing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    # Generate volumes
    log('Generating volumes...')
    vg.gen_volumes(f'{outdir}/pc1', z_pc1)
    vg.gen_volumes(f'{outdir}/pc2', z_pc2)
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    plt.figure(1)
    plt.scatter(pc[:, 0], pc[:, 1], alpha=.1, s=2)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{outdir}/z_pca.png')

    if do_umap:
        plt.figure(2)
        plt.scatter(umap_emb[:, 0], umap_emb[:, 1], alpha=.1, s=2)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{outdir}/umap.png')

    analysis.plot_by_cluster(pc[:, 0], pc[:, 1], K, kmeans_labels,
                             centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    if do_umap:
        analysis.plot_by_cluster(umap_emb[:, 0], umap_emb[:, 1], K, kmeans_labels,
                                 centers_ind=centers_ind, annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')
def generate_volumes(z, outdir, vg, K):
    """Pick ``K`` representative points of ``z`` via k-means and generate volumes.

    The cluster labels, the selected center coordinates and their indices
    into ``z`` are written to ``{outdir}/kmeans{K}``, and ``vg`` is asked to
    generate one volume per center into the same directory.

    Args:
        z: Array-like of latent encodings to cluster.
        outdir: Parent output directory.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        K: Number of k-means clusters (and therefore volumes).
    """
    # kmeans clustering
    log('Sketching distribution...')
    kmeans_labels, centers = analysis.cluster_kmeans(z, K, on_data=True, reorder=True)
    centers, centers_ind = analysis.get_nearest_point(z, centers)

    kmeans_dir = f'{outdir}/kmeans{K}'
    # exist_ok avoids the check-then-create race and also creates a missing
    # parent directory instead of failing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)
def main(args):
    """Cluster pickled embeddings with k-means and plot the clusters in PCA space.

    Args:
        args: Namespace-like object (presumably from argparse -- confirm
            against the caller). Attributes read here: ``input`` (path to a
            pickle holding the embeddings), ``stride``, ``k``, ``on_data``,
            ``reorder``, ``o`` (labels pickle), ``out_k`` (centers text file),
            ``out_k_ind`` (center indices text file) and ``out_png``.
    """
    fig, ax = plt.subplots()
    print(args)
    # NOTE: unpickling can execute arbitrary code -- only pass trusted files.
    # The context manager closes the handle the original left open.
    with open(args.input, 'rb') as f:
        z = pickle.load(f)
    if args.stride:
        z = z[::args.stride]
    print('{} points'.format(len(z)))

    # k-means clustering
    labels, centers = analysis.cluster_kmeans(z, args.k, on_data=args.on_data, reorder=args.reorder)

    # use the nearest data point instead of cluster centroid
    if args.on_data:
        centers_zi = cdist(centers, z).argmin(axis=1)
        print(centers_zi)
        centers = z[centers_zi]
        # Indices only exist in on_data mode, so the save lives inside this
        # branch; outside it ``centers_zi`` would be undefined.
        if args.out_k_ind:
            np.savetxt(args.out_k_ind, centers_zi, fmt='%d')

    if args.o:
        with open(args.o, 'wb') as f:
            pickle.dump(labels, f)
    if args.out_k:
        np.savetxt(args.out_k, centers)

    # dimensionality reduction for viz
    pca = PCA(z.shape[1])
    pca.fit(z)
    print('PCA explained variance ratio:')
    print(pca.explained_variance_ratio_)
    pc = pca.transform(z)

    # One scatter layer per cluster so each gets its own colour.
    for i in range(args.k):
        ii = np.where(labels == i)
        pc_sub = pc[ii]
        plt.scatter(pc_sub[:, 0], pc_sub[:, 1], s=2, alpha=0.1, label='cluster {}'.format(i))

    # Overlay the cluster centers in black and number them.
    c = pca.transform(centers)
    plt.scatter(c[:, 0], c[:, 1], c='k')
    for i in range(args.k):
        ax.annotate(str(i), c[i, 0:2])

    xx, yy = 0, 1
    # NOTE(review): '{:3f}' is a minimum width of 3, not 3 decimals ('.3f');
    # left unchanged to keep the axis labels identical -- confirm intent.
    plt.xlabel('PC{} ({:3f})'.format(xx + 1, pca.explained_variance_ratio_[xx]))
    plt.ylabel('PC{} ({:3f})'.format(yy + 1, pca.explained_variance_ratio_[yy]))

    if args.out_png:
        plt.savefig(args.out_png)
    else:
        plt.show()
def analyze_zN(z, outdir, vg, skip_umap=False, num_pcs=2, num_ksamples=20):
    """Run PCA, k-means and (optionally) UMAP on ``z`` and write the results.

    Volumes are generated along the first ``num_pcs`` principal components
    and at ``num_ksamples`` k-means cluster centers; labels, centers and
    summary plots are written under ``outdir``.

    Args:
        z: 2-D array-like of latent encodings; ``z.shape[1]`` is taken as the
            latent dimension.
        outdir: Directory that receives all outputs.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        skip_umap: If true, the UMAP embedding and its plots are skipped.
            UMAP is also skipped whenever the latent dimension is <= 2.
        num_pcs: Number of principal components to traverse and plot.
        num_ksamples: Number of k-means clusters.
    """
    zdim = z.shape[1]
    # Single source of truth for "was a UMAP embedding computed?". The final
    # per-PC loop used to test only ``not skip_umap`` and so raised NameError
    # on ``umap_emb`` when zdim <= 2.
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)

    log('Generating volumes...')
    for i in range(num_pcs):
        # Traverse each PC between the 5th and 95th percentile of the data.
        start, end = np.percentile(pc[:, i], (5, 95))
        z_pc = analysis.get_pc_traj(pca, zdim, 10, i + 1, start, end)
        vg.gen_volumes(f'{outdir}/pc{i+1}', z_pc)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    kmeans_dir = f'{outdir}/kmeans{K}'
    # exist_ok avoids the check-then-create race and also creates a missing
    # parent directory instead of failing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')
    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    # NOTE(review): sns.jointplot appears to create its own figure, which
    # would leave these numbered figures empty -- kept as in the original.
    plt.figure(1)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], alpha=.1, s=2)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca.png')

    plt.figure(2)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], kind='hex')
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca_hexbin.png')

    if do_umap:
        plt.figure(3)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], alpha=.1, s=2)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap.png')

        plt.figure(4)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], kind='hex')
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap_hexbin.png')

    # k-means centers annotated on the PCA projection.
    analysis.scatter_annotate(pc[:, 0], pc[:, 1], centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    g = analysis.scatter_annotate_hex(pc[:, 0], pc[:, 1], centers_ind=centers_ind, annotate=True)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{kmeans_dir}/z_pca_hex.png')

    if do_umap:
        # k-means centers annotated on the UMAP embedding.
        analysis.scatter_annotate(umap_emb[:, 0], umap_emb[:, 1], centers_ind=centers_ind, annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')

        g = analysis.scatter_annotate_hex(umap_emb[:, 0], umap_emb[:, 1], centers_ind=centers_ind, annotate=True)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{kmeans_dir}/umap_hex.png')

        # UMAP embedding coloured by each principal component. The pc{i}
        # directories are not created here; presumably vg.gen_volumes made
        # them above -- verify.
        for i in range(num_pcs):
            analysis.scatter_color(umap_emb[:, 0], umap_emb[:, 1], pc[:, i], label=f'PC{i+1}')
            plt.xlabel('UMAP1')
            plt.ylabel('UMAP2')
            plt.tight_layout()
            plt.savefig(f'{outdir}/pc{i+1}/umap.png')