def hc_plot_dendrogram(data, row_colours, mad=None, n_ftr=3000, metric='correlation', **kwargs):
    """
    Plot a single dendrogram showing the result of hierarchical clustering of the data, using only the first
    n_ftr features taken in MAD order.
    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional precomputed MAD values whose index labels select rows of data. It is used as-is (the
    first n_ftr index labels are taken), so it should already be sorted in descending order. If None, it is
    computed from data and sorted in descending order here.
    :param n_ftr: The number of features (rows) to retain
    :param metric: Passed to the dendrogram routine
    :param kwargs: Passed to the dendrogram routine. If fig_kws is not supplied, a figsize of (5.5, 10) is used.
    :return: The output of clustering.dendrogram_with_colours
    """
    # only supply a default figure size when the caller has not chosen one
    kwargs.setdefault('fig_kws', {'figsize': (5.5, 10)})

    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)

    the_dat = data.loc[mad.index[:n_ftr]]
    return clustering.dendrogram_with_colours(
        the_dat,
        row_colours,
        vertical=False,
        metric=metric,
        **kwargs
    )
def hc_plot_dendrogram_vary_n_gene(data, row_colours, mad=None, n_ftr=(1000, 2000, 3000, 5000, 10000),
                                   metric='correlation', **kwargs):
    """
    For each value in n_ftr, plot a dendrogram showing the result of hierarchical clustering of the data using
    that many genes (selected in descending MAD order)
    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional precomputed MAD values whose index labels select rows of data. It is used as-is, so it
    should already be sorted in descending order. If None, it is computed from data and sorted here.
    :param n_ftr: Iterable of the numbers of genes to test
    :param metric: Passed to the dendrogram routine
    :param kwargs: Passed to the dendrogram routine on every call. If fig_kws is not supplied, a figsize of
    (5.5, 10) is used.
    :return: Dictionary keyed by the entries of n_ftr; each value is the output of
    clustering.dendrogram_with_colours for that number of genes
    """
    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)

    fig_kws = kwargs.pop('fig_kws', {'figsize': (5.5, 10)})

    fig_dict = {}
    for ng in n_ftr:
        the_dat = data.loc[mad.index[:ng]]
        fig_dict[ng] = clustering.dendrogram_with_colours(
            the_dat,
            row_colours,
            # fresh copy per call so one plot cannot affect the settings of the next
            fig_kws=dict(fig_kws),
            vertical=False,
            metric=metric,
            **kwargs
        )
    return fig_dict
def plot_dendrogram(obj_arr, n_by_mad=None, qn_method=None, eps=0.01, min_val=1, n_above_min=3, vertical=False,
                    figsize=(7, 8), **kwargs):
    """
    Combine one or more loader objects, filter them, log2-transform the data and plot a coloured dendrogram.
    :param obj_arr: Sequence of loader objects. More than one entry is wrapped in loader.MultipleBatchLoader;
    otherwise the first entry is used directly.
    :param n_by_mad: If not None, keep only the first n_by_mad rows in descending MAD order
    :param qn_method: If not None, apply transformations.quantile_normalisation with this method
    :param eps: Offset added to the data before taking log2
    :param min_val: Passed to filter_loader
    :param n_above_min: Passed to filter_loader
    :param vertical: Passed to the dendrogram routine
    :param figsize: Used as the figsize entry of fig_kws in the dendrogram routine
    :param kwargs: Passed to the dendrogram routine
    :return: The output of clustering.dendrogram_with_colours
    """
    combined = obj_arr[0] if len(obj_arr) <= 1 else loader.MultipleBatchLoader(obj_arr)
    combined = filter_loader(combined, min_val=min_val, n_above_min=n_above_min)

    log_dat = np.log2(combined.data + eps)

    if qn_method is not None:
        log_dat = transformations.quantile_normalisation(log_dat, method=qn_method)

    if n_by_mad is not None:
        mad_sorted = transformations.median_absolute_deviation(log_dat).sort_values(ascending=False)
        log_dat = log_dat.loc[mad_sorted.index[:n_by_mad]]

    # the second returned item is not needed here
    colours, _, legend = construct_colour_array_legend_studies(combined.meta)

    return clustering.dendrogram_with_colours(
        log_dat,
        colours,
        vertical=vertical,
        legend_labels=legend,
        fig_kws={'figsize': figsize},
        **kwargs
    )
ax.figure.savefig(
    os.path.join(outdir, "pca_top%d_by_mad_with_names.png" % n_t),
    dpi=200
)

# Sample colours: gray unless the sample name matches one of the patterns below.
# Patterns are applied in order, so a later match overwrites an earlier one.
row_colours = pd.DataFrame('gray', index=our_dat.columns, columns=[''])
for _pattern, _colour in (
    (r'eNSC[0-9]med', '#66c2a5'),
    (r'eNSC[0-9]mouse', '#fc8d62'),
    (r'mDura.[AN0-9]*mouse', '#8da0cb'),
    (r'mDura.[AN0-9]*human', '#e78ac3'),
):
    row_colours.loc[row_colours.index.str.contains(_pattern)] = _colour

# One dendrogram and one clustermap per candidate gene count, each using the
# first n_t entries of the mad index.
for n_t in n_gene_try:
    fname = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

    d = clustering.dendrogram_with_colours(
        our_dat.loc[mad.index[:n_t]],
        row_colours,
        fig_kws={'figsize': (10, 5.5)}
    )
    d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)

    cm = clustering.plot_clustermap(
        our_dat.loc[mad.index[:n_t]],
        cmap='RdBu_r',
        metric='correlation',
        col_colors=row_colours
    )
    cm.gs.update(bottom=0.2)
    cm.savefig(
        os.path.join(outdir, "clustermap_by_gene_corr_log_top%d_by_mad.png" % n_t),
        dpi=200
    )
'Cell type'] = '#f6ffaa' # pale yellow row_colours_all.loc[row_colours_all.index.str.contains(r'ESC'), 'Cell type'] = '#8b33dd' # pale purple row_colours_all.loc[row_colours_all.index.str.contains(r'[iI]PSC'), 'Cell type'] = '#8b33dd' # pale purple row_colours_all.loc[row_colours_all.index.str.contains(r'[Mm]icroglia'), 'Cell type'] = '#ffd8af' # pale orange row_colours_all.loc[row_colours_all.index.str.contains(r'Yanez'), 'Cell type'] = '#ffa03d' # orange # these override previously-defined colours row_colours_all.loc[row_colours_all.index.str.contains(r'eNSC[0-9]med'), 'Cell type'] = '#96ff9d' # pale green row_colours_all.loc[row_colours_all.index.str.contains(r'eNSC[0-9]mouse'), 'Cell type'] = '#008408' # dark green row_colours_all.loc[ row_colours_all.index.str.contains(r'mDura.[AN0-9]*mouse'), 'Cell type'] = '#3543ff' # dark blue row_colours_all.loc[ row_colours_all.index.str.contains(r'mDura.[AN0-9]*human'), 'Cell type'] = '#c4c8ff' # pale blue d = clustering.dendrogram_with_colours(dat.loc[mad.index[:3000]], row_colours_all, fig_kws={'figsize': (5.5, 10)}, vertical=False) # cm = clustering.plot_correlation_clustermap( # dat.loc[mad.index[:3000]], # row_colors=row_colours_all, # )
fig.savefig(os.path.join(outdir, "filtering_effect.pdf"))


def _mark_nclust_threshold(dend, n_clust=3):
    """
    Draw a dashed gray line on the dendrogram axis at the distance returned by
    clustering.dendrogram_threshold_by_nclust for n_clust clusters.
    :param dend: Output dictionary of clustering.dendrogram_with_colours
    :param n_clust: Number of clusters passed to the threshold routine
    :return: The threshold distance
    """
    threshold = clustering.dendrogram_threshold_by_nclust(dend['linkage'], n_clust)
    dend['dendrogram_ax'].axvline(threshold, ls='--', c='gray')
    return threshold


# keep rows flagged in gene_idx that also exceed a value of 10 in more than 10 columns
idx = gene_idx & ((obj.data > 10).sum(axis=1) > 10)
data = obj.data.loc[idx]
mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)
logdata = np.log(data + 1)

# start with a dendrogram
col_colours = clustering.generate_colour_map_dict(
    obj.meta,
    'sample',
    matches,
    label='Patient',
    non_matching='gray'
)

out = clustering.dendrogram_with_colours(logdata, col_colours=col_colours, vertical=False)
dist = _mark_nclust_threshold(out)
out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.png"), dpi=200)
out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.pdf"))

# repeat but now only use N genes (by MAD)
# tested and the result is unchanged for most values in the region [500, 5000]
n_gene = 1500
out = clustering.dendrogram_with_colours(
    logdata.loc[mad.index[:n_gene]],
    col_colours=col_colours,
    vertical=False
)
dist = _mark_nclust_threshold(out)
r'Fibroblast')] = '#fff89e' row_colours_all.loc[row_colours_all.index.str.contains( r'Fetal')] = 'yellow' row_colours_all.loc[row_colours_all.index.str.contains('ES1')] = '#ff7777' row_colours_all.loc[row_colours_all.index.str.contains('PSC')] = '#ff7777' row_colours_all.loc[row_colours_all.index.str.contains( r'DURA[0-9]*_NSC')] = '#7fc97f' # green row_colours_all.loc[row_colours_all.index.str.contains( r'DURA[0-9]*_IPSC')] = '#fdc086' # orange n_gene = 3000 fname = "hier_clust_by_gene_log_corr_top%d_by_mad.{ext}" % n_gene cm, mad_all = cluster_logdata_with_threshold(abg, n=n_gene, eps=eps, col_colors=row_colours_all) cm.gs.update(bottom=0.3) cm.savefig(os.path.join(outdir, fname.format(ext='png')), dpi=300) cm.savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200) fname = "hier_clust_dendrogram_log_corr_top%d_by_mad.{ext}" % n_gene d = clustering.dendrogram_with_colours( abg_log.loc[amad_log.index[:n_gene]], row_colours_all, fig_kws={'figsize': (5.5, 10)}, vertical=False) d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=300) d['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200)
vmin=-10, vmax=10) clustering.add_legend(leg_dict, cm.ax_heatmap, loc='right') cm.gs.update(bottom=0.25, right=0.85, left=0.03) cm.savefig(os.path.join( outdir, "nogbm_clustermap_M_corr_linkage%d_heatmap%d.png" % (clust_n_ftr, n_probe_to_show)), dpi=200) cm.savefig(os.path.join( outdir, "nogbm_clustermap_M_corr_linkage%d_heatmap%d.tiff" % (clust_n_ftr, n_probe_to_show)), dpi=200) ## our samples only, all probes, beta values idx = ((bdat.columns.str.contains('GBM')) | (bdat.columns.str.contains('DURA'))) bdat_ours = bdat.loc[:, idx] bmad_ours = transformations.median_absolute_deviation(bdat_ours) row_colours_ours = row_colours_all.loc[bdat_ours.columns] x = clustering.dendrogram_with_colours(bdat_ours, row_colours_ours, fig_kws={'figsize': (5.5, 10)}, vertical=False, metric='correlation') fname = "ours_dendrogram_B_corr_all.{ext}" x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200) x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)
# normalised version with counts / sum(counts) datan = data.divide(data.sum(axis=0), axis=1) contains_arr = ['MES', re.compile(r'RTK_I$'), 'RTK_II'] col_colours, legend_labels = clustering.generate_colour_map_dict( meta, 'subgroup', contains_arr, label='group', sample_names=data.columns, non_matching='gray', group_names=['Mesenchymal', 'RTK I', 'RTK II']) clustering.dendrogram_with_colours(data, col_colours, legend_labels=legend_labels, metric='euclidean', method='average') clustering.dendrogram_with_colours(data, col_colours, legend_labels=legend_labels, metric='euclidean', method='single') clustering.dendrogram_with_colours(data, col_colours, legend_labels=legend_labels, metric='correlation', method='average') clustering.dendrogram_with_colours(data, col_colours, legend_labels=legend_labels,
# Sample colours: gray unless the sample name matches one of the patterns below.
# Patterns are applied in order, so a later match overwrites an earlier one.
row_colours = pd.DataFrame('gray', index=our_dat.columns, columns=[''])
for _pattern, _colour in (
    (r'eNSC[0-9]med', '#66c2a5'),
    (r'eNSC[0-9]mouse', '#fc8d62'),
    (r'mDura.[AN0-9]*mouse', '#8da0cb'),
    (r'mDura.[AN0-9]*human', '#e78ac3'),
):
    row_colours.loc[row_colours.index.str.contains(_pattern)] = _colour

# One dendrogram and one clustermap per candidate gene count.
for n_t in n_gene_try:
    fname = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

    d = clustering.dendrogram_with_colours(
        our_dat.loc[mad.index[:n_t]],
        row_colours,
        fig_kws={'figsize': (10, 5.5)}
    )
    d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)

    # the second returned item is not needed here
    cm, _ = cluster_data_with_threshold(our_dat, n=n_t, mad=mad, col_colors=row_colours)
    cm.savefig(
        os.path.join(outdir, "clustermap_by_gene_corr_log_top%d_by_mad.png" % n_t),
        dpi=200
    )

# deliberate stop: nothing below this point runs until the refactor is finished
raise Exception("TODO: complete the script refactor")

# bring in reference data
# normalise by read count cpm = (data + 1).divide((data + 1).sum(axis=0), axis=1) * 1e6 # transform log_data = np.log2(cpm) vst_data = variance_stabilizing_transform(cpm) mad_log_srt = median_absolute_deviation(log_data).sort_values( ascending=False) mad_vst_srt = median_absolute_deviation(vst_data).sort_values( ascending=False) for NGENE in [500, 1000, 1500, 2000, 2500]: out = clustering.dendrogram_with_colours( log_data.loc[mad_log_srt.index[:NGENE]], col_colours=col_colours, vertical=False, metric='correlation', method='average', ) out['fig'].savefig(os.path.join( outdir, "gbm_nsc_correlation_dendrogram_logtransform_top%d.png" % NGENE), dpi=200) out['fig'].savefig( os.path.join( outdir, "gbm_nsc_correlation_dendrogram_logtransform_top%d.pdf" % NGENE)) cg = clustering.plot_correlation_clustermap( log_data.loc[mad_log_srt.index[:NGENE]],
ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya.pdf"))

# by cell line
# Strip every non-digit character from the sample names, leaving only the digits.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would leave the names unchanged.
subgroups = data_all_n.columns.str.replace(r'[^0-9]*', '', regex=True)
ax = pca_plot_by_group_2d(y, subgroups=subgroups, ellipses=False, auto_scale=False)
ax.legend(loc='upper left', frameon=True, facecolor='w', edgecolor='b')
plt.tight_layout()
ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.png"), dpi=200)
ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.pdf"))

# hierarchical clustering
# NOTE(review): colours are assigned by position (3 x 'b', 2 x 'r', 5 x 'g'), which presumably matches
# the column order of data_all_n and the legend below - confirm
subgroups = pd.DataFrame(
    ['b'] * 3 + ['r'] * 2 + ['g'] * 5,
    index=data_all_n.columns,
    columns=['Prep type']
)
legend_lbl = {'FFPE': 'b', 'frozen': 'r', 'Poly(A)': 'g'}
res = clustering.dendrogram_with_colours(
    comp_data,
    subgroups,
    legend_labels=legend_lbl,
    metric='correlation',
    method='average'
)
res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.png'), dpi=200)
res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.pdf'))

# can add n_gene kwarg here to pick top N genes by MAD:
cg = clustering.plot_correlation_clustermap(comp_data)
cg.savefig(os.path.join(outdir, 'clustering_corr_map.png'), dpi=200)
cg.savefig(os.path.join(outdir, 'clustering_corr_map.pdf'))
# Spearman rank correlation distance
# pdist = spearman_rank_corr(dat)
# dist = hc.distance.squareform(1 - pdist.values)
# lnk = hc.linkage(dist)
# dend = clustering.dendrogram_with_colours(
#     dat,
#     cc,
#     linkage=lnk,
#     vertical=True,
#     legend_labels=leg_dict,
#     fig_kws={'figsize': [14, 6]}
# )

# Pearson correlation distance
dend = clustering.dendrogram_with_colours(
    dat,
    cc,
    vertical=True,
    legend_labels=leg_dict,
    fig_kws={'figsize': [14, 6]}
)

# Pearson with a limited number of probes
# dend = clustering.dendrogram_with_colours(dat.loc[mad.index[:5000]], cc, vertical=True, legend_labels=leg_dict, fig_kws={'figsize': [14, 6]})

dend['fig'].savefig(
    os.path.join(outdir, "cluster_ipsc_esc_fb_all_probes.png"),
    dpi=200
)

# similar, but clustermap (dendrogram + heatmap)
# the heatmap shows the first 5000 rows of the mad index, while the column linkage is reused from the
# dendrogram computed above
gc = clustering.plot_clustermap(
    dat.loc[mad.index[:5000]],
    cmap='RdBu_r',
    col_linkage=dend['linkage'],
    col_colors=cc
)
clustering.add_legend(leg_dict, gc.ax_heatmap, loc='right')
gc.gs.update(bottom=0.2, right=0.82)
obj_salmon = loader.load_by_patient(['ICb1299', '3021'], source='salmon', type='cell_culture', include_control=False) # cluster plot tpm = filter.filter_by_cpm(obj_salmon.data, min_cpm=1, min_n_samples=4) batch_colours = common.COLOUR_BREWERS[len(obj_salmon.meta.batch.unique())] line_colours = common.COLOUR_BREWERS[2] cc = pd.DataFrame(line_colours[0], index=tpm.columns, columns=['Batch', 'Cell line']) aa, bb = obj_salmon.meta.batch.factorize() for i in range(aa.max()): cc.loc[aa == i, 'Batch'] = batch_colours[i] cc.loc[cc.index.str.contains('3021'), 'Cell line'] = line_colours[1] cg = clustering.dendrogram_with_colours( np.log2(tpm + eps), cc, ) cg['fig'].savefig(os.path.join(outdir, "dendrogram_pearson_log_tpm_all_genes.png"), dpi=200) # pca plot p = PCA() y = p.fit_transform(np.log2(tpm + eps).transpose()) fig = plt.figure() ax = fig.add_subplot(111) for a, b in enumerate(bb): ax.scatter( y[aa == a, 0], y[aa == a, 1], facecolor=batch_colours[a], edgecolor='k', s=30,
pdbg_log = np.log2(pdbg + eps) mad_log = transformations.median_absolute_deviation(pdbg_log).sort_values( ascending=False) row_colours = pd.DataFrame('gray', index=pdbg_log.columns, columns=['']) row_colours.loc[row_colours.index.str.contains('IPSC')] = '#fdc086' row_colours.loc[row_colours.index.str.contains( r'DURA[0-9]*_NSC')] = '#7fc97f' row_colours.loc[row_colours.index.str.contains('GIBCO')] = '#96daff' for n_t in n_gene_try: fname = "clustering_by_gene_corr_top%d_by_mad.{ext}" % n_t fname_log = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t d = clustering.dendrogram_with_colours( pdbg_log.loc[mad_log.index[:n_t]], row_colours, fig_kws={'figsize': (5.5, 10)}, vertical=False) d['fig'].savefig(os.path.join(outdir, fname_log.format(ext='png')), dpi=200) plt.draw() plt.close('all') # bring in reference data # IDs (if req), lab (appears in label), loader ref_dats = [ ( None, 'Barres et al.',