def plot_dendrogram(
        obj_arr,
        n_by_mad=None,
        qn_method=None,
        eps=0.01,
        min_val=1,
        n_above_min=3,
        vertical=False,
        figsize=(7, 8),
        **kwargs
):
    """
    Draw a dendrogram with colour annotations for one or more loaders.

    :param obj_arr: Sequence of loader objects. When it holds more than one
        element they are combined with loader.MultipleBatchLoader; otherwise
        the first element is used directly.
    :param n_by_mad: If not None, keep only this many rows, ranked by
        descending median absolute deviation, before clustering.
    :param qn_method: If not None, passed as ``method`` to
        transformations.quantile_normalisation, applied to the log2 data.
    :param eps: Offset added to the data before taking log2.
    :param min_val: Forwarded to filter_loader.
    :param n_above_min: Forwarded to filter_loader.
    :param vertical: Forwarded to clustering.dendrogram_with_colours.
    :param figsize: Figure size, forwarded inside ``fig_kws``.
    :param kwargs: Any further keyword arguments for
        clustering.dendrogram_with_colours.
    :return: Whatever clustering.dendrogram_with_colours returns.
    """
    # A single loader is used as-is; several are merged into one.
    combined = loader.MultipleBatchLoader(obj_arr) if len(obj_arr) > 1 else obj_arr[0]
    filtered = filter_loader(combined, min_val=min_val, n_above_min=n_above_min)

    log_dat = np.log2(filtered.data + eps)
    if qn_method is not None:
        log_dat = transformations.quantile_normalisation(log_dat, method=qn_method)

    if n_by_mad is not None:
        # Rank rows from most to least variable and keep the leading ones.
        mad_ranked = transformations.median_absolute_deviation(log_dat)
        mad_ranked = mad_ranked.sort_values(ascending=False)
        keep_ix = mad_ranked.index[:n_by_mad]
        log_dat = log_dat.loc[keep_ix]

    # The middle element of the returned triple is not needed here.
    colours, _, legend = construct_colour_array_legend_studies(filtered.meta)

    return clustering.dendrogram_with_colours(
        log_dat,
        colours,
        vertical=vertical,
        legend_labels=legend,
        fig_kws={'figsize': figsize},
        **kwargs
    )
# obj.filter_samples(ix) # # ix = obj.meta.type != 'iAPC' # obj.filter_samples(ix) # # ix = ~obj.meta.index.str.contains('GBM') # obj.filter_samples(ix) # ix = obj.meta.index != 'H9 NPC (Encode EPIC)' # obj.filter_samples(ix) bdat = obj.data mdat = process.m_from_beta(bdat) if qn_method is not None: mdat = transformations.quantile_normalisation(mdat, method=qn_method) # tidy up batch IDs obj.meta.loc[obj.meta.batch.isnull(), 'batch'] = obj.meta.loc[obj.meta.batch.isnull(), 'batch_1'] obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics', '2016-12-19') # the only batch names without letters are ours obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'), 'batch'] = 'This study' # PCA plot (by batch and cell type) colour_subgroups = obj.meta.batch c_sub_sorted = sorted(colour_subgroups.unique(), key=lambda x: 'A' if x == 'This study' else x)
rna_cc_dat = filter.filter_by_cpm(rna_cc_dat, min_cpm=min_tpm, min_n_samples=2) # reduce to matching probes probes = rna_cc_dat.index.intersection(rna_ff_dat.index) if remove_mt: probes = probes[~probes.isin(mt_ens)] rna_ff_dat = np.log2(rna_ff_dat.loc[probes] + eps) rna_cc_dat = np.log2(rna_cc_dat.loc[probes] + eps) # QN if apply_qn: rna_ff_dat = transformations.quantile_normalisation(rna_ff_dat) rna_cc_dat = transformations.quantile_normalisation(rna_cc_dat) # correlation plot pdist = pd.DataFrame(index=rna_ff_dat.columns.sort_values(), columns=rna_cc_dat.columns.sort_values(), dtype=float) for ff in pdist.index: for cc in pdist.columns: if dist_metric == 'pearson': pdist.loc[ff, cc] = stats.pearsonr(rna_ff_dat[ff], rna_cc_dat[cc])[0] elif dist_metric == 'spearman': pdist.loc[ff, cc] = stats.spearmanr(rna_ff_dat[ff], rna_cc_dat[cc]).correlation else:
if labels_included[csg]: lbl = None else: lbl = csg labels_included[csg] = True ax.plot(xi, yi, c=colour_map[csg], label=lbl) ax.set_xlabel('M value') ax.set_ylabel('ECDF') ax.set_title(ct) common.legend_outside_axes(ax) fig.subplots_adjust(left=0.1, bottom=0.1, right=0.8, top=0.99) fig.savefig(os.path.join(outdir, "ecdf_batches_%s.png" % ct), dpi=200) # normalise and repeat PCA mdat_qn = transformations.quantile_normalisation(mdat) fig = plt.figure(figsize=(10, 7)) ax = fig.add_subplot(111) p_qn, ax = plot_pca(mdat_qn, colour_subgroups, marker_subgroups=m_subgroups, marker_map=mmap, ax=ax) ax.figure.subplots_adjust(left=0.1, right=0.8) ax.figure.savefig(os.path.join(outdir, "pca_plot_batch_cell_type_all_qn.png"), dpi=200) # now try without GBM # (a) no QN
# Apply the sample mask `ix` (built before this fragment) to obj1.
obj1.filter_samples(ix)

# Dendrogram of the three loaders combined; saved as a PNG in outdir.
dend = plot_dendrogram([obj1, ref_obj, nsc_ref_obj], vertical=False, figsize=(7, 14), qn_method=quantile_norm, n_by_mad=n_gene_by_mad)
dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc_fb_nsc.png"), dpi=200)

# 1b. Heatmap from clustering result of (1a)
n_for_heatmap = 500

# Combine the same three loaders and take log2 of the data (offset by eps).
the_obj = loader.MultipleBatchLoader([obj1, ref_obj, nsc_ref_obj])
the_dat = np.log2(the_obj.data + eps)

# Quantile normalisation is optional: skipped when quantile_norm is None.
if quantile_norm is not None:
    the_dat = transformations.quantile_normalisation(the_dat, method=quantile_norm)

# Median absolute deviation per row, sorted from largest to smallest.
the_mad = transformations.median_absolute_deviation(the_dat).sort_values(
    ascending=False)

# Colour array and legend built from the combined metadata; `st` is not used
# within this fragment.
cc, st, leg_dict = construct_colour_array_legend_studies(the_obj.meta)

# ref line colours
# Each key of cell_line_colours is matched against meta.type; keys that match
# no sample (e.g. the '... (this study)' ones, unless such a type exists)
# leave cc unchanged.
for k, v in cell_line_colours.items():
    cc.loc[the_obj.meta.type == k, 'Cell type'] = v

# our line colours
# Samples whose batch contains 'wtchg' override the reference colour for
# iNSC and iPSC with the '(this study)' variants.
cc.loc[the_obj.meta.batch.str.contains('wtchg') & (the_obj.meta.type == 'iNSC'), 'Cell type'] = cell_line_colours['iNSC (this study)']
cc.loc[the_obj.meta.batch.str.contains('wtchg') & (the_obj.meta.type == 'iPSC'), 'Cell type'] = cell_line_colours['iPSC (this study)']
def plot_clustermap(
        obj,
        quantile_norm,
        method='average',
        metric='correlation',
        n_gene_by_mad=5000,
        n_gene_for_heatmap=500,
        fmin=0.05,
        fmax=0.95,
        eps=0.01,
        cell_line_colours=None
):
    """
    Plot a clustered heatmap of log2 data with cell type / study colour bars.

    Columns are ordered by a hierarchical linkage computed on the
    ``n_gene_by_mad`` rows with the highest median absolute deviation (MAD);
    the heatmap itself shows the top ``n_gene_for_heatmap`` rows by MAD.

    :param obj: Loader exposing ``data`` and ``meta``; ``meta`` must have
        ``type`` and ``batch`` columns.
    :param quantile_norm: If not None, passed as ``method`` to
        transformations.quantile_normalisation, applied to the log2 data.
    :param method: Linkage method, forwarded to hc.linkage.
    :param metric: Distance metric, forwarded to hc.linkage.
    :param n_gene_by_mad: Number of top-MAD rows used to compute the linkage.
    :param n_gene_for_heatmap: Number of top-MAD rows shown in the heatmap and
        used to derive the colour limits.
    :param fmin: Fraction through the sorted heatmap values at which the lower
        colour limit is read (0.5 is then subtracted).
    :param fmax: Fraction through the sorted heatmap values at which the upper
        colour limit is read (0.5 is then added).
    :param eps: Offset added to the data before taking log2.
    :param cell_line_colours: Optional dict mapping cell type label to colour.
        Must contain the key 'iPSC (this study)'. A default palette is used
        when None.
    :return: The object returned by clustering.plot_clustermap.
    """
    if cell_line_colours is None:
        cell_line_colours = {
            'FB': '#fff89e',  # yellow
            'GBM (this study)': '#e6e6e6',  # light gray
            'GBM': '#4d4d4d',  # dark grey
            'ESC': '#ff7777',  # light red
            'iPSC': '#990000',  # dark red
            'iPSC (this study)': '#fdc086',  # orange
            'NSC': '#006600',  # dark green
            'iNSC (this study)': '#7fc97f',  # green
        }

    the_dat = np.log2(obj.data + eps)
    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat, method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(
        ascending=False)
    cc, st, leg_dict = construct_colour_array_legend_studies(obj.meta)

    # linkage: samples (columns) are clustered on the most variable rows
    lkg = hc.linkage(
        the_dat.loc[the_mad.index[:n_gene_by_mad]].transpose(),
        method=method,
        metric=metric,
    )

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[obj.meta.type == k, 'Cell type'] = v

    # our line colours
    # NOTE(review): only iPSC samples from 'wtchg' batches receive a
    # '(this study)' colour here; 'iNSC (this study)' can appear in the legend
    # below but is never assigned to any sample - confirm this is intended.
    cc.loc[obj.meta.batch.str.contains('wtchg') & (obj.meta.type == 'iPSC'), 'Cell type'] = \
        cell_line_colours['iPSC (this study)']

    # get appropriate clims
    # The subset is taken once using the n_gene_for_heatmap parameter, so the
    # colour limits are always derived from exactly the rows that are plotted.
    heat_dat = the_dat.loc[the_mad.index[:n_gene_for_heatmap]]
    flat_sorted = np.sort(heat_dat.values.flatten())
    n_vals = len(flat_sorted)
    # Clamp so that a fraction of 1.0 reads the last value instead of running
    # off the end of the array.
    lo_ix = min(int(n_vals * fmin), n_vals - 1)
    hi_ix = min(int(n_vals * fmax), n_vals - 1)
    vmin = flat_sorted[lo_ix] - 0.5
    vmax = flat_sorted[hi_ix] + 0.5

    gc = clustering.plot_clustermap(
        heat_dat,
        cmap='RdBu_r',
        col_linkage=lkg,
        col_colors=cc,
        vmin=vmin,
        vmax=vmax,
    )

    # Template for every legend patch; copied per entry so that entries do not
    # share one dict.
    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }
    leg_dict2 = collections.OrderedDict()
    leg_dict2['Cell type'] = collections.OrderedDict()

    # Only list cell types present in the metadata; a '(this study)' label is
    # listed when its base type is present.
    types_present = obj.meta.type.unique()
    for k in sorted(cell_line_colours):
        if k.replace(' (this study)', '') in types_present:
            leg_dict2['Cell type'][k] = dict(leg_entry)
            leg_dict2['Cell type'][k].update(
                {'facecolor': cell_line_colours[k]})

    leg_dict2['Study'] = {}
    for k, v in leg_dict['Study'].items():
        leg_dict2['Study'][k] = dict(leg_entry)
        leg_dict2['Study'][k].update({'facecolor': v})

    common.add_custom_legend(gc.ax_heatmap, leg_dict2, loc_outside=True, fontsize=14)
    format_clustermap(gc)

    return gc