def compare_mad_genes(data, samples1, samples2=None, max_ng=10000, step=50):
    """
    Test the overlap of two sample sets in terms of the top genes drawn (by descending MAD).

    For every gene count ``n`` in ``step, 2 * step, ..., max_ng`` the top ``n`` entries of each
    MAD ranking are compared and the size of their intersection is expressed as a percentage of
    the size of their union.

    :param data: DataFrame whose columns are samples; the MAD result is expected to be a Series
        indexed by the rows of ``data``.
    :param samples1: Column labels of the first sample set.
    :param samples2: Second sample set. If None, use all samples.
    :param max_ng: Largest number of top genes to compare (inclusive).
    :param step: Increment between successive gene counts.
    :return: Tuple ``(ng, iu_pct)``: the array of gene counts tested and the matching array of
        intersection-over-union percentages.
    """
    d1 = data.loc[:, samples1]
    d2 = data if samples2 is None else data.loc[:, samples2]

    mad1 = transformations.median_absolute_deviation(d1).sort_values(ascending=False)
    mad2 = transformations.median_absolute_deviation(d2).sort_values(ascending=False)

    ng = np.arange(step, max_ng + 1, step)

    iu_pct = []
    for i in ng:
        top1 = mad1.index[:i]
        top2 = mad2.index[:i]
        n_common = top1.intersection(top2).size
        # The union must combine BOTH rankings; taking the union of mad2 with itself would
        # make the denominator independent of the first sample set.
        n_union = top1.union(top2).size
        iu_pct.append(n_common / float(n_union) * 100)

    return ng, np.array(iu_pct)
def plot_clustermap(data, yugene=False, n_genes=N_GENES, yugene_resolve_ties=False, **kwargs):
    """
    Draw a clustermap of the rows of ``data`` with the highest MAD.

    :param data: DataFrame; MAD is computed with ``axis=1`` and the top rows are selected by label.
    :param yugene: If True, apply ``process.yugene_transform`` to the data first.
    :param n_genes: Number of top rows (by descending MAD) to keep.
    :param yugene_resolve_ties: Forwarded to the YuGene transform as ``resolve_ties``.
    :param kwargs: Passed to ``clustering.plot_clustermap``; ``cmap`` defaults to 'RdBu_r'.
    :return: The clustermap object. When it can be computed, ``row_index`` is attached, holding
        the selected row labels in dendrogram order.
    """
    if 'cmap' not in kwargs:
        kwargs['cmap'] = 'RdBu_r'

    values = process.yugene_transform(data, resolve_ties=yugene_resolve_ties) if yugene else data

    ranked = transformations.median_absolute_deviation(values, axis=1).sort_values(ascending=False)
    top_mad = ranked.index[:n_genes]
    subset = values.loc[top_mad]

    # columns are clustered using average linkage on correlation distance
    z = hierarchy.linkage(subset.transpose(), method='average', metric='correlation')
    cg = clustering.plot_clustermap(subset, col_linkage=z, **kwargs)

    plt.setp(cg.ax_heatmap.xaxis.get_ticklabels(), rotation=90)
    cg.gs.update(bottom=0.2)

    # Expose the reordered row labels for convenience. This is deliberately best-effort: some
    # kwargs may mean no row dendrogram exists, in which case the attribute is simply not set.
    try:
        cg.row_index = top_mad[cg.dendrogram_row.reordered_ind]
    except Exception:
        pass

    return cg
def plot_correlation_clustermap(data, row_colors=None, n_gene=None, method='average'):
    """
    Plot a clustermap of ``1 - correlation`` between the columns of ``data``.

    :param data: DataFrame; the pairwise correlation is taken between its columns.
    :param row_colors: Used for both the row and the column colour annotations.
    :param n_gene: If supplied, this is the number of genes to use, ordered by descending MAD
    :param method: Linkage method passed to ``hc.linkage``.
    :return: The seaborn clustermap object.
    """
    if n_gene is not None:
        # keep only the n_gene rows with the largest MAD
        ranked = transformations.median_absolute_deviation(data).sort_values(ascending=False)
        data = data.loc[ranked.index[:n_gene]]

    dissim = 1. - data.corr()
    lkg = hc.linkage(dissim, method=method)

    # the same linkage and colours are used on both axes
    cg = sns.clustermap(
        dissim,
        cmap='RdBu_r',
        row_colors=row_colors,
        col_colors=row_colors,
        row_linkage=lkg,
        col_linkage=lkg,
    )

    label_style = {'fontsize': 14}
    plt.setp(cg.ax_heatmap.get_xticklabels(), rotation=90, **label_style)
    plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0, **label_style)

    # shift the margins a bit to fit axis tick labels
    cg.gs.update(bottom=0.2, right=0.8, top=0.99, left=0.01)
    return cg
def cluster_data_with_threshold(data, min_val=None, n=None, mad=None, min_over=2, transform=None, **kwargs):
    """
    Filter, optionally transform and optionally reduce the data by MAD, then plot a clustermap.

    :param data: DataFrame to cluster.
    :param min_val: If this and ``min_over`` are both not None, keep only rows in which more than
        ``min_over`` values exceed ``min_val``.
    :param n: If not None, keep only the ``n`` rows with the largest MAD.
    :param mad: Optional pre-computed MAD Series. It is sorted here and must contain every row
        that remains in ``data`` after filtering. Entries for rows no longer present are ignored
        when selecting the top ``n``.
    :param min_over: See ``min_val``.
    :param transform: Optional callable applied to the (filtered) data.
    :param kwargs: Passed to ``clustering.plot_clustermap``. ``cmap`` defaults to 'RdBu_r' and
        ``metric`` to 'correlation'; either may be overridden.
    :return: Tuple ``(cm, mad)``: the clustermap object and the sorted MAD Series (None if no MAD
        was supplied or computed).
    :raises AttributeError: If a pre-computed MAD lacks entries for some remaining rows.
    """
    if min_val is not None and min_over is not None:
        keep = (data > min_val).sum(axis=1) > min_over
        data = data.loc[keep]

    if transform is not None:
        data = transform(data)

    if n is not None:
        if mad is None:
            mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)
        else:
            mad = mad.sort_values(ascending=False)
            if len(mad.index.intersection(data.index)) != data.shape[0]:
                raise AttributeError(
                    "If a pre-computed MAD is supplied, it must contain all required entries"
                )
        # A pre-computed MAD may still rank rows that the threshold filter removed above; drop
        # those labels before taking the top n so that only existing rows are selected.
        ranked = mad.index[mad.index.isin(data.index)]
        data = data.loc[ranked[:n]]

    # Use defaults rather than hard-coded keywords so that a caller-supplied cmap/metric in
    # kwargs overrides them instead of raising a duplicate-keyword TypeError.
    kwargs.setdefault('cmap', 'RdBu_r')
    kwargs.setdefault('metric', 'correlation')
    cm = clustering.plot_clustermap(data, **kwargs)
    cm.gs.update(bottom=0.2)
    return cm, mad
def hc_plot_dendrogram(data, row_colours, mad=None, n_ftr=3000, metric='correlation', **kwargs):
    """
    Plot a dendrogram showing the result of hierarchical clustering of the data, using the first
    ``n_ftr`` features of the MAD ranking.

    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD Series. It is used as supplied (not re-sorted here).
        If None, it is computed from ``data`` and sorted in descending order.
    :param n_ftr: Number of features to keep.
    :param metric: Distance metric passed to the dendrogram routine.
    :param kwargs: Passed to ``clustering.dendrogram_with_colours``; ``fig_kws`` defaults to a
        5.5 x 10 figure size.
    :return: Whatever ``clustering.dendrogram_with_colours`` returns.
    """
    kwargs.setdefault('fig_kws', {'figsize': (5.5, 10)})

    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)

    top_ftr = mad.index[:n_ftr]
    return clustering.dendrogram_with_colours(
        data.loc[top_ftr],
        row_colours,
        vertical=False,
        metric=metric,
        **kwargs
    )
def plot_dendrogram(obj_arr, n_by_mad=None, qn_method=None, eps=0.01, min_val=1, n_above_min=3, vertical=False, figsize=(7, 8), **kwargs):
    """
    Plot a coloured dendrogram of the log2 data held by one or more loader objects.

    :param obj_arr: Sequence of loader objects. More than one is combined with
        ``loader.MultipleBatchLoader``; otherwise the first element is used directly.
    :param n_by_mad: If not None, keep only this many rows, ordered by descending MAD.
    :param qn_method: If not None, quantile normalise the log data using this method.
    :param eps: Offset added to the data before taking log2.
    :param min_val: Forwarded to ``filter_loader``.
    :param n_above_min: Forwarded to ``filter_loader``.
    :param vertical: Forwarded to the dendrogram routine.
    :param figsize: Figure size, forwarded inside ``fig_kws``.
    :param kwargs: Passed to ``clustering.dendrogram_with_colours``.
    :return: Whatever ``clustering.dendrogram_with_colours`` returns.
    """
    combined = loader.MultipleBatchLoader(obj_arr) if len(obj_arr) > 1 else obj_arr[0]
    combined = filter_loader(combined, min_val=min_val, n_above_min=n_above_min)

    log_dat = np.log2(combined.data + eps)
    if qn_method is not None:
        log_dat = transformations.quantile_normalisation(log_dat, method=qn_method)

    if n_by_mad is not None:
        ranked = transformations.median_absolute_deviation(log_dat).sort_values(ascending=False)
        log_dat = log_dat.loc[ranked.index[:n_by_mad]]

    # only the colour array and the legend labels are needed from the helper's three outputs
    colours, _, legend = construct_colour_array_legend_studies(combined.meta)

    return clustering.dendrogram_with_colours(
        log_dat,
        colours,
        vertical=vertical,
        legend_labels=legend,
        fig_kws={'figsize': figsize},
        **kwargs
    )
def hc_plot_dendrogram_vary_n_gene(data, row_colours, mad=None, n_ftr=(1000, 2000, 3000, 5000, 10000), metric='correlation'):
    """
    For each value in n_ftr, plot a dendrogram showing the result of hierarchical clustering of
    the data using that many features (taken from the start of the MAD ranking).

    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD Series, used as supplied (not re-sorted here). If None,
        it is computed from ``data`` and sorted in descending order.
    :param n_ftr: The feature counts to test
    :param metric: Distance metric passed to the dendrogram routine.
    :return: Dictionary keyed by feature count; each value is the corresponding output of
        ``clustering.dendrogram_with_colours``.
    """
    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(ascending=False)

    return {
        n_top: clustering.dendrogram_with_colours(
            data.loc[mad.index[:n_top]],
            row_colours,
            fig_kws={'figsize': (5.5, 10)},
            vertical=False,
            metric=metric,
        )
        for n_top in n_ftr
    }
def plot_correlation_clustermap(data, row_colors=None, n_gene=None, method='average', metric='correlation', distance=None, **kwargs):
    """
    Plot a clustermap of the pairwise similarity between the columns of ``data``.

    :param data: DataFrame; distances are computed between its columns.
    :param row_colors: Used for both the row and the column colour annotations.
    :param n_gene: If supplied, this is the number of genes to use, ordered by descending MAD
    :param method: Linkage method. NOTE: only used when ``distance`` is None; a supplied
        ``distance`` is passed to ``hc.linkage`` without it.
    :param metric: Distance metric used when ``distance`` is None. If it equals 'correlation',
        the plotted matrix is ``1 - distance``, whether or not ``distance`` was supplied.
    :param distance: Optional pre-computed distances, passed to both ``hc.linkage`` and
        ``squareform`` (presumably in condensed form - TODO confirm).
    :param kwargs: Passed to seaborn's `clustermap`
    :return: The seaborn clustermap object.
    """
    if n_gene is not None:
        # reduce data to the specified number using MAD
        ranked = transformations.median_absolute_deviation(data).sort_values(ascending=False)
        data = data.loc[ranked.index[:n_gene]]

    if distance is None:
        rl = hc.linkage(data.transpose(), method=method, metric=metric)
        condensed = hc.distance.pdist(data.transpose(), metric=metric)
        sq = hc.distance.squareform(condensed)
    else:
        rl = hc.linkage(distance)
        sq = hc.distance.squareform(distance)

    # invert distance so that closer samples have a larger number
    # do this even if distances have been provided directly
    if metric == 'correlation':
        sq = 1 - sq
    # else:
    # TODO: add specific versions for other metrics if required
    # sq = max(sq.flat) - sq

    # wrap in a dataframe so that the plot carries the correct labels
    labels = data.columns
    sq = pd.DataFrame(data=sq, index=labels, columns=labels)

    cg = sns.clustermap(
        sq,
        cmap='RdBu_r',
        row_colors=row_colors,
        col_colors=row_colors,
        row_linkage=rl,
        col_linkage=rl,
        **kwargs
    )

    label_style = {'fontsize': 14}
    plt.setp(cg.ax_heatmap.get_xticklabels(), rotation=90, **label_style)
    plt.setp(cg.ax_heatmap.get_yticklabels(), rotation=0, **label_style)

    # shift the margins a bit to fit axis tick labels
    cg.gs.update(bottom=0.2, right=0.8, top=0.99, left=0.01)
    return cg
def compute_pairwise_corr(cell_line_dat, tissue_dat, n_genes=None):
    """
    Correlate every cell line column with every tissue column.

    :param cell_line_dat: pd DataFrame containing cell line data
    :param tissue_dat: pd DataFrame containing tissue data
    :param n_genes: If supplied, this is an integer. Rows are ranked by the MAD computed over
        both datasets together (column-wise concatenation) and only the top n_genes are used.
    :return: DataFrame with one row per tissue column and one column per cell line column.
    """
    if n_genes is not None:
        pooled = pd.concat((cell_line_dat, tissue_dat), axis=1)
        ranked = transformations.median_absolute_deviation(pooled).sort_values(ascending=False)
        top = ranked.index[:n_genes]
        cell_line_dat = cell_line_dat.loc[top]
        tissue_dat = tissue_dat.loc[top]

    def _corr_with_tissue(cell_line_col):
        # correlation of a single cell line column against each tissue column
        return tissue_dat.corrwith(cell_line_col)

    return cell_line_dat.apply(_corr_with_tissue, axis=0)
def pwise_corr_boxplot(dat, n_gene=None):
    """
    Boxplot of the pairwise correlations between the 'eNSC.med' samples and three other groups.

    :param dat: DataFrame; correlations are taken between its columns, and groups are picked by
        regex match on the column names.
    :param n_gene: If supplied, only the n_gene rows with the largest MAD are used.
    :return: The matplotlib axes holding the boxplot.
    """
    if n_gene is not None:
        ranked = transformations.median_absolute_deviation(dat).sort_values(ascending=False)
        dat = dat.loc[ranked.index[:n_gene]]

    the_corr = dat.corr()

    # rows are always the eNSC.med samples; columns vary by comparison group
    ref_rows = the_corr.index.str.contains(r'eNSC.med')
    group_patterns = [r'eNSC.mouse', r'mDura.*mouse', r'mDura.*human']
    pwise_corr = [
        the_corr.loc[ref_rows, the_corr.columns.str.contains(patt)].values.flatten()
        for patt in group_patterns
    ]

    ax = plt.figure().add_subplot(111)
    ax.boxplot(pwise_corr)
    ax.set_xticklabels([
        'eNSCmed - eNSCmouse',
        'eNSCmed - iNSCmouse',
        'eNSCmed - iNSChuman'
    ])
    ax.set_ylabel('pairwise correlation')
    return ax
# Script section: plot housekeeping gene (HKG) levels relative to a reference sample, then plot
# abundance against abundance percentile with selected genes highlighted.

# genes to label on the abundance plot: the housekeeping genes plus BMI1
label_symbols = hkg + ['BMI1']
label_ens = gene_symbol_to_ensembl(label_symbols)
# housekeeping rows only, with columns in sorted order; re-index by symbol for plotting
hkg_dat = dat_n.loc[hkg_ens, sorted(dat_n.columns)]
hkg_dat.index = pd.Index(hkg, name='')
# express each gene relative to its value in the reference column
hkg_dat_rel = hkg_dat.divide(hkg_dat.loc[:, ref], axis=0)
ax = hkg_dat_rel.transpose().plot.bar()
ax.set_ylim([0, 3.4])
plt.tight_layout()
ax.figure.savefig(os.path.join(outdir, 'housekeeping_levels.png'), dpi=200)
# identifying stable HKG
# ordinal ranks of the median counts, scaled by the number of genes to give a fraction in (0, 1]
ranked_count = pd.Series(rankdata(median_count, method='ordinal'), index=median_count.index)
ranked_perc = ranked_count / float(ranked_count.shape[0])
mad = transformations.median_absolute_deviation(dat_n)
fig = plt.figure()
ax = fig.add_subplot(111)
# all genes first, then the labelled genes overplotted in red
ax.scatter(ranked_perc, np.log10(median_count))
ax.scatter(ranked_perc.loc[label_ens], np.log10(median_count.loc[label_ens]), c='r')
# annotate each highlighted point with its gene symbol
for g, e in zip(label_symbols, label_ens):
    ax.text(ranked_perc.loc[e], np.log10(median_count.loc[e]), g)
ax.set_xlabel("Abundance percentile")
ax.set_ylabel("Log10 normalised abundance")
ax.figure.savefig(os.path.join(outdir, 'hkg_abundance.png'), dpi=200)
# show the total variation using fill_between
min_count = dat_n.min(axis=1)
min_n_samples=1) matched_log_cpm = log_cpm(matched_data) row_colours = pd.DataFrame(common.COLOUR_BREWERS[2][0], index=matched_data.columns, columns=['Library']) row_colours.loc[row_colours.index.str.contains( 'smartseq')] = common.COLOUR_BREWERS[2][1] # clustering plot cg = clustering.plot_correlation_clustermap(matched_log_cpm, row_colors=row_colours) cg.gs.update(bottom=0.35, right=0.65) cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_all_genes.png"), dpi=200) mad = transformations.median_absolute_deviation( matched_log_cpm).sort_values(ascending=False) cg = clustering.plot_correlation_clustermap( matched_log_cpm.loc[mad.index[:3000]], row_colors=row_colours) cg.gs.update(bottom=0.35, right=0.65) cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_3000_genes.png"), dpi=200) # repeat with TMM norming matched_log_cpm_n = transformations.edger_tmm_normalisation_cpm( matched_data) cg = clustering.plot_correlation_clustermap(matched_log_cpm_n, row_colors=row_colours) cg.gs.update(bottom=0.35, right=0.65) cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_all_genes_tmm.png"), dpi=200)
def plot_clustermap(obj, quantile_norm, method='average', metric='correlation', n_gene_by_mad=5000, n_gene_for_heatmap=500, fmin=0.05, fmax=0.95, eps=0.01, cell_line_colours=None):
    """
    Plot a clustermap of the log2 data held by a loader object, with cell type and study colours.

    Columns are clustered using the top ``n_gene_by_mad`` rows (by descending MAD), while the
    heatmap itself only shows the top ``n_gene_for_heatmap`` rows.

    :param obj: Loader object exposing ``data`` and ``meta`` (``meta`` needs ``type`` and
        ``batch`` columns).
    :param quantile_norm: If not None, quantile normalise the log data using this method.
    :param method: Linkage method for the column clustering.
    :param metric: Distance metric for the column clustering.
    :param n_gene_by_mad: Number of top-MAD rows used to compute the column linkage.
    :param n_gene_for_heatmap: Number of top-MAD rows shown in the heatmap.
    :param fmin: Fraction into the sorted heatmap values at which the lower colour limit is
        read; 0.5 is then subtracted.
    :param fmax: As ``fmin`` for the upper colour limit; 0.5 is then added.
    :param eps: Offset added to the data before taking log2.
    :param cell_line_colours: Optional mapping from cell type label to colour. Must contain the
        key 'iPSC (this study)'. A default palette is used if None.
    :return: The clustermap object.
    """
    if cell_line_colours is None:
        cell_line_colours = {
            'FB': '#fff89e',  # yellow
            'GBM (this study)': '#e6e6e6',  # light gray
            'GBM': '#4d4d4d',  # dark grey
            'ESC': '#ff7777',  # light red
            'iPSC': '#990000',  # dark red
            'iPSC (this study)': '#fdc086',  # orange
            'NSC': '#006600',  # dark green
            'iNSC (this study)': '#7fc97f',  # green
        }

    the_dat = np.log2(obj.data + eps)
    if quantile_norm is not None:
        the_dat = transformations.quantile_normalisation(the_dat, method=quantile_norm)
    the_mad = transformations.median_absolute_deviation(the_dat).sort_values(ascending=False)
    cc, _, leg_dict = construct_colour_array_legend_studies(obj.meta)

    # linkage: computed on the larger gene set, before the data are cut down for display
    lkg = hc.linkage(
        the_dat.loc[the_mad.index[:n_gene_by_mad]].transpose(),
        method=method,
        metric=metric,
    )

    # ref line colours
    for k, v in cell_line_colours.items():
        cc.loc[obj.meta.type == k, 'Cell type'] = v

    # our line colours
    cc.loc[obj.meta.batch.str.contains('wtchg') & (obj.meta.type == 'iPSC'), 'Cell type'] = \
        cell_line_colours['iPSC (this study)']

    # get appropriate clims from the rows that will actually be displayed
    # (this previously referenced an undefined name instead of the n_gene_for_heatmap parameter)
    heatmap_dat = the_dat.loc[the_mad.index[:n_gene_for_heatmap]]
    flat_sorted = np.sort(heatmap_dat.values.flatten())
    vmin = flat_sorted[int(len(flat_sorted) * fmin)] - 0.5
    vmax = flat_sorted[int(len(flat_sorted) * fmax)] + 0.5

    gc = clustering.plot_clustermap(
        heatmap_dat,
        cmap='RdBu_r',
        col_linkage=lkg,
        col_colors=cc,
        vmin=vmin,
        vmax=vmax,
    )

    leg_entry = {
        'class': 'patch',
        'edgecolor': 'k',
        'linewidth': 1.,
    }
    leg_dict2 = collections.OrderedDict()
    leg_dict2['Cell type'] = collections.OrderedDict()

    # only include legend entries for cell types present in the metadata
    present_types = obj.meta.type.unique()
    for k in sorted(cell_line_colours):
        if k.replace(' (this study)', '') in present_types:
            leg_dict2['Cell type'][k] = dict(leg_entry)
            leg_dict2['Cell type'][k].update({'facecolor': cell_line_colours[k]})

    leg_dict2['Study'] = {}
    for k, v in leg_dict['Study'].items():
        leg_dict2['Study'][k] = dict(leg_entry)
        leg_dict2['Study'][k].update({'facecolor': v})

    common.add_custom_legend(gc.ax_heatmap, leg_dict2, loc_outside=True, fontsize=14)
    format_clustermap(gc)

    return gc
5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 400, 500, 750, 1000 ] nrem = [(gene_idx & ((obj.data > t).sum(axis=1) > 10)).sum() for t in co] fig = plt.figure() ax = fig.add_subplot(111) ax.plot(co, nrem) ax.set_xlabel("10 / 21 samples must have count higher than x") ax.set_ylabel("Number of genes remaining") fig.tight_layout() fig.savefig(os.path.join(outdir, "filtering_effect.png"), dpi=200) fig.savefig(os.path.join(outdir, "filtering_effect.pdf")) idx = gene_idx & ((obj.data > 10).sum(axis=1) > 10) data = obj.data.loc[idx] mad = transformations.median_absolute_deviation(data).sort_values( ascending=False) logdata = np.log(data + 1) # start with a dendrogram col_colours = clustering.generate_colour_map_dict(obj.meta, 'sample', matches, label='Patient', non_matching='gray') out = clustering.dendrogram_with_colours(logdata, col_colours=col_colours, vertical=False) dist = clustering.dendrogram_threshold_by_nclust(out['linkage'], 3) out['dendrogram_ax'].axvline(dist, ls='--', c='gray') out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.png"), dpi=200)
abg = pd.concat((patient_data_by_gene, ref_by_gene), axis=1) # discard mitochondrial genes if remove_mt: idx = ~abg.index.isin(mt_ensg) abg = abg.loc[idx] # renorm if units == 'tpm': abg = abg.divide(abg.sum(), axis=1) * 1e6 # discard genes expressed at low values idx = (abg > min_val).sum(axis=1) > min_n abg = abg.loc[idx] abg_log = np.log2(abg + eps) amad_log = transformations.median_absolute_deviation(abg_log).sort_values( ascending=False) if units == 'estimated_counts': # optionally could normalise here? pass row_colours_all = pd.DataFrame('gray', index=abg.columns, columns=['']) row_colours_all.loc[row_colours_all.index.str.contains(r'NSC')] = 'blue' row_colours_all.loc[row_colours_all.index.str.contains(r'NPC')] = 'blue' row_colours_all.loc[row_colours_all.index.str.contains( r'GIBCO')] = '#96daff' row_colours_all.loc[row_colours_all.index.str.contains( r'Fibroblast')] = '#fff89e' row_colours_all.loc[row_colours_all.index.str.contains( r'Fetal')] = 'yellow' row_colours_all.loc[row_colours_all.index.str.contains('ES1')] = '#ff7777'
meta.loc[:, 'study'] = STUDY data = data.loc[:, meta.index] if 'EYS' in data.index: idx = data.index.str.replace('EYS', 'EGFL11') data.index = idx # Kool # STUDY = 'Kool' # data, meta = microarray_data.load_annotated_microarray_gse10327(aggr_field='SYMBOL', aggr_method='max_std') # Northcott # STUDY = 'Northcott' # data, meta = microarray_data.load_annotated_microarray_gse37382(aggr_field='SYMBOL', aggr_method='max_std') # find top genes by MAD - all genes included mad = transformations.median_absolute_deviation(data, axis=1) top_genes = mad.sort_values(ascending=False).index[:n_genes] print "Selecting top %d genes by MAD from %s study..." % (n_genes, STUDY) print "%d / %d genes (nanostring)" % (len( top_genes.intersection(all_nstring)), len(all_nstring)) print "%d / %d genes (northcott)" % (len( top_genes.intersection(all_ncott)), len(all_ncott)) # Zhao data zhao_sample_names = ( 'Pt1299', 'Pt1487', 'Pt1595', 'ICb1299-III', 'ICb1299-IV', 'ICb1487-I',
# Script section: filter the NSC data, normalise and log-transform it, then label rows as
# top-by-MAD or remainder for plotting.

# keep rows with fewer than MAX_BELOW values under MIN_COUNT, or with any value over HIGH_COUNT
nz_idx = (
    ((data_nsc < MIN_COUNT).sum(axis=1) < MAX_BELOW) |
    ((data_nsc > HIGH_COUNT).any(axis=1))
)
data_nsc_nz = data_nsc.loc[nz_idx, :]

# yugene
# NOTE: computed before the in-place increment below
data_nsc_nz_yg = process.yugene_transform(data_nsc_nz)

# add one, norm, take log
data_nsc_nz += 1
# each column is divided by its own total
data_nsc_nz = data_nsc_nz.divide(data_nsc_nz.sum(axis=0), axis=1)
data_nsc_nz = np.log(data_nsc_nz + 1)

# MAD - compute on normalised values
mad_nsc_nz = transformations.median_absolute_deviation(data_nsc_nz).sort_values(ascending=False)
top_idx = mad_nsc_nz.index[:N_GENES]
rem_idx = mad_nsc_nz.index[N_GENES:]

# reduce number of remainder for plotting purposes
# a random N_GENES of the remainder are kept; the rest are dropped (unseeded here, so the
# selection differs between runs unless a seed is set elsewhere)
to_discard = rem_idx[np.random.permutation(rem_idx.size)[N_GENES:]]
data_nsc_nz = data_nsc_nz.drop(to_discard)

# add 'hue' column
data_nsc_nz.loc[:, 'hue'] = 'Remainder'
data_nsc_nz.loc[top_idx, 'hue'] = 'Top %d by MAD' % N_GENES

# generate the plot
# pg = sns.pairplot(data_nsc_nz, hue='hue')

# repeat on YuGene
# Script section: save a dendrogram built from the top clust_n_ftr features, then prepare the
# data subset, linkage and legend entries for a heatmap.

the_dat = mdat
# a single feature count is requested, so plt_dict holds one entry keyed by clust_n_ftr
plt_dict = hc_plot_dendrogram_vary_n_gene(the_dat, row_colours_all, n_ftr=[clust_n_ftr])
for ng, x in plt_dict.items():
    # '%d' is filled in now; '{ext}' is filled per output format below
    fname = "dendrogram_M_corr_top%d_by_mad.{ext}" % ng
    x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)
    x['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200)

# heatmap: use clustering from n=20000 probes (M vals), but show fewer probes values
# pick these using MAD
this_mad = transformations.median_absolute_deviation(mdat).sort_values(
    ascending=False)
this_dat = mdat.loc[this_mad.index[:n_probe_to_show]]

# template for one legend patch; copied per entry so that each gets its own facecolor
leg_entry = {
    'class': 'patch',
    'edgecolor': 'k',
    'linewidth': 1.,
}
# reuse the linkage computed for the dendrogram above
lkg = plt_dict[clust_n_ftr]['linkage']
leg_dict = collections.OrderedDict()

# only add legend entries for colours that actually occur in the colour table
for k in sorted(cell_line_colours):
    if cell_line_colours[k] in row_colours_all.values:
        leg_dict[k] = dict(leg_entry)
        leg_dict[k].update({'facecolor': cell_line_colours[k]})
non_matching='gray') # filter data = filter.filter_by_cpm(data, min_cpm=1, min_n_samples=3, unless_cpm_gt=10) # data = filter.filter_by_cpm(data, min_cpm=1, min_n_samples=3, unless_cpm_gt=None) # normalise by read count cpm = (data + 1).divide((data + 1).sum(axis=0), axis=1) * 1e6 # transform log_data = np.log2(cpm) vst_data = variance_stabilizing_transform(cpm) mad_log_srt = median_absolute_deviation(log_data).sort_values( ascending=False) mad_vst_srt = median_absolute_deviation(vst_data).sort_values( ascending=False) for NGENE in [500, 1000, 1500, 2000, 2500]: out = clustering.dendrogram_with_colours( log_data.loc[mad_log_srt.index[:NGENE]], col_colours=col_colours, vertical=False, metric='correlation', method='average', ) out['fig'].savefig(os.path.join( outdir, "gbm_nsc_correlation_dendrogram_logtransform_top%d.png" % NGENE), dpi=200)
for ng, x in plt_dict.items(): if ng > the_dat.shape[0]: fname = "dendrogram_M_corr_all.{ext}" else: fname = "dendrogram_M_corr_top%d_by_mad.{ext}" % ng x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200) x['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200) plt.close('all') # heatmap: use clustering from n=20000 probes (M vals), but only show top 500 most variable between clusters clust_n_ftr = 20000 n_probe_to_show = 500 lkg = plt_dict[clust_n_ftr]['linkage'] this_mad = transformations.median_absolute_deviation(mdat).sort_values( ascending=False) this_dat = mdat.loc[this_mad.index[:n_probe_to_show]] # heatmap for 3000 probes cm = clustering.plot_clustermap(this_dat, cmap='RdYlBu_r', metric='correlation', col_colors=row_colours_all, col_linkage=lkg, vmin=-10, vmax=10, figsize=(11.8, 10.)) cm.gs.update(bottom=0.25, right=0.99) cm.savefig(os.path.join( outdir, "clustermap_M_corr_linkage%d_heatmap%d.png" %
idx = (pdbg > min_val).sum(axis=1) > min_n pdbg = pdbg.loc[idx] if units == 'estimated_counts': # here we can normalise by library size if desired pass ax = hist_logvalues(patient_data, thresholds=[min_val]) ax.figure.savefig(os.path.join( outdir, "log2_intensities_by_gene_with_min_tpm_threshold.png"), dpi=200) ax.figure.savefig( os.path.join(outdir, "log2_intensities_by_gene_with_min_tpm_threshold.pdf")) mad = transformations.median_absolute_deviation(pdbg).sort_values( ascending=False) pdbg_log = np.log2(pdbg + eps) mad_log = transformations.median_absolute_deviation(pdbg_log).sort_values( ascending=False) row_colours = pd.DataFrame('gray', index=pdbg_log.columns, columns=['']) row_colours.loc[row_colours.index.str.contains('IPSC')] = '#fdc086' row_colours.loc[row_colours.index.str.contains( r'DURA[0-9]*_NSC')] = '#7fc97f' row_colours.loc[row_colours.index.str.contains('GIBCO')] = '#96daff' for n_t in n_gene_try: fname = "clustering_by_gene_corr_top%d_by_mad.{ext}" % n_t fname_log = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t d = clustering.dendrogram_with_colours( pdbg_log.loc[mad_log.index[:n_t]],