def load_methylation(
        pids,
        ref_names=None,
        norm_method='swan',
        ref_name_filter=None,
        units='beta'
):
    """
    Load and prepare the Illumina methylation data.

    :param pids: Patient IDs, forwarded to ``loader.load_by_patient``.
    :param ref_names: If not None, forwarded to ``loader.load_reference``; the resulting reference object is
        combined with the patient object in a ``MultipleBatchLoader``.
    :param norm_method: Normalisation method, forwarded to both loaders.
    :param ref_name_filter: If not None (and ``ref_names`` is given), the reference object is filtered with
        ``filter_by_sample_name(ref_name_filter, exact=True)`` before being combined.
    :param units: 'm' or 'M' converts the data with ``process.m_from_beta``. Any other value leaves the loaded
        values untouched (presumably beta values - confirm against the loader).
    :return: Tuple ``(obj, anno)``. ``obj`` is the loader object with its ``data`` attribute replaced by the
        reduced table; ``anno`` is the annotation restricted to the same rows.
    """
    # patient data
    obj = loader.load_by_patient(pids, norm_method=norm_method)
    anno = loader.load_illumina_methylationepic_annotation()

    # reference data
    if ref_names is not None:
        ref_obj = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            ref_obj.filter_by_sample_name(ref_name_filter, exact=True)
        obj = loader.loader.MultipleBatchLoader([obj, ref_obj])

    # drop every row that has a missing value in any column
    me_data = obj.data.dropna()

    # Accept either case, matching the case-insensitive check in combine_data_meta. A membership test (rather
    # than units.lower()) keeps non-string values such as None working as before.
    if units in ('m', 'M'):
        me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)
    anno = anno.loc[common_probes]
    me_data = me_data.loc[common_probes]

    # the loader object is updated in place so that the caller sees the reduced table
    obj.data = me_data

    return obj, anno
def combine_data_meta(data_arr, meta_arr, units='beta'):
    """
    Combine several data tables, and their matching metadata tables, into a single table of each.

    :param data_arr: Sequence of pandas data tables. They are concatenated column-wise with an inner join, so
        only row labels present in every table are kept.
    :param meta_arr: Sequence of pandas metadata tables, one per entry of ``data_arr``. They are concatenated
        row-wise with an outer join (union of columns, sorted).
    :param units: If 'm' (case-insensitive), the combined data are converted with ``process.m_from_beta``.
        Any other value leaves the values as supplied.
    :return: Tuple ``(meta, dat)`` - note that the metadata comes first.
    :raises ValueError: If ``data_arr`` and ``meta_arr`` differ in length.
    """
    if len(data_arr) != len(meta_arr):
        raise ValueError("data_arr and meta_arr must have the same size")

    # column-wise concatenation; the inner join keeps only the rows shared by every input
    dat = pd.concat(data_arr, axis=1, join='inner')
    # row-wise concatenation; the outer join keeps every metadata column
    meta = pd.concat(meta_arr, axis=0, join='outer', sort=True)

    if units.lower() == 'm':
        # convert to M values
        dat = process.m_from_beta(dat)

    # drop any infinite valued probes (should be very few)
    # NOTE(review): this check runs whichever units are requested, and np.isfinite also flags NaN, so rows
    # containing missing values are dropped (and counted) here too.
    inft = (~np.isfinite(dat)).any(axis=1)
    if inft.any():
        dat = dat.loc[~inft]
        # Parenthesised single-argument form: prints the same text on Python 2 and is valid on Python 3.
        print("Dropped %d probes with infinite M values" % inft.sum())

    return meta, dat
# remove a few # ix = obj.meta.type != 'astrocyte' # obj.filter_samples(ix) # # ix = obj.meta.type != 'iAPC' # obj.filter_samples(ix) # # ix = ~obj.meta.index.str.contains('GBM') # obj.filter_samples(ix) # ix = obj.meta.index != 'H9 NPC (Encode EPIC)' # obj.filter_samples(ix) bdat = obj.data mdat = process.m_from_beta(bdat) if qn_method is not None: mdat = transformations.quantile_normalisation(mdat, method=qn_method) # tidy up batch IDs obj.meta.loc[obj.meta.batch.isnull(), 'batch'] = obj.meta.loc[obj.meta.batch.isnull(), 'batch_1'] obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics', '2016-12-19') # the only batch names without letters are ours obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'), 'batch'] = 'This study' # PCA plot (by batch and cell type)
# 'p62_3_shB+C': 'shBMI1shCHD7', # 'p62_3_Scr': 'scramble', # # }) # condition = condition.loc[meta.index] # meta.insert(0, 'condition', condition) # # cell_line = pd.Series('3021', index=meta.index) # cell_line[cell_line.index.str.contains('1299')] = 'ICb1299' # cell_line[cell_line.index.str.contains('p62')] = 'ICb1299' # meta.insert(0, 'cell_line', cell_line) anno = loader.load_illumina_methylationepic_annotation() me_data = obj.data.dropna() me_data = process.m_from_beta(me_data) # reduce anno and data down to common probes common_probes = anno.index.intersection(me_data.index) anno = anno.loc[common_probes] # plot PCA p = pca.PCA() pca_dat = p.fit_transform(me_data.transpose()) fig = plt.figure() ax = fig.add_subplot(111) marker_groups = meta.cell_line
anno = loader.load_illumina_methylationepic_annotation() our_obj.meta.insert( 0, 'patient_id', our_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) # load validation data val_obj = loader.load_reference('GSE92462_450k', norm_method=norm_method) # filter val_obj.filter_samples(val_obj.meta.type.isin(['GBM (GSC)', 'NSC'])) # TODO: upload to the classifier and run (toggle this so it's only run once) # combine and reduce probes obj = loader.loader.MultipleBatchLoader([our_obj, val_obj]) dat = process.m_from_beta(obj.data) meta = obj.meta common_probes = anno.index.intersection(dat.index) dat = dat.reindex(common_probes) anno = anno.reindex(common_probes) dmr_hash_dict = dict(dmr_params) dmr_hash_dict['norm_method'] = norm_method the_hash = tsgd.dmr_results_hash(meta.sort_index().index.tolist(), dmr_hash_dict) filename = 'dmr_results_450k_validation.%d.pkl' % the_hash fn = os.path.join(DMR_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Loading pre-computed DMR results from %s", fn)
ffpe_obj = loader.load_by_patient(pids, norm_method=norm_method, type='ffpe') anno = loader.load_illumina_methylationepic_annotation() # add patient ID column to metadata cc_obj.meta.insert( 0, 'patient_id', cc_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) ffpe_obj.meta.insert( 0, 'patient_id', [hgic_consts.NH_ID_TO_PATIENT_ID_MAP[t] for t in ffpe_obj.meta.index]) ffpe_obj.meta.insert(1, 'type', 'ffpe') dat_cc = process.m_from_beta(cc_obj.data).sort_index() # replace CC data with those normed differently (in R) # dat_cc = pd.read_csv('cell_culture_swan_one_norm.csv', header=0, index_col=0).sort_index() dat_ffpe = process.m_from_beta(ffpe_obj.data).sort_index() dat = pd.concat((dat_cc, dat_ffpe), axis=1, join='inner') meta = pd.concat((cc_obj.meta, ffpe_obj.meta), axis=0, join='outer', sort=True) meta.loc[meta.batch.isnull(), 'batch'] = meta.loc[meta.batch.isnull(), 'batch_1'] common_probes = anno.index.intersection(dat.index) dat = dat.reindex(common_probes) anno = anno.reindex(common_probes)
logger = log.get_console_logger() if __name__ == '__main__': outdir = output.unique_output_dir() # load methylation and DMR data meth_obj = methylation_loader.load_by_patient(consts.PIDS, include_control=False) meth_obj.filter_by_sample_name(consts.S1_METHYL_SAMPLES_GIC + consts.S1_METHYL_SAMPLES_INSC) meth_obj.meta.insert( 0, 'patient_id', meth_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) mdat = process.m_from_beta(meth_obj.data) norm_method_s1 = 'swan' dmr_params = consts.DMR_PARAMS de_params = consts.DE_PARAMS DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr') DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') anno = methylation_loader.load_illumina_methylationepic_annotation() # use a hash on the PIDs and parameters to ensure we're looking for the right results dmr_hash_dict = dict(dmr_params) dmr_hash_dict['norm_method'] = norm_method_s1 # load DMR results
k = 'GIC-FB syn' samples = consts.S1_METHYL_SAMPLES_GIC + consts.S1_METHYL_SAMPLES_FB this_pids = ['018', '019', '030', '031', '017', '050', '054', '026', '052'] pids_included[k] = this_pids try: dmr_res_fb_syn = load_dmr_results(anno, samples) except IOError: logger.info("%s. Computing results.", k) fn = get_hashed_filename(samples, norm_method=norm_method, dmr_params=dmr_params) me_obj, this_anno = load_methylation_data(samples, anno, norm_method=norm_method) me_data = process.m_from_beta(me_obj.data) data_loaded[k] = me_obj dmr_res_fb_syn = tsgd.paired_dmr(me_data, me_obj.meta, this_anno, this_pids, dmr_params, type1='GBM', type2='FB') dmr_res_fb_syn.to_pickle(fn, include_annotation=False) logger.info("Saved DMR results to %s", fn) all_results[k] = dmr_res_fb_syn # GIC-iAPC (syngeneic)
axs[j, i].scatter(dat.loc[idx, cols[0]], dat.loc[idx, cols[1]]) axs[j, i].set_title( "%s%s r=%.3f" % (lbl, pid, stats.linregress(dat.loc[idx, cols[0]], dat.loc[idx, cols[1]]).rvalue)) fig.tight_layout() return fig, axs if __name__ == '__main__': outdir = output.unique_output_dir("methylation_replicates", reuse_empty=True) pids = ['017', '050', '054', '061'] b, me_meta = methylation_array.load_by_patient(pids) m = process.m_from_beta(b) mad = transformations.median_absolute_deviation(m).sort_values( ascending=False) fig = plt.figure() ax = fig.add_subplot(111) ax.plot(mad.values) ax.set_xlabel("Probe rank by MAD") ax.set_ylabel("MAD value") ax.axvline(50000, ls='--', c='r') fig.savefig(os.path.join(outdir, "MAD_sorted.png"), dpi=200) fig1, axs1 = scatter_plots(m, pids) fig1.savefig(os.path.join(outdir, "correlation_all_probes.png"), dpi=200) fig2, axs2 = scatter_plots(m, pids, mad=mad, top_n_by_mad=50000) fig2.savefig(os.path.join(outdir, "correlation_top_50000.png"), dpi=200)