def load_methylation(pids, ref_names=None, norm_method='swan', ref_name_filter=None, units='beta'):
    """
    Load Illumina methylation data for a set of patients, optionally with references.

    :param pids: Patient identifiers, forwarded to ``loader.load_by_patient``.
    :param ref_names: If not None, forwarded to ``loader.load_reference``; the resulting
        reference object is combined with the patient object in a ``MultipleBatchLoader``.
    :param norm_method: Normalisation method forwarded to both loader calls.
    :param ref_name_filter: If not None (and ``ref_names`` is given), the reference object is
        filtered by these sample names with ``exact=True``.
    :param units: If equal to 'm', the data are passed through ``process.m_from_beta``. Any
        other value leaves the data as loaded (presumably beta values - confirm with loader).
    :return: Tuple ``(obj, anno)``: the loader object, whose ``data`` attribute has been replaced
        by the prepared table, and the annotation restricted to the probes shared with that table.
    """
    # patient data, then the EPIC probe annotation
    dataset = loader.load_by_patient(pids, norm_method=norm_method)
    annotation = loader.load_illumina_methylationepic_annotation()

    # reference data: optional, merged with the patient data into one multi-batch object
    if ref_names is not None:
        reference = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            # called for its effect on `reference`; the return value is not used
            reference.filter_by_sample_name(ref_name_filter, exact=True)
        dataset = loader.loader.MultipleBatchLoader([dataset, reference])

    # discard entries with missing values before any unit conversion
    values = dataset.data.dropna()
    if units == 'm':
        values = process.m_from_beta(values)

    # restrict both the data and the annotation to the probes they have in common
    # (dmr.add_merged_probe_classes(anno) was disabled in the original at this point)
    shared_probes = annotation.index.intersection(values.index)
    dataset.data = values.loc[shared_probes]
    return dataset, annotation.loc[shared_probes]
#     'ICb1299_shBMI1CHD7': 'shBMI1shCHD7',
#     'p62_3_shBmi1': 'shBMI1',
#     'p62_3_shChd7': 'shCHD7',
#     'p62_3_shB+C': 'shBMI1shCHD7',
#     'p62_3_Scr': 'scramble',
#
# })
# condition = condition.loc[meta.index]
# meta.insert(0, 'condition', condition)
#
# cell_line = pd.Series('3021', index=meta.index)
# cell_line[cell_line.index.str.contains('1299')] = 'ICb1299'
# cell_line[cell_line.index.str.contains('p62')] = 'ICb1299'
# meta.insert(0, 'cell_line', cell_line)

# Load the EPIC probe annotation, then drop entries with missing values from the
# loaded data and convert the remainder with process.m_from_beta.
anno = loader.load_illumina_methylationepic_annotation()
me_data = process.m_from_beta(obj.data.dropna())

# Restrict the annotation to the probes that are also present in the data.
# NOTE(review): in the lines visible here only `anno` is subset; `me_data` is passed
# to the PCA without being reduced to `common_probes` - confirm this is intended.
common_probes = anno.index.intersection(me_data.index)
anno = anno.loc[common_probes]

# plot PCA: the data are transposed before fitting
p = pca.PCA()
pca_dat = p.fit_transform(me_data.T)
fig = plt.figure()
df_row = df.loc[ix].tolist() if pd.isna(row.UCSC_RefGene_Name): gr = [('', '')] else: g = row.UCSC_RefGene_Name.split(';') r = row.UCSC_RefGene_Group.split(';') gr = sorted(set(zip(g, r))) for t in gr: this_res.append([ix] + df_row + [row.CHR, row.Strand, row.MAPINFO, t[0], t[1]]) return pd.DataFrame(this_res, columns=['probe_id'] + df.columns.tolist() + anno_cols) if __name__ == '__main__': anno = loader.load_illumina_methylationepic_annotation(split_genes=False) # 1. Annotate DMPs and re-export to Excel # dmp_fns = glob(os.path.join(GIT_LFS_DATA_DIR, 'mb_dmp', '*.xlsx')) dmp_fns = glob(os.path.join(os.path.expanduser('~/temp'), '*.xlsx')) print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns), ', '.join(dmp_fns)) outdir = output.unique_output_dir("mb_dmps") res = {} for fn in dmp_fns: base = os.path.splitext(os.path.basename(fn))[0] res[base] = {} dat = pd.read_excel(fn, sheet_name=None)