# Second batch: one sequencing run directory containing two lane directories.
indir = os.path.join(DATA_DIR, 'rnaseq', 'wtchg_p160704')
lanedirs = [
    os.path.join(indir, d)
    for d in (
        '161219_K00198_0151_BHGYHTBBXX',
        '161222_K00198_0152_AHGYG3BBXX',
    )
]
# Per-lane metadata sheet and count directory, in the same order as lanedirs.
metafiles = [os.path.join(d, 'sources.csv') for d in lanedirs]
countdirs = [os.path.join(d, 'star_alignment_mouse') for d in lanedirs]

obj2 = rnaseq_data.all_samples_multilane_loader(
    countdirs,
    metafiles,
    source='star',
    annotate_by='Ensembl Gene ID',
    samples=('ICb1078', 'ICb1487'),
)
# NOTE: obj1 is not created in this section; it is expected to exist already.
obj = rnaseq_data.MultipleBatchLoader([obj1, obj2])

# Keep only the rows whose index label contains 'ENS'.
data = obj.data.loc[obj.data.index.str.contains('ENS')]

# Scale each sample column by its read_count from the metadata to get
# counts per million, stored as floats.
cpm = (data.divide(obj.meta.read_count, axis=1) * 1e6).astype(float)

# only keep abundant genes: results in around 12,000 remaining genes
keep = (cpm > 1).sum(axis=1) > 5

# Correlation plot of the retained rows (note: `data`, not `cpm`, is plotted).
fig = plt.figure(figsize=(10, 10))
ax = corr.plot_correlation_coefficient_array(
    data.loc[keep],
    vmin=0.6,
    ax=fig.add_subplot(111),
)
plt.setp(ax.xaxis.get_ticklabels(), rotation=90)
fig.tight_layout()

# 2) What are the correlation coeffs when we look at individual lanes?
# Only include the endogenous NSC in the endogenous medium
]) obj52564 = rnaseq_data.gse52564(annotate_by='Ensembl Gene ID', samples=[ 'Astrocyte1', 'Astrocyte2', 'Neuron1', 'Neuron2', 'OPC1', 'OPC2', ]) obj43916 = rnaseq_data.gse43916(annotate_by='Ensembl Gene ID', samples=['NSCs']) obj86248 = rnaseq_data.gse86248(annotate_by='Ensembl Gene ID') obj36114 = rnaseq_data.gse36114(annotate_by='Ensembl Gene ID') obj_all = rnaseq_data.MultipleBatchLoader( [obj, obj64411, obj52564, obj43916, obj86248, obj36114]) data_all = obj_all.data.loc[obj_all.data.index.str.contains('ENS')] meta_all = obj_all.meta cpm_all = data_all.divide(data_all.sum(axis=0), axis=1) * 1e6 keep = (cpm_all > .5).sum(axis=1) > 5 the_dat = np.log2(data_all.loc[keep] + 1) fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111) ax = corr.plot_correlation_coefficient_array(the_dat, vmin=0.4, ax=ax) plt.setp(ax.xaxis.get_ticklabels(), rotation=90) fig.tight_layout() fig.savefig(os.path.join(outdir, 'corr_coeff.png'), dpi=200)
# Patient IDs grouped by subgroup label.
subgroups = {
    'RTK I': ['019', '030', '031'],
    'RTK II': ['017', '050', '054'],
}


def intersecter(x, y):
    """Return the set of elements present in both iterables x and y."""
    return set(x).intersection(y)


def unioner(x, y):
    """Return the set of elements present in either iterable x or y."""
    return set(x).union(y)


# Load RNA-Seq from STAR
rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')

# load additional references if required
h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')
h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj])

# discard unmapped, etc
rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains('ENSG')]

# Drop samples whose name contains 'PSC' or 'fibroblast', then bring the data
# columns back in line with the remaining metadata rows.
rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('PSC')]
rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('fibroblast')]
rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index]

# load RNA-Seq from Salmon (for normalised comparison)
# disabled for now
if False:
    salmon_dat = rnaseq_data.load_salmon_by_patient_id(pids)
    # Strip a trailing '.<digits>' version suffix from each identifier.
    # The dot is escaped so that an identifier without a suffix is left intact
    # (an unescaped '.' would match any character followed by trailing digits),
    # and regex=True is explicit because newer pandas treats the pattern as a
    # literal string by default.
    idx = salmon_dat.index.str.replace(r'\.[0-9]+$', '', regex=True)
    salmon_dat.index = idx
    fn = os.path.join(
        LOCAL_DATA_DIR,
        'reference_genomes',
        'human',
        'ensembl',
        'GRCh38.p10.release90',
        'gene_to_transcript.txt',
    )
# Create a multiprocessing pool unless njob is exactly 1.
if njob != 1:
    pool = mp.Pool(njob)

# Load RNA-Seq from STAR
rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')

# load additional references if required
# Each entry is a (label, loader) pair; H1 is loaded before H9.
refs = [
    ('H1', rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')),
    (
        'H9',
        rnaseq_data.gse61794(
            annotate_by='Ensembl Gene ID',
            collapse_replicates=False,
        ),
    ),
]
# Reference labels, plus 'GIBCO' which has no entry in `refs`.
all_refs = [t[0] for t in refs] + ['GIBCO']
rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj] + [t[1] for t in refs])

# only keep gene counts
rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains('ENSG')]

# discard iPSC and the other unwanted sample types: each pattern in turn
# removes the metadata rows whose index label contains it.
for unwanted in ('PSC', 'NHF1-hTERT', 'fibroblast', 'ESC'):
    rnaseq_obj.meta = rnaseq_obj.meta.loc[
        ~rnaseq_obj.meta.index.str.contains(unwanted)
    ]