Пример #1
0
# Root directory of the 'wtchg_p160704' data and the two run directories in it.
indir = os.path.join(DATA_DIR, 'rnaseq', 'wtchg_p160704')
lanedirs = [
    os.path.join(indir, run_name)
    for run_name in (
        '161219_K00198_0151_BHGYHTBBXX',
        '161222_K00198_0152_AHGYG3BBXX',
    )
]

# Every run directory contributes one metadata file and one count directory.
metafiles = [os.path.join(d, 'sources.csv') for d in lanedirs]
countdirs = [os.path.join(d, 'star_alignment_mouse') for d in lanedirs]

# Load only the two named samples, combining the per-directory inputs.
obj2 = rnaseq_data.all_samples_multilane_loader(
    countdirs,
    metafiles,
    source='star',
    annotate_by='Ensembl Gene ID',
    samples=('ICb1078', 'ICb1487')
)

# Combine both loaders into a single object exposing `.data` and `.meta`.
obj = rnaseq_data.MultipleBatchLoader([obj1, obj2])

# Restrict to rows whose index label contains 'ENS'.
ens_rows = obj.data.index.str.contains('ENS')
data = obj.data.loc[ens_rows]

# Counts per million: each column is divided by the matching `read_count`
# entry from the metadata, then scaled by 1e6.
cpm = (data.divide(obj.meta.read_count, axis=1) * 1e6).astype(float)

# only keep abundant genes: results in around 12,000 remaining genes
# (a row is kept when more than 5 columns have CPM > 1)
n_above_threshold = (cpm > 1).sum(axis=1)
keep = n_above_threshold > 5

# NOTE(review): the plot is drawn from the raw values in `data`, not `cpm`.
fig, ax = plt.subplots(figsize=(10, 10))
ax = corr.plot_correlation_coefficient_array(data.loc[keep], vmin=0.6, ax=ax)
for tick_label in ax.xaxis.get_ticklabels():
    tick_label.set_rotation(90)
fig.tight_layout()

# 2) What are the correlation coeffs when we look at individual lanes?
# Only include the endogenous NSC in the endogenous medium
Пример #2
0
                                    ])
    # Reference loaders, all annotated by Ensembl gene ID.  The first two are
    # restricted to an explicit list of sample names.
    samples_52564 = [
        'Astrocyte1', 'Astrocyte2',
        'Neuron1', 'Neuron2',
        'OPC1', 'OPC2',
    ]
    obj52564 = rnaseq_data.gse52564(
        annotate_by='Ensembl Gene ID',
        samples=samples_52564
    )
    obj43916 = rnaseq_data.gse43916(
        annotate_by='Ensembl Gene ID',
        samples=['NSCs']
    )
    obj86248 = rnaseq_data.gse86248(annotate_by='Ensembl Gene ID')
    obj36114 = rnaseq_data.gse36114(annotate_by='Ensembl Gene ID')

    # Combine every loader (including the two created earlier) into one batch.
    all_batches = [obj, obj64411, obj52564, obj43916, obj86248, obj36114]
    obj_all = rnaseq_data.MultipleBatchLoader(all_batches)

    # Keep only rows whose index label contains 'ENS'.
    ens_rows = obj_all.data.index.str.contains('ENS')
    data_all = obj_all.data.loc[ens_rows]
    meta_all = obj_all.meta

    # Counts per million, using each column's own total as the denominator.
    column_totals = data_all.sum(axis=0)
    cpm_all = data_all.divide(column_totals, axis=1) * 1e6

    # A row is retained when more than 5 columns have CPM above 0.5.
    n_above_threshold = (cpm_all > .5).sum(axis=1)
    keep = n_above_threshold > 5

    # log2(value + 1) of the retained rows is what gets correlated.
    the_dat = np.log2(data_all.loc[keep] + 1)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax = corr.plot_correlation_coefficient_array(the_dat, vmin=0.4, ax=ax)
    for tick_label in ax.xaxis.get_ticklabels():
        tick_label.set_rotation(90)
    fig.tight_layout()

    corr_png = os.path.join(outdir, 'corr_coeff.png')
    fig.savefig(corr_png, dpi=200)
    # Subgroup label -> list of ID strings
    # (presumably patient IDs; confirm against `pids`).
    subgroups = {
        'RTK I': ['019', '030', '031'],
        'RTK II': ['017', '050', '054'],
    }

    def intersecter(x, y):
        """Return the elements common to iterables *x* and *y* as a set."""
        return set(x) & set(y)

    def unioner(x, y):
        """Return every element found in iterable *x* or *y* as a set."""
        return set(x) | set(y)

    # Load RNA-Seq from STAR
    rnaseq_obj = rnaseq_data.load_by_patient(
        pids,
        annotate_by='Ensembl Gene ID'
    )

    # load additional references if required
    h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')
    h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
    batches = [rnaseq_obj, h1_obj, h9_obj]
    rnaseq_obj = rnaseq_data.MultipleBatchLoader(batches)

    # discard unmapped, etc: keep only rows whose label contains 'ENSG'
    gene_rows = rnaseq_obj.data.index.str.contains('ENSG')
    rnaseq_obj.data = rnaseq_obj.data.loc[gene_rows]

    # Drop samples whose name matches any of these patterns, one at a time.
    for excluded_pattern in ('PSC', 'fibroblast'):
        excluded_rows = rnaseq_obj.meta.index.str.contains(excluded_pattern)
        rnaseq_obj.meta = rnaseq_obj.meta.loc[~excluded_rows]

    # Re-align the data columns with the samples that remain in the metadata.
    rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index]

    # load RNA-Seq from Salmon (for normalised comparison)
    # disabled for now
    if False:
        salmon_dat = rnaseq_data.load_salmon_by_patient_id(pids)
        # Strip a trailing '.<digits>' version suffix from each index label.
        # The dot is escaped so that only a literal '.' is matched: unescaped,
        # it matches any character and would truncate a label that has no
        # suffix (e.g. 'ENST00000456328' -> 'ENS').  regex=True is explicit
        # because pandas >= 2.0 treats the pattern as a literal string by
        # default, which would make this replacement a silent no-op.
        idx = salmon_dat.index.str.replace(r'\.[0-9]+$', '', regex=True)
        salmon_dat.index = idx
        # NOTE(review): `fn` is not used within the visible part of this branch.
        fn = os.path.join(LOCAL_DATA_DIR, 'reference_genomes', 'human', 'ensembl', 'GRCh38.p10.release90',
                          'gene_to_transcript.txt')
    # A worker pool is only created when more than the single-job setting is
    # requested; `pool` is not assigned here when njob == 1.
    if njob != 1:
        pool = mp.Pool(njob)

    # Load RNA-Seq from STAR
    rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')

    # load additional references if required
    h1_ref = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
    h9_ref = rnaseq_data.gse61794(
        annotate_by='Ensembl Gene ID',
        collapse_replicates=False
    )
    # (label, loader) pairs for the external references.
    refs = [('H1', h1_ref), ('H9', h9_ref)]
    # Reference labels; 'GIBCO' is listed without a loader in `refs`.
    all_refs = [ref_label for ref_label, _ in refs] + ['GIBCO']

    # Combine the patient data with every reference loader.
    ref_loaders = [ref_loader for _, ref_loader in refs]
    rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj] + ref_loaders)

    # only keep gene counts (rows whose label contains 'ENSG')
    gene_rows = rnaseq_obj.data.index.str.contains('ENSG')
    rnaseq_obj.data = rnaseq_obj.data.loc[gene_rows]

    # discard iPSC and the other sample groups named below: each pattern is
    # applied in turn to the sample names that survived the previous one.
    for excluded_pattern in ('PSC', 'NHF1-hTERT', 'fibroblast', 'ESC'):
        excluded_rows = rnaseq_obj.meta.index.str.contains(excluded_pattern)
        rnaseq_obj.meta = rnaseq_obj.meta.loc[~excluded_rows]