'method': 'GLM'
    }

    # Patient IDs keyed by subgroup label.
    subgroups = {
        'RTK I': ['019', '030', '031'],
        'RTK II': ['017', '050', '054'],
    }

    def intersecter(x, y):
        """Return the elements present in both ``x`` and ``y`` as a set."""
        return set(x) & set(y)

    def unioner(x, y):
        """Return the elements present in ``x`` or ``y`` as a set."""
        return set(x) | set(y)

    # Every loader below is asked to annotate by the same identifier type.
    loader_kwargs = {'annotate_by': 'Ensembl Gene ID'}

    # Load RNA-Seq from STAR for the requested patient IDs
    rnaseq_obj = rnaseq_data.load_by_patient(pids, **loader_kwargs)

    # Load the additional reference datasets (GSE61794 and GSE38993) and
    # combine them with the patient data in a single multi-batch loader.
    h9_obj = rnaseq_data.gse61794(**loader_kwargs)
    h1_obj = rnaseq_data.gse38993(**loader_kwargs)
    batches = [rnaseq_obj, h1_obj, h9_obj]
    rnaseq_obj = rnaseq_data.MultipleBatchLoader(batches)

    # Discard unmapped etc.: keep only rows whose label contains 'ENSG'.
    gene_rows = rnaseq_obj.data.index.str.contains('ENSG')
    rnaseq_obj.data = rnaseq_obj.data.loc[gene_rows]

    # Drop every sample whose name contains one of these substrings,
    # applying the filters one after the other.
    for unwanted in ('PSC', 'fibroblast'):
        keep = ~rnaseq_obj.meta.index.str.contains(unwanted)
        rnaseq_obj.meta = rnaseq_obj.meta.loc[keep]

    # Keep only the data columns for samples still present in the metadata.
    rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index]

    # load RNA-Seq from Salmon (for normalised comparison)
    # disabled for now
    if False:
        salmon_dat = rnaseq_data.load_salmon_by_patient_id(pids)
        # Strip a trailing '.<digits>' suffix from each index label
        # (presumably the Ensembl version suffix -- TODO confirm).
        # The dot must be escaped: an unescaped '.' matches any character, so
        # a label with no suffix such as 'ENSG00000123456' would be truncated
        # to 'ENS'. regex=True is passed explicitly because recent pandas
        # releases treat the pattern as a literal string by default.
        idx = salmon_dat.index.str.replace(r'\.[0-9]+$', '', regex=True)
        salmon_dat.index = idx
示例#2
0
import os
from load_data import rnaseq_data
import pandas as pd
from rnaseq import filter
from matplotlib import pyplot as plt
from plotting import clustering
from stats.transformations import median_absolute_deviation, variance_stabilizing_transform
import numpy as np
from utils.output import unique_output_dir

if __name__ == "__main__":
    # Script: build one table holding the HGIC samples plus a single
    # collapsed H9 NSC column, restricted to the genes both datasets share.
    outdir = unique_output_dir("compare_gibco_h9")
    loader_hgic = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID')
    loader_h9 = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')

    # Row labels present in both datasets, keeping only those containing 'ENSG'
    common_ids = loader_hgic.data.index.intersection(loader_h9.data.index)
    genes = common_ids[common_ids.str.contains('ENSG')]

    # collapse H9 replicates: sum across columns, then select the shared genes
    h9_label = 'H9_NSC'
    h9_total = loader_h9.data.sum(axis=1)
    h9_data = h9_total.loc[genes]
    h9_data.name = h9_label

    # Single metadata record describing the collapsed H9 sample
    h9_fields = {
        'type': 'NSC',
        'read_count': sum(loader_h9.meta.read_count),
        'sample': h9_label,
        'disease_subgroup': 'control',
    }
    h9_meta = pd.Series(h9_fields, name=h9_label)

    # Column-wise join of the HGIC samples with the collapsed H9 column
    hgic_counts = loader_hgic.data.loc[genes]
    data = pd.concat([hgic_counts, h9_data], axis=1)
示例#3
0
        'SLC1A3': 4170,
    }

    OUTDIR = unique_output_dir("jb.marker_levels", reuse_empty=True)

    # GSE73721 (reference astrocytes, oligos, ...)
    obj73721 = rnaseq_data.gse73721(
        source='star',
        annotate_by='Ensembl Gene ID',
    )

    # Boolean mask over the sample columns: True where the column name
    # contains any of the listed patterns (used to remove unneeded samples).
    cols73721 = obj73721.data.columns
    to_keep73721 = cols73721.str.contains('yo ctx astro')
    for pattern in ('Hippocampus astro', 'oligo'):
        to_keep73721 = to_keep73721 | cols73721.str.contains(pattern)

    # GSE61794 (H9-derived NSC x 2)
    obj61794 = rnaseq_data.gse61794(
        source='star',
        annotate_by='Ensembl Gene ID',
    )

    # Combine the replicates into one sample: total the read counts,
    # replace the metadata with a single row and sum the data across columns.
    merged_srr = 'SRR1586371-2'
    merged_sample = 'H9 NSC'
    rc = obj61794.meta.read_count.sum()
    merged_meta = {
        'cell_type': 'NSC',
        'srr': merged_srr,
        'read_count': rc,
        'sample': merged_sample,
    }
    obj61794.meta = pd.DataFrame(data=merged_meta, index=[merged_srr])
    summed_counts = obj61794.data.sum(axis=1)
    obj61794.data = pd.DataFrame(summed_counts, columns=[merged_sample])

    # WTCHG ALL samples
    wtchg_kwargs = {
        'annotate_by': 'Ensembl Gene ID',
        'include_derived': True,
    }
    objwtchg_all = rnaseq_data.all_hgic_loader(**wtchg_kwargs)
    to_keep_wtchg = (
    }

    def intersecter(x, y):
        """Return the elements present in both ``x`` and ``y`` as a set."""
        return set(x) & set(y)

    def unioner(x, y):
        """Return the elements present in ``x`` or ``y`` as a set."""
        return set(x) | set(y)

    # A worker pool is only created when njob is something other than 1.
    if njob != 1:
        pool = mp.Pool(njob)

    # Load RNA-Seq from STAR for the requested patient IDs
    rnaseq_obj = rnaseq_data.load_by_patient(
        pids,
        annotate_by='Ensembl Gene ID',
    )

    # Load additional references as (label, loader) pairs
    h1_ref = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
    h9_ref = rnaseq_data.gse61794(
        annotate_by='Ensembl Gene ID',
        collapse_replicates=False,
    )
    refs = [('H1', h1_ref), ('H9', h9_ref)]

    # Split the pairs into labels and loaders; 'GIBCO' is appended to the
    # list of reference labels.
    ref_names, ref_loaders = zip(*refs)
    all_refs = list(ref_names) + ['GIBCO']

    # Combine the patient data and the references in one multi-batch loader
    rnaseq_obj = rnaseq_data.MultipleBatchLoader(
        [rnaseq_obj] + list(ref_loaders)
    )

    # only keep gene counts: rows whose label contains 'ENSG'
    gene_rows = rnaseq_obj.data.index.str.contains('ENSG')
    rnaseq_obj.data = rnaseq_obj.data.loc[gene_rows]

    # discard iPSC and NHF1-hTERT samples, one filter after the other
    for unwanted in ('PSC', 'NHF1-hTERT'):
        keep = ~rnaseq_obj.meta.index.str.contains(unwanted)
        rnaseq_obj.meta = rnaseq_obj.meta.loc[keep]
    rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.