Пример #1
0
    # GSE61794 (H9-derived NSC x 2), loaded from the 'star' source and
    # annotated by Ensembl Gene ID.
    obj61794 = rnaseq_data.gse61794(source='star',
                                    annotate_by='Ensembl Gene ID')
    # Combine the replicates into one sample: total the per-sample read
    # counts, then replace the metadata with a single-row table describing
    # the merged sample.
    rc = obj61794.meta.read_count.sum()
    obj61794.meta = pd.DataFrame(data={
        'cell_type': 'NSC',
        'srr': 'SRR1586371-2',
        'read_count': rc,
        'sample': 'H9 NSC',
    },
                                 index=['SRR1586371-2'])
    # Sum the data across columns (axis=1) so a single 'H9 NSC' column remains.
    # NOTE(review): the metadata row is labelled 'SRR1586371-2' while the data
    # column is labelled 'H9 NSC' - confirm nothing downstream aligns
    # meta.index against data.columns.
    obj61794.data = pd.DataFrame(obj61794.data.sum(axis=1), columns=['H9 NSC'])

    # WTCHG ALL samples (loaded with include_derived=True).
    objwtchg_all = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID',
                                               include_derived=True)
    # Sample names to retain; the two DURA018 entries are commented out and
    # therefore excluded. Presumably used further down to subset
    # objwtchg_all - that code is not visible here.
    to_keep_wtchg = (
        'GIBCO_NSC_P4',
        # 'DURA018_NSC_N2_P6',
        # 'DURA018_NSC_N4_P4',
        'DURA019_NSC_N8C_P2',
        'DURA030_NSC_N16B6_P1',
        'DURA031_NSC_N44B_P2')

    # rRNA gene IDs, de-duplicated into a set.
    rrna_ensg = set(gtf_reader.get_rrna())

    # Mitochondrial (MT) gene IDs, de-duplicated into a set.
    mt_ensg = set(gtf_reader.get_mitochondrial())

    # combine the data
Пример #2
0
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import rankdata

from load_data import rnaseq_data
from stats import transformations
from utils.output import unique_output_dir
from utils.reference_genomes import ensembl_to_gene_symbol, gene_symbol_to_ensembl

if __name__ == "__main__":
    # Script entry point: load the HGIC data, drop the 'DURA' samples,
    # normalise each sample, and filter out genes with no signal.
    ref = 'GIBCO_NSC_P4'
    outdir = unique_output_dir("tom_qpcr", reuse_empty=True)

    obj = rnaseq_data.all_hgic_loader(annotate_by="Ensembl Gene ID")

    # Rows: keep only features whose ID contains 'ENSG'.
    # Columns: drop every sample whose metadata index label contains 'DURA'.
    is_ensg = obj.data.index.str.contains('ENSG')
    is_dura = obj.meta.index.str.contains('DURA')
    dat = obj.data.loc[is_ensg].loc[:, ~is_dura]

    # Normalised copy: each column is divided by its own total (taken over
    # the retained rows) and scaled by one million.
    column_totals = dat.sum(axis=0)
    dat_n = dat.div(column_totals, axis=1) * 1e6

    # Rank genes by their median normalised value across samples and keep
    # only those whose median is non-zero. Because the medians are sorted
    # first, keep_idx is ordered by ascending median.
    median_count = dat_n.median(axis=1).sort_values()
    has_signal = median_count != 0
    keep_idx = median_count.loc[has_signal].index

    # Apply the same gene selection (and ordering) to all three objects.
    dat, dat_n, median_count = (
        table.loc[keep_idx] for table in (dat, dat_n, median_count)
    )

    # Flag genes whose un-normalised value in the reference sample is
    # below 10, i.e. genes that are (mostly) absent in the NSC reference.
    nsc_missing = dat.loc[:, ref].lt(10.)
Пример #3
0
import os
from load_data import rnaseq_data
import pandas as pd
from rnaseq import filter
from matplotlib import pyplot as plt
from plotting import clustering
from stats.transformations import median_absolute_deviation, variance_stabilizing_transform
import numpy as np
from utils.output import unique_output_dir

if __name__ == "__main__":
    # Script entry point: load the HGIC and GSE61794 (H9) datasets and
    # build one table holding the HGIC samples plus a merged H9 sample.
    outdir = unique_output_dir("compare_gibco_h9")
    loader_hgic = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID')
    loader_h9 = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')

    # Genes present in both datasets, restricted to IDs containing 'ENSG'.
    shared_ids = loader_hgic.data.index.intersection(loader_h9.data.index)
    genes = shared_ids[shared_ids.str.contains('ENSG')]

    # Collapse the H9 replicates into a single sample by summing across
    # columns, then keep only the shared genes.
    h9_label = 'H9_NSC'
    h9_data = loader_h9.data.sum(axis=1).loc[genes].rename(h9_label)

    # Metadata record for the merged H9 sample; read_count is the total
    # over the original replicates.
    h9_fields = {
        'type': 'NSC',
        'read_count': sum(loader_h9.meta.read_count),
        'sample': 'H9_NSC',
        'disease_subgroup': 'control',
    }
    h9_meta = pd.Series(h9_fields, name=h9_label)

    # Append the merged H9 sample as an extra column next to the HGIC data.
    hgic_shared = loader_hgic.data.loc[genes]
    data = pd.concat([hgic_shared, h9_data], axis=1)