# ---- GSE61794: two H9-derived NSC replicates ----
obj61794 = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID', source='star')

# Pool the replicates into a single sample: total the per-run read counts,
# replace the metadata with one row, and sum the counts across columns.
# `rc` is taken before the metadata table is overwritten.
rc = obj61794.meta['read_count'].sum()
pooled_meta = {
    'cell_type': 'NSC',
    'srr': 'SRR1586371-2',
    'read_count': rc,
    'sample': 'H9 NSC',
}
obj61794.meta = pd.DataFrame(data=pooled_meta, index=['SRR1586371-2'])
pooled_counts = obj61794.data.sum(axis=1)
obj61794.data = pd.DataFrame(pooled_counts, columns=['H9 NSC'])

# ---- WTCHG: load every sample, including derived ones ----
objwtchg_all = rnaseq_data.all_hgic_loader(
    include_derived=True,
    annotate_by='Ensembl Gene ID',
)
# Samples to retain from the WTCHG set.
# Currently commented out: 'DURA018_NSC_N2_P6', 'DURA018_NSC_N4_P4'
to_keep_wtchg = (
    'GIBCO_NSC_P4',
    'DURA019_NSC_N8C_P2',
    'DURA030_NSC_N16B6_P1',
    'DURA031_NSC_N44B_P2',
)

# Ensembl IDs of rRNA genes, as a set.
rrna_ensg = set(gtf_reader.get_rrna())

# Ensembl IDs of mitochondrial genes, as a set.
mt_ensg = set(gtf_reader.get_mitochondrial())

# combine the data
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import rankdata

from load_data import rnaseq_data
from stats import transformations
from utils.output import unique_output_dir
from utils.reference_genomes import ensembl_to_gene_symbol, gene_symbol_to_ensembl


if __name__ == "__main__":
    # Output location for this analysis and the name of the NSC reference sample.
    outdir = unique_output_dir("tom_qpcr", reuse_empty=True)
    ref = 'GIBCO_NSC_P4'

    obj = rnaseq_data.all_hgic_loader(annotate_by="Ensembl Gene ID")

    # Keep only rows whose ID contains 'ENSG' and drop every 'DURA' sample.
    # NOTE(review): the column mask is built from obj.meta.index, so this
    # assumes the metadata rows are in the same order as the data columns —
    # TODO confirm.
    is_ensg = obj.data.index.str.contains('ENSG')
    is_dura = obj.meta.index.str.contains('DURA')
    dat = obj.data.loc[is_ensg, ~is_dura]

    # Counts per million: scale each sample by the total of its own column.
    library_size = dat.sum(axis=0)
    dat_n = dat.divide(library_size, axis=1) * 1e6

    # Drop genes whose median normalised count across samples is exactly zero.
    # The medians are sorted ascending, so the filtered tables follow that order.
    median_count = dat_n.median(axis=1).sort_values()
    nonzero_median = median_count != 0
    keep_idx = median_count.index[nonzero_median.values]
    dat_n = dat_n.loc[keep_idx]
    dat = dat.loc[keep_idx]
    median_count = median_count.loc[keep_idx]

    # Flag genes with fewer than 10 raw counts in the NSC reference sample.
    nsc_missing = dat[ref].lt(10.)
import os
from load_data import rnaseq_data
import pandas as pd
from rnaseq import filter
from matplotlib import pyplot as plt
from plotting import clustering
from stats.transformations import median_absolute_deviation, variance_stabilizing_transform
import numpy as np
from utils.output import unique_output_dir


if __name__ == "__main__":
    outdir = unique_output_dir("compare_gibco_h9")

    # Load both datasets, indexed by Ensembl gene ID.
    loader_h9 = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')
    loader_hgic = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID')

    # Genes present in both datasets, restricted to IDs containing 'ENSG'.
    shared_ids = loader_hgic.data.index.intersection(loader_h9.data.index)
    genes = shared_ids[shared_ids.str.contains('ENSG')]

    # Collapse the H9 replicates into one column by summing counts per gene.
    h9_total = loader_h9.data.sum(axis=1)
    h9_data = h9_total.loc[genes].rename('H9_NSC')

    # One metadata record describing the pooled H9 sample.
    h9_record = {
        'type': 'NSC',
        'read_count': sum(loader_h9.meta['read_count']),
        'sample': 'H9_NSC',
        'disease_subgroup': 'control',
    }
    h9_meta = pd.Series(h9_record, name='H9_NSC')

    # Append the pooled H9 column to the hGIC counts for the shared genes.
    hgic_counts = loader_hgic.data.loc[genes]
    data = pd.concat([hgic_counts, h9_data], axis=1)