Example #1
def load_rnaseq(pids, ref_names, ref_name_filter='NSC', discard_filter='IPSC', strandedness=None):
    """
    :param strandedness: Iterable of same length as ref_names giving the strandedness of each ref
    """
    if strandedness is None:
        strandedness = ['u'] * len(ref_names)
    else:
        if len(strandedness) != len(ref_names):
            raise ValueError("Supplied strandedness must be a list of the same length as the ref_names.")

    # Load RNA-Seq from STAR
    obj = rnaseq_loader.load_by_patient(pids)

    # load additional references
    ref_objs = []
    for rn, strnd in zip(ref_names, strandedness):
        ref_obj = rnaseq_loader.load_references(rn, strandedness=strnd)
        if ref_name_filter is not None:
            # only keep relevant references
            ref_obj.meta = ref_obj.meta.loc[ref_obj.meta.index.str.contains(ref_name_filter)]
            ref_obj.data = ref_obj.data.loc[:, ref_obj.meta.index]
        ref_objs.append(ref_obj)
    obj = loader.MultipleBatchLoader([obj] + ref_objs)

    if discard_filter is not None:
        if not hasattr(discard_filter, '__iter__'):
            discard_filter = [discard_filter]
        for d in discard_filter:
            obj.meta = obj.meta.loc[~obj.meta.index.str.contains(d)]
            obj.data = obj.data.loc[:, obj.meta.index]

    obj.batch_id = obj.batch_id.loc[obj.meta.index]

    return obj
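
# Usage sketch (hypothetical patient and reference names; strandedness must be
# the same length as ref_names, 'u' = unstranded):
#
#   obj = load_rnaseq(
#       pids=['018', '019'],
#       ref_names=['GSE61794'],
#       strandedness=['u'],
#   )
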
def prepare_gct_files_hgic(pids=consts.ALL_PIDS, outdir=None):
    """
    Prepare the GCT files required to perform classification of the hGIC samples:
    - hGIC FFPE
    - hGIC cell culture
    - Both combined
    In all cases, use FPKM units (cufflinks), TPM (salmon) and CPM (STAR).
    Use gene symbols as these are contained in the signatures.
    """
    if outdir is None:
        outdir = output.unique_output_dir()

    infiles = []

    loaded = {}
    for typ in ('cell_culture', 'ffpe'):
        for src in ('star', 'salmon', 'star/cufflinks'):
            this_obj = loader.load_by_patient(pids, type=typ, source=src, include_control=False)
            this_obj.filter_samples(this_obj.meta.type == 'GBM')
            if typ == 'ffpe':
                # restrict to the 'best' versions (there are some duplicates where we tried twice)
                this_obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES_ALL)
            this_dat = reference_genomes.translate_quantification_resolving_duplicates(
                this_obj.data,
                'Ensembl Gene ID',
                'Approved Symbol'
            )
            loaded.setdefault(typ, {})[src] = this_dat
            fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
            gsea.data_to_gct(this_dat, fn)
            infiles.append(fn)

    return infiles
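
# Usage sketch: with the defaults above this writes one GCT file per
# (source, sample type) combination and returns the file paths.
#
#   gct_files = prepare_gct_files_hgic()
#   for fn in gct_files:
#       print(fn)
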
Example #3
from rnaseq import loader, filter, general
from utils import output
import os
import numpy as np

if __name__ == "__main__":
    min_cpm = 1
    obj = loader.load_by_patient('all', type='ffpe')
    samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
    ]

    # remove duplicates
    dat = obj.data.loc[:, samples]
    dat = filter.filter_by_cpm(dat, min_cpm=min_cpm, min_n_samples=1)
    cpm = (dat + 1).divide(dat.sum() + 1, axis=1) * 1e6
    general.add_gene_symbols_to_ensembl_data(cpm)
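
# The +1 offsets above act as pseudocounts so that a later log transform stays
# finite for zero counts: cpm_ij = (n_ij + 1) / (N_j + 1) * 1e6, where N_j is
# the total count in sample j. A hypothetical explicit log2-CPM step (not part
# of the original snippet) would be:
#
#   log_cpm = np.log2((dat + 1).divide(dat.sum() + 1, axis=1) * 1e6)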

    outdir = output.unique_output_dir("ffpe_logcpm_values")
    mtor_geneset = mtor_gs_dict[mtor_source]
    tam_genesets = tam_gs_dict[tam_signature_source]

    genesets = dict(tam_genesets)
    genesets['mTOR'] = mtor_geneset

    subgroups = consts.SUBGROUPS

    subgroups_lookup = {}
    for grp, arr in subgroups.items():
        subgroups_lookup.update(dict([(t, grp) for t in arr]))

    outdir = output.unique_output_dir()

    obj = loader.load_by_patient(consts.PIDS,
                                 type='ffpe',
                                 source='salmon',
                                 include_control=False)
    obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES)
    obj.meta.insert(0, 'patient_id', nh_id_to_patient_id(obj.meta.index))
    obj.meta.insert(1, 'subgroup',
                    [subgroups_lookup[pid] for pid in obj.meta.patient_id])

    rnaseq_dat = obj.data.copy()
    # use gene symbol identifiers
    gs = reference_genomes.ensembl_to_gene_symbol(rnaseq_dat.index).dropna()
    rnaseq_dat = rnaseq_dat.loc[gs.index]
    rnaseq_dat.index = gs.values

    groups = obj.meta.subgroup
    group_list = groups.unique()
    min_cpm_individual = 0.1

    outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya")

    ## 1) STAR CPM estimates

    ss2_obj = loader.load_references('wtchg_p180059', strandedness='u')
    assigned_sum = ss2_obj.data.sum()
    unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum()

    ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    print "SmartSeq2 samples % assigned"
    print ss2_pct_assigned

    polya_obj = loader.load_by_patient(pids)

    # restrict to relevant samples for first part of the analysis
    idx = (polya_obj.meta.type == 'iNSC')
    polya_nsc_meta = polya_obj.meta.loc[idx]
    polya_nsc_data = polya_obj.data.loc[:, polya_nsc_meta.index]
    polya_nsc_unassigned = polya_obj.data_unassigned.loc[:,
                                                         polya_nsc_meta.index]

    assigned_sum = polya_nsc_data.sum()
    unassigned_sum = polya_nsc_unassigned.drop('N_unmapped').sum()

    polya_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    print "Poly(A) samples % assigned"
    print polya_pct_assigned
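
# A quick side-by-side summary of the two library preparations (a sketch using
# only the variables defined above):
#
#   print("Mean %% assigned: SmartSeq2 = %.1f, poly(A) = %.1f" % (
#       ss2_pct_assigned.mean(), polya_pct_assigned.mean()))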
Example #6
DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

de_params = consts.DE_PARAMS
dmr_params = consts.DMR_PARAMS
dmr_params['n_jobs'] = mp.cpu_count()

norm_method_s1 = 'swan'
min_cpm = 1.

min_counts = 10000000

pids = consts.PIDS

# load RNA-Seq data
rna_ff_obj = rnaseq_loader.load_by_patient(pids,
                                           type='ffpe',
                                           source='star',
                                           include_control=False)

# filter FFPE to include only the best samples (NB not actually good!)
rna_ff_obj.filter_samples(
    rna_ff_obj.meta.index.isin(consts.FFPE_RNASEQ_SAMPLES))
rna_ff_obj.batch_id = rna_ff_obj.meta.batch

# add FFPE PID
nh_id = rna_ff_obj.meta.index.str.replace(r'(_?)(DEF|SP).*', '')
p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id]
rna_ff_obj.meta.insert(0, 'nh_id', nh_id)
rna_ff_obj.meta.insert(0, 'patient_id', p_id)
rna_ff_obj.batch_id = rna_ff_obj.meta.batch

# reject samples with low counts
rna_ff_obj.filter_samples(rna_ff_obj.data.sum() > min_counts)

    # compute the hash that keys the pre-computed DMR results
    the_hash = tscdd.dmr_results_hash(meth_obj.meta.index.tolist(),
                                      dmr_hash_dict)
    filename = 'dmr_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DMR_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DMR results from %s", fn)
        dmr_res_s1 = dmr.DmrResultCollection.from_pickle(fn, anno=anno)
    else:
        raise AttributeError(
            "Unable to load pre-computed DMR results, expected at %s" % fn)

    # extract results
    dmr_res_full_s1 = dmr_res_s1.results
    dmr_res_sign_s1 = dmr_res_s1.results_significant

    rnaseq_obj = obj = rnaseq_loader.load_by_patient(pids)

    rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES)

    # only keep the required syngeneic samples for this analysis
    dat_s1 = rnaseq_obj.data
    meta_s1 = rnaseq_obj.meta

    the_hash = tscdd.de_results_hash(meta_s1.index.tolist(), de_params)
    filename = 'de_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DE results from %s", fn)
        with open(fn, 'rb') as f:
            de_res_full_s1 = pickle.load(f)
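
# The block above keys cached DE results on a hash of the sample names plus the
# DE parameters. A minimal sketch of the corresponding save step (assumed, not
# shown in this snippet) under the same filename convention:
#
#   with open(fn, 'wb') as f:
#       pickle.dump(de_res_full_s1, f)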
Example #8
from settings import INTERMEDIATE_DIR

import os
import pickle
import re
import pandas as pd
import statsmodels.formula.api as sm

logger = log.get_console_logger()

if __name__ == '__main__':
    outdir = output.unique_output_dir()
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    rnaseq_obj = loader.load_by_patient(consts.PIDS)
    rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES)

    dat_s1 = rnaseq_obj.data
    meta_s1 = rnaseq_obj.meta.loc[dat_s1.columns]

    the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), consts.DE_PARAMS)
    filename = 'de_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DE results from %s", fn)
        with open(fn, 'rb') as f:
            de_res_full_s1 = pickle.load(f)
    else:
        raise NotImplementedError(
            "Unable to load pre-computed DE results, expected at %s" % fn)

    load_kwds = {'source': source, 'alignment_subdir': SetMe}
    if source == 'salmon':
        units = 'tpm'
        load_kwds['units'] = 'tpm'
    if source == 'star':
        # set strandedness as a cue to import for each
        load_kwds['strandedness'] = SetMe

    # restrict samples manually to avoid changes going forwards
    our_samples = consts.S1_RNASEQ_SAMPLES_INSC + consts.S1_RNASEQ_SAMPLES_IPSC + consts.S1_RNASEQ_SAMPLES_FB + [
        'GIBCO_NSC_P4'
    ]

    # our data (everything)

    obj = loader.load_by_patient(pids, source=source)
    # obj.filter_by_sample_name(our_samples)

    # HipSci data
    hip_obj = loader.hipsci_ipsc(aggregate_to_gene=True)
    hip_obj.meta.insert(3, 'batch', hip_obj.batch_id)
    # hip_obj.meta.insert(3, 'batch', 'HipSci')

    # reduce the number in a (repeatably) random fashion
    rs = np.random.RandomState(
        42)  # set the seed so we always get the same samples
    keep = np.zeros(hip_obj.meta.shape[0]).astype(bool)
    idx = hip_obj.meta.index.tolist()
    rs.shuffle(idx)
    idx = idx[:n_hipsci]
    hip_obj.meta = hip_obj.meta.loc[idx]
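
# Only the metadata is subset here; the matching data columns are restricted in
# the same way later in this listing (see the HipSci block in Example #21):
#
#   hip_obj.data = hip_obj.data.loc[:, idx]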
Example #10
    else:
        raise AttributeError(
            "Unable to load pre-computed DMR results, expected at %s" % fn)

    # extract results
    dmr_res_full_s1 = dmr_res_s1.results
    dmr_res_sign_s1 = dmr_res_s1.results_significant

    # get samples used in each comparison
    dmr_comparison_groups = collections.OrderedDict([(pid, {})
                                                     for pid in consts.PIDS])
    gg = me_data.columns.groupby(zip(me_meta.patient_id, me_meta.type))
    for (pid, typ), samples in gg.items():
        dmr_comparison_groups[pid][typ] = samples

    rnaseq_obj = rnaseq_loader.load_by_patient(
        pids)  # quicker than tscdd method that loads refs too

    # rnaseq_obj = tscdd.load_rnaseq(
    #     pids,
    #     external_ref_names_de,
    #     strandedness=external_ref_strandedness_de,
    # )

    rnaseq_obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES)

    # only keep the required syngeneic samples for this analysis
    dat_s1 = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index.isin(consts.S1_RNASEQ_SAMPLES)]
    meta_s1 = rnaseq_obj.meta.loc[dat_s1.columns]

    row_per_fig = 5

    # load TPM data
    cols_syn_gic = consts.S1_RNASEQ_SAMPLES_GIC
    cols_syn_insc = consts.S1_RNASEQ_SAMPLES_INSC
    cols_ref_nsc = [
        'GIBCO_NSC_P4',
        'H9_NSC_1',
        'H9_NSC_2'
    ]

    outdir = output.unique_output_dir()

    obj1 = loader.load_by_patient(pids, source='salmon', include_control=True)
    obj2 = loader.load_references('GSE61794', source='salmon')
    obj = loader.MultipleBatchLoader([obj1, obj2])
    obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES)

    dat = obj.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat)

    # check that all GOIs are present
    vc = dat['Gene Symbol'].value_counts()
    for g in gois:
        if g not in vc:
            raise KeyError("Gene %s was not found." % g)
        if vc[g] != 1:
            raise AttributeError("Gene %s has %d hits." % (g, vc[g]))
    meta_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm',
                           'primary_tumour/htseq-count_fpkm/sources.csv')
    dat_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm',
                          'primary_tumour/htseq-count_fpkm/fpkm.csv')
    tcga_meta = pd.read_csv(meta_fn, header=0, index_col=0)
    tcga_dat = pd.read_csv(dat_fn, header=0, index_col=0)

    # filter: IDH1 wild-type (primary GBM) samples only
    ix = (tcga_meta.idh1_status == 'WT')
    tcga_meta = tcga_meta[ix]
    tcga_dat = tcga_dat[tcga_meta.index]
    # rescale each sample (column) so it sums to 1e6
    tcga_dat = tcga_dat.divide(tcga_dat.sum(axis=0), axis=1) * 1e6

    # load our data
    gic_obj = loader.load_by_patient(consts.PIDS,
                                     source='salmon',
                                     include_control=False,
                                     type='cell_culture')
    ffpe_obj = loader.load_by_patient(consts.PIDS,
                                      source='salmon',
                                      include_control=False,
                                      type='ffpe')

    # add NH ID and patient ID to FFPE
    nh_id = ffpe_obj.meta.index.str.replace(r'(_?)(DEF|SP).*', '')
    p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id]
    ffpe_obj.meta.insert(0, 'nh_id', nh_id)
    ffpe_obj.meta.insert(0, 'patient_id', p_id)

    # ditto GIC
    gic_obj.meta.insert(
        0, 'patient_id',
        gic_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                       '\g<pid>'))
Example #13
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

from plotting import clustering, common, pca
from rnaseq import loader, filter
from stats import transformations
from utils import output


if __name__ == "__main__":
    eps = 1e-2

    outdir = output.unique_output_dir("export_sb_data")
    obj_star = loader.load_by_patient(['ICb1299', '3021'], source='star', type='cell_culture', include_control=False)
    obj_salmon = loader.load_by_patient(['ICb1299', '3021'], source='salmon', type='cell_culture', include_control=False)

    # cluster plot
    tpm = filter.filter_by_cpm(obj_salmon.data, min_cpm=1, min_n_samples=4)

    batch_colours = common.COLOUR_BREWERS[len(obj_salmon.meta.batch.unique())]
    line_colours = common.COLOUR_BREWERS[2]
    cc = pd.DataFrame(line_colours[0], index=tpm.columns, columns=['Batch', 'Cell line'])

    aa, bb = obj_salmon.meta.batch.factorize()
    for i in range(aa.max() + 1):  # include every batch code produced by factorize
        cc.loc[aa == i, 'Batch'] = batch_colours[i]
    cc.loc[cc.index.str.contains('3021'), 'Cell line'] = line_colours[1]

    cg = clustering.dendrogram_with_colours(
Example #14
        '017', '018', '019', '030', '031', '026', '044', '049', '050', '052',
        '054', '061'
    ]
    if units == 'tpm':
        min_val = 1
        min_n = 4
        eps = .01
    elif units == 'estimated_counts':
        min_val = 10
        min_n = 4
        eps = .01

    if remove_mt:
        mt_ensg = set(gtf_reader.get_mitochondrial())

    patient_obj = loader.load_by_patient(pids, source='salmon', units=units)
    patient_data = patient_obj.data

    # discard GBM and unused 016 iNSC
    patient_data = patient_data.loc[:,
                                    ~patient_data.columns.str.contains('GBM')]
    patient_data = patient_data.drop(
        ['DURA061_NSC_N6_P4', 'DURA061_NSC_N1_P5'], axis=1)

    # discard mitochondrial genes
    if remove_mt:
        idx = ~patient_data.index.isin(mt_ensg)
        pdbg = patient_data.loc[idx]
        # renormalise: after dropping mitochondrial genes the TPM columns no longer sum to 1e6
        if units == 'tpm':
            pdbg = pdbg.divide(pdbg.sum(), axis=1) * 1e6
Example #15
    )
    dedm_indir = os.path.join(
        HGIC_LOCAL_DIR,
        'current/core_pipeline/rnaseq_methylation_combined/s0_individual_patients_direct_comparison/ipa/pathways'
    )

    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    outdir = output.unique_output_dir()

    #######################################################
    # DE
    #######################################################
    # data
    rnaseq_obj = rnaseq_loader.load_by_patient(pids,
                                               include_control=False,
                                               source='star')
    rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES)

    dat_s1 = rnaseq_obj.data
    meta_s1 = rnaseq_obj.meta

    cpm = dat_s1.divide(dat_s1.sum(axis=0), axis=1) * 1e6

    # DE results
    the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params)
    filename = 'de_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DE results from %s", fn)
Example #16
    # load all data
    pids = ['018', '019', '030', '031', '017', '050', '054', '061', '026', '052']
    units = 'tpm'

    out_subdir = os.path.join(outdir, units)
    if not os.path.isdir(out_subdir):
        os.makedirs(out_subdir)
        print "Created output subdirectory %s" % out_subdir

    source_by_units = {
        'tpm': 'salmon',
        'counts': 'star',
        'fpkm': 'star/cufflinks'
    }

    obj = loader.load_by_patient(pids, source=source_by_units[units], include_control=True)

    # set gibco aside
    dat_gibco = obj.data.loc[:, obj.data.columns.str.contains('GIBCO')]
    dat_gibco = ens_index_to_gene_symbol(dat_gibco)

    # drop any cell types other than GBM and iNSC
    ix = obj.meta['type'].isin(['GBM', 'iNSC'])
    # drop unneeded GBM061 samples
    ix = ix & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4']))
    obj.filter_samples(ix)

    # convert to gene symbols
    dat = ens_index_to_gene_symbol(obj.data)

    # load reference dataset(s)
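
# ens_index_to_gene_symbol is not defined in this snippet; a minimal sketch
# based on the gene-symbol conversion used in Example #3 might be:
#
#   def ens_index_to_gene_symbol(df):
#       gs = reference_genomes.ensembl_to_gene_symbol(df.index).dropna()
#       out = df.loc[gs.index].copy()
#       out.index = gs.values
#       return out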
Example #17
    elif type == 'csv':
        save_func = lambda x: x.to_csv
    else:
        raise NotImplementedError("Unsupported type %s." % type)

    save_func(dat)(fn)


if __name__ == "__main__":
    outdir = output.unique_output_dir()
    keep_samples = consts.S1_RNASEQ_SAMPLES_FB + consts.S1_RNASEQ_SAMPLES_INSC + consts.S1_RNASEQ_SAMPLES_GIC + \
        consts.S1_RNASEQ_SAMPLES_IPSC + consts.S1_RNASEQ_SAMPLES_IAPC

    star_cc_obj = loader.load_by_patient(
        consts.PIDS,
        type='cell_culture',
        source='star',
        include_control=True,
    )
    ix = star_cc_obj.meta.index.isin(keep_samples)
    star_cc_obj.filter_samples(ix)

    salmon_cc_obj = loader.load_by_patient(
        consts.PIDS,
        type='cell_culture',
        source='salmon',
        include_control=True,
    )
    ix = salmon_cc_obj.meta.index.isin(keep_samples)
    salmon_cc_obj.filter_samples(ix)

    # FFPE data, loaded in the same way as the cell culture objects above
    star_ff_obj = loader.load_by_patient(
        consts.PIDS,
        type='ffpe',
        source='star',
        include_control=False,
    )

    quantile = 0.99

    # dimensions (components) to investigate
    dims = [0, 1, 2]

    selection_radii_for_plotting = {0: 0.6, 1: 0.30, 2: 0.25}

    # path to syngeneic DE results
    fn_de_res = os.path.join(HGIC_LOCAL_DIR, 'current/core_pipeline/rnaseq/',
                             'full_de_syngeneic_only.xlsx')

    # load DE results
    de_res = pd.read_excel(fn_de_res, index_col=0)

    # load data for iNSC and GBM (Salmon TPM)
    obj = loader.load_by_patient(consts.PIDS, source='salmon')
    obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES_GIC +
                              consts.S1_RNASEQ_SAMPLES_INSC)
    # parse the three-digit patient ID from sample names such as GBM018_* / DURA018_*
    obj.meta.insert(
        0, 'patient_id',
        obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                   '\g<pid>'))

    cmap = common.get_best_cmap(len(consts.PIDS))
    scatter_colours = dict(zip(consts.PIDS, cmap))

    scatter_markers = {'GBM': 's', 'iNSC': 'o'}

    # scaling parameter applied during SVD
    scale_preserved = 0.05
Example #19
    de_params = consts.DE_PARAMS
    dmr_params = consts.DMR_PARAMS
    dmr_params['n_jobs'] = mp.cpu_count()

    # file location
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    # boilerplate
    outdir = output.unique_output_dir()
    logger = log.get_console_logger()

    ## DE

    # load data (ours only, no references)
    rnaseq_obj = rnaseq_loader.load_by_patient(pids, include_control=False)
    rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES)

    dat_s1 = rnaseq_obj.data
    meta_s1 = rnaseq_obj.meta

    the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params)
    filename = 'de_results_s1_cross_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 cross-comparison DE results from %s", fn)
        with open(fn, 'rb') as f:
            de_res = pickle.load(f)
    else:
        groups = pd.Series(index=meta_s1.index)
Example #20
import os

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

import hgic_consts
from plotting import clustering
from rnaseq import loader
from stats import transformations
from utils import output, reference_genomes

if __name__ == '__main__':
    outdir = output.unique_output_dir("cluster_gic_ffpe")
    pids = ['018', '019', '031', '017', '050', '054']
    obj_ffpe = loader.load_by_patient(pids, type='ffpe', include_control=False)
    obj_gic = loader.load_by_patient(pids,
                                     type='cell_culture',
                                     include_control=False)
    obj = loader.loader.MultipleBatchLoader([obj_ffpe, obj_gic])

    # drop iNSC, iPSC
    obj.meta = obj.meta.loc[~obj.meta.index.str.contains('DURA')]
    obj.data = obj.data.loc[:, obj.meta.index]

    # relabel the FFPE samples
    idx = obj.meta.index.tolist()
    for k, v in hgic_consts.NH_ID_TO_PATIENT_ID_MAP.items():
        for i, t in enumerate(idx):
            if k.replace('-', '_') in t:
                idx[i] = "FFPE GBM%s" % v
Example #21
    # to_aggr_nsc = [
    #     (r'H9_NSC_[12]', 'H9 NSC'),
    #     # (r'Pollard NSC [12]', 'Fetal NSC'),
    # ]

    outdir = output.unique_output_dir("assess_reprogramming_de")

    load_kwds = {
        'source': 'star',
        'alignment_subdir': SetMe,
        'strandedness': SetMe,
    }

    # our data (everything)
    obj = loader.load_by_patient(pids, source='star')

    # HipSci data
    hip_obj = loader.hipsci_ipsc(aggregate_to_gene=True)
    hip_obj.meta.insert(3, 'batch', hip_obj.batch_id)
    # hip_obj.meta.insert(3, 'batch', 'HipSci')

    # reduce the number in a (repeatably) random fashion
    rs = np.random.RandomState(
        42)  # set the seed so we always get the same samples
    keep = np.zeros(hip_obj.meta.shape[0]).astype(bool)
    idx = hip_obj.meta.index.tolist()
    rs.shuffle(idx)
    idx = idx[:n_hipsci]
    hip_obj.meta = hip_obj.meta.loc[idx]
    hip_obj.data = hip_obj.data.loc[:, idx]
Example #22
    dat = dat + offset
    if len(dat.shape) == 2:
        # DataFrame: normalise each sample (column) to counts per million
        cpm = dat.divide(dat.sum(), axis=1) * 1e6
    else:
        # Series: a single sample, divide by its total
        cpm = dat.divide(dat.sum()) * 1e6
    # change of base: log_b(x) = ln(x) / ln(b)
    return np.log(cpm) / np.log(base)


if __name__ == '__main__':
    min_cpm = 0.01

    outdir = output.unique_output_dir("biological_technical_ecdf")

    # all our patient data (cell culture)

    our_patient_obj = loader.load_by_patient('all', source='star')

    # all our patient data (FFPE culture)
    ffpe_samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
Example #23
    @property
    def kde_func(self):
        return eval_one_kde_gaussian

    def run_normalisation(self, njob=None):
        # rank each gene across samples (axis=1) and scale to a fractional rank,
        # then rank those fractions across genes within each sample (axis=0)
        self.Fr = self.X.rank(axis=1, method='average') / float(self.n)
        self.z_ij = self.Fr.rank(axis=0, method='average')

    def plot_kde_one_gene(self, gene, n_annot=10, gene_ttl=None):
        raise NotImplementedError("TODO: refactor this part")



if __name__ == "__main__":
    outdir = output.unique_output_dir()
    obj = loader.load_by_patient(['018', '019', '030', '031'], source='star')
    X = obj.data
    r = 0.5  # offset for Poisson kernels
    tau = 1.  # weighting in KS statistic

    # arbitrary gene set: GBM signalling

    gk = [
        'ENSG00000077782',
        'ENSG00000133056',
        'ENSG00000136158',
        'ENSG00000110092',
        'ENSG00000169032',
        'ENSG00000139687',
        'ENSG00000171862',
        'ENSG00000140443',
    ]

    dmr_hash_dict['norm_method'] = norm_method_s1

    # load DMR results
    the_hash = tsgd.dmr_results_hash(meth_obj.meta.index.tolist(),
                                     dmr_hash_dict)
    filename = 'dmr_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DMR_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Loading pre-computed DMR results from %s", fn)
        dmr_res_s1 = dmr.DmrResultCollection.from_pickle(fn, anno=anno)
    else:
        raise Exception("Unable to locate pre-existing results.")

    # load DE results
    rnaseq_obj = rnaseq_loader.load_by_patient(consts.PIDS,
                                               include_control=False)
    rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES)

    the_hash = tsgd.de_results_hash(rnaseq_obj.meta.index.tolist(), de_params)
    filename = 'de_results_paired_comparison.%d.pkl' % the_hash
    fn = os.path.join(DE_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Reading S1 DE results from %s", fn)
        with open(fn, 'rb') as f:
            de_res_s1 = pickle.load(f)
    else:
        raise Exception("Unable to locate pre-existing DE results.")

    the_hash = tsgd.dmr_results_hash(meth_obj.meta.index.tolist(),
                                     dmr_hash_dict)
Example #25
        'CAB39L', 'STRADA', 'RICTOR', 'EIF4E1B', 'TSC1'
    ]

    # which list should we use?
    list_name = 'S2'
    # list_name = 'S4'

    if list_name == 'S2':
        list_cols = ['MG', 'BMDM']
    elif list_name == 'S4':
        list_cols = ['TAM MG', 'TAM BMDM', 'Core MG', 'Core BMDM']
    else:
        raise NotImplementedError("Unrecognised list: %s" % list_name)

    # load FFPE RNA-Seq data
    obj_ff = loader.load_by_patient('all', source='salmon', type='ffpe', include_control=False)

    # add patient identifiers
    nh_id = obj_ff.meta.index.str.replace(r'(_?)(DEF|SP).*', '')
    p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id]
    obj_ff.meta.insert(0, 'nh_id', nh_id)
    obj_ff.meta.insert(0, 'patient_id', p_id)

    # switch to gene symbols
    gs = reference_genomes.ensembl_to_gene_symbol(obj_ff.data.index)
    gs = gs.loc[~gs.index.duplicated()]
    the_ix = np.array(obj_ff.data.index, copy=True)
    the_ix[~gs.isnull().values] = gs.values[~gs.isnull()]
    ffpe_dat = obj_ff.data.copy()
    ffpe_dat.index = the_ix
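
    # the_ix now holds the gene symbol wherever one was found and the original
    # Ensembl ID otherwise, so no rows are lost in the relabelling above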
Example #26
    subgroup_set_colours = {
        'RTK I full': '#0d680f',
        'RTK II full': '#820505',
        'MES full': '#7900ad',
        'RTK I partial': '#6ecc70',
        'RTK II partial': '#d67373',
        'MES partial': '#cc88ea',
        'mixed': '#4C72B0',
        'specific': '#f4e842',
    }

    min_cpm = 1

    outdir = output.unique_output_dir("compare_de_gene_counts_s1",
                                      reuse_empty=True)
    obj = loader.load_by_patient(pids, include_control=False)

    # remove IPSC and rejected 061 samples for good
    idx = ((~obj.meta.index.str.contains('IPSC'))
           &
           (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4'])))
    obj.meta = obj.meta.loc[idx]
    obj.data = obj.data.loc[:, idx]
    obj.batch_id = obj.batch_id.loc[idx]

    # we'll run everything with two different edgeR tests

    methods = ('GLM', 'QLGLM')

    res_1 = {}
    res_2 = {}

from stats import nht
from utils import output, setops, reference_genomes

if __name__ == "__main__":
    outdir = output.unique_output_dir()
    pids = consts.PIDS
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    eps = .1  # offset for log transform

    target_gene = 'CD274'
    target_ens = reference_genomes.gene_symbol_to_ensembl(target_gene)

    # load Salmon data

    obj_cc = loader.load_by_patient(pids, source='salmon')
    ix = obj_cc.meta.index.isin(consts.S1_RNASEQ_SAMPLES)
    obj_cc.filter_samples(ix)

    # add PID to cell culture metadata
    obj_cc.meta.insert(
        0, 'pid',
        obj_cc.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                      '\g<pid>'))

    obj_ff = loader.load_by_patient(pids, source='salmon', type='ffpe')
    obj_ff.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES)

    # add PID to FFPE metadata
    nh_id = obj_ff.meta.index.str.replace(r'(_?)(DEF|SP).*', '')
    p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id]