Example #1
0
    def __init__(self, tax_id=9606, genome_version='GRCh37', gtf_fn=None):
        self.tax_id = tax_id
        self.genome_version = genome_version
        if gtf_fn is None:
            gtf_fn = genomics.get_reference_genome_gtf(tax_id,
                                                       version=genome_version)
        self.gtf_fn = gtf_fn
        self.db = genomics.GtfAnnotation(gtf_fn)
        self.mdat = None
        self.dmr_res = None
        self.anno = None
        self.de_res = None
        self.dmr_comparison_groups = None
        self.logger = log.get_console_logger(self.__class__.__name__)

        # default plotting parameters
        self.colours = None
        self.markers = None
        self.zorder = None
        self.alpha = None
        self.size = None
        self.fig_kws = {}
        self.m_plot_kws = {}

        self.de_direction_colour = None
        self.dm_direction_colour = None
        self.dm_vmin = self.dm_vmax = None
        self.de_vmin = self.de_vmax = None

        self.set_plot_parameters()
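
A minimal instantiation sketch; the enclosing class name used below (DmrDePlotter) is a hypothetical stand-in, since only its __init__ is shown above, and the attached result objects are placeholders for upstream analysis outputs.

# Hypothetical usage sketch: DmrDePlotter is an assumed name for the class above.
obj = DmrDePlotter(tax_id=9606, genome_version='GRCh37')
obj.dmr_res = my_dmr_results  # hypothetical upstream DMR results
obj.de_res = my_de_results    # hypothetical upstream DE results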
Example #2
0
    def __init__(self,
                 tax_id=9606,
                 logger=None,
                 force_update=False,
                 go_dir=DEFAULT_GO_DIR,
                 bg_genes=None):
        # gene_converter can be used to enable automatic gene conversion
        self.gene_converter = None
        self.logger = logger or log.get_console_logger(self.__class__.__name__)
        self.tax_id = tax_id
        if not os.path.isdir(go_dir):
            self.logger.warn("Creating master GO directory at %s.", go_dir)
            os.makedirs(go_dir)
        else:
            self.logger.info("Using existing GO directory at %s.", go_dir)
        self.base_dir = go_dir

        # get filenames and parse both GAF and OBO
        self.obo_fn = self.check_and_get_obo(force_update=force_update)
        self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
        self.obo = obo_parser.GODag(self.obo_fn)

        self.gaf = associations.read_ncbi_gene2go(self.gaf_fn,
                                                  taxids=[self.tax_id])
        self.logger.info("{N:,} annotated human genes".format(N=len(self.gaf)))

        self.bg_genes = bg_genes
        if self.bg_genes is not None:
            self.set_bg_genes(bg_genes)
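
A usage sketch; the enclosing class name used below (GOAnalysis) is a hypothetical stand-in, since only its __init__ is shown above, and the background gene list is a placeholder.

# Hypothetical usage sketch: GOAnalysis is an assumed name for the class above.
go = GOAnalysis(tax_id=9606, force_update=False, bg_genes=background_gene_ids)
# background_gene_ids is a hypothetical iterable of background gene identifiers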
Example #3
0
import numpy as np

from utils.log import get_console_logger


def yugene_transform(marray_data, resolve_ties=True):
    """
    Apply the YuGene transform to the supplied data.
    Le Cao, Kim-Anh, Florian Rohart, Leo McHugh, Othmar Korn, and Christine A. Wells.
    "YuGene: A Simple Approach to Scale Gene Expression Data Derived from Different Platforms for Integrated Analyses."
    Genomics 103, no. 4 (April 2014): 239-51. doi:10.1016/j.ygeno.2014.03.001.
    Assume the data are supplied with samples in columns and genes in rows
    :param resolve_ties: If True (default), replace all tied values with the mean. This is especially significant at
    low count values, which are often highly degenerate.
    """
    logger = get_console_logger(__name__)

    res = marray_data.copy()
    # add columnwise offset to ensure all positive values
    colmin = res.min(axis=0)
    neg_warn = False
    for i in np.where(colmin < 0)[0]:
        res.iloc[:, i] -= colmin.iloc[i]
        neg_warn = True
    if neg_warn:
        logger.warning(
            "Data contained negative values. Columnwise shift applied to correct this."
        )

    for t in marray_data.columns:
        col = res.loc[:, t].sort_values(ascending=False)
        cs = col.cumsum()
        s = col.sum()
        # numerical error: the final value in cumsum() may not equal the sum
        if cs.iloc[-1] != s:
            cs[cs == cs.iloc[-1]] = s
        a = 1 - cs / s

        if resolve_ties:
            # FIXME: this is tediously slow; can definitely improve it!
            # find tied values in the input data
            tied = np.unique(col.loc[col.duplicated()].values)
            if tied.size > 1:
                logger.info("Resolving %d ties in column %s.", tied.size - 1,
                            t)
                for i in tied[tied > 0]:
                    a[col == i] = a[col == i].mean()
            else:
                logger.info("No ties to resolve in column %s.", t)

        res.loc[a.index, t] = a

    # a numerical error in cumsum() may result in some small negative values. Zero these.
    res[res < 0] = 0.

    # colmin = res.min(axis=0)
    # colmin[colmin >= 0] = 0.
    # res = res.subtract(colmin, axis=1)

    return res
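
A small usage sketch on toy data; the toy matrix, its labels, and the random seed are made up for illustration, and the function is assumed to be importable alongside pandas and numpy.

import numpy as np
import pandas as pd

# toy expression matrix: genes in rows, samples in columns (values are arbitrary)
toy = pd.DataFrame(
    np.random.RandomState(42).poisson(5, size=(100, 3)).astype(float),
    index=["gene_%d" % i for i in range(100)],
    columns=["sample_A", "sample_B", "sample_C"],
)
yg = yugene_transform(toy)
# transformed values lie in [0, 1], with the most highly expressed gene closest to 1
assert ((yg >= 0) & (yg <= 1)).all().all()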
Example #4
0
    def __init__(
            self,
            base_dir=None,
            meta_fn=None,
            samples=None,
            tax_id=9606,
            batch_id=None,
            verbose=True,
            *args,
            **kwargs):

        """
        Base class for loading a dataset.
        :param base_dir: Path to the root input directory. All data must be contained in this directory
        or below it.
        :param meta_fn: Path to the meta file.
        :param samples: If supplied, use this to filter the files loaded.
        :param tax_id: The taxonomy ID (default: 9606, human)
        :param batch_id: Optionally supply a name for this batch, useful when combining batches
        """
        self.base_dir = base_dir
        if self.base_dir is None or not os.path.isdir(self.base_dir):
            raise ValueError("Supplied base_dir %s does not exist or is not a directory." % self.base_dir)

        self.meta_fn = meta_fn
        if self.meta_fn is not None:
            if not os.path.isfile(self.meta_fn):
                raise ValueError("Meta file %s does not exist." % self.meta_fn)

        self.meta_is_linked = None
        self.sample_names = None

        self.samples_to_keep = samples
        self.tax_id = tax_id
        self.batch_id = batch_id
        self.verbose = verbose

        self.logger = log.get_console_logger(self.__class__.__name__)

        self.meta = None
        self.input_files = None
        self.data = None

        self.load_meta()

        self.get_inputs()
        self.load_data()
        self.post_process()
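
A sketch of the contract implied by the constructor above: it calls load_meta, get_inputs, load_data and post_process, so concrete loaders are expected to provide them. The base class name (DatasetLoader), the file-naming conventions and the method bodies below are illustrative assumptions, not the project's real subclasses.

import glob
import os

import pandas as pd


# Illustrative subclass sketch; DatasetLoader stands in for the unnamed base class above.
class MyCountsLoader(DatasetLoader):
    def load_meta(self):
        if self.meta_fn is not None:
            self.meta = pd.read_csv(self.meta_fn, header=0, index_col=0)
            self.meta_is_linked = True

    def get_inputs(self):
        # e.g. per-sample count files located beneath base_dir (extension assumed)
        self.input_files = glob.glob(os.path.join(self.base_dir, '*.count'))

    def load_data(self):
        # combine per-sample count files into one genes-by-samples DataFrame
        per_sample = []
        for fn in self.input_files:
            sample = os.path.basename(fn).replace('.count', '')
            per_sample.append(
                pd.read_csv(fn, sep='\t', header=None, index_col=0, names=['gene_id', sample])
            )
        self.data = pd.concat(per_sample, axis=1)

    def post_process(self):
        if self.samples_to_keep is not None:
            self.data = self.data.loc[:, self.samples_to_keep]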
Example #5
0
    def __init__(
            self,
            data_fn=None,
            meta_fn=None,
            samples=None,
            tax_id=9606,
            batch_id=None,
            verbose=True,
            *args,
            **kwargs):

        """
        Base class for loading a dataset based on a single data file.
        :param data_fn: Path to the file containing all the data.
        :param meta_fn: Path to the meta file.
        :param samples: If supplied, use this to filter the files loaded.
        :param tax_id: The taxonomy ID (default: 9606, human)
        :param batch_id: Optionally supply a name for this batch, useful when combining batches
        """
        self.data_fn = data_fn
        if self.data_fn is None:
            raise ValueError("Must supply a valid path to the data file.")
        elif not os.path.isfile(self.data_fn):
            raise ValueError("Data file %s does not exist." % self.data_fn)

        self.meta_fn = meta_fn
        if self.meta_fn is not None:
            if not os.path.isfile(self.meta_fn):
                raise ValueError("Meta file %s does not exist." % self.meta_fn)


        self.meta_is_linked = None
        self.sample_names = None

        self.samples_to_keep = samples
        self.tax_id = tax_id
        self.batch_id = batch_id
        self.verbose = verbose

        self.logger = log.get_console_logger(self.__class__.__name__)

        self.meta = None
        self.input_files = data_fn
        self.data = None

        self.load_meta()
        self.load_data()
        self.post_process()
Example #6
0
    def __init__(self, clear_existing=True):
        self.logger = log.get_console_logger(self.__class__.__name__)

        # functional API - the python bindings are incomplete here?
        self.cy = CyRestClient()

        if clear_existing:
            # reset the session (in case something is already loaded)
            self.cy.session.delete()

        # command API - the python bindings are much better
        self.cy_cmd = cyrest.cyclient()

        # collections added to the session
        self.name_to_id = {}
        self.collections = {}
        self.auto_net_name = 1
Example #7
0
import datetime
import glob
import os

from utils import log

# NB: datestr_fmt (the strftime pattern used to version downloaded files) is
# assumed to be defined elsewhere in this module.
def check_and_get_file(root_dir,
                       ext,
                       get_func,
                       force_update=False,
                       logger=None):
    """
    Check for a file with a given extension in the supplied dir.
    :param root_dir:
    :param ext:
    :param get_fn: Function handle that, when called with no args, fetches and returns the data
    :param force_update: If True, don't use existing file but instead force a new get call.
    :return: String, giving the filename of the data
    """
    if logger is None:
        logger = log.get_console_logger("check_and_get_file")

    if not os.path.isdir(root_dir):
        logger.warn("Creating %s directory at %s.", ext, root_dir)
        os.makedirs(root_dir)
    flist = glob.glob(os.path.join(root_dir, "*.%s" % ext))
    files_seen = {}
    for full_fn in flist:
        fn = os.path.split(full_fn)[1]
        try:
            d = datetime.datetime.strptime(fn, "%s.%s" % (datestr_fmt, ext))
        except ValueError:
            logger.warn("Failed to parse version of %s file %s. Skipping.",
                        ext, full_fn)
        else:
            files_seen[d] = full_fn
    if force_update or len(files_seen) == 0:
        fn_out = os.path.join(
            root_dir,
            "%s.%s" % (datetime.date.today().strftime(datestr_fmt), ext))
        dat = get_func()
        with open(fn_out, 'wb') as fout:
            fout.write(dat)
        logger.info("Downloaded new %s file and saved it at %s", ext, fn_out)
    else:
        latest_date = max(files_seen.keys())
        logger.info("Using existing %s file %s.", ext, files_seen[latest_date])
        fn_out = files_seen[latest_date]

    return fn_out
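
A usage sketch, assuming the module-level datestr_fmt pattern is defined and using a simple HTTP fetch as the get_func; the URL and directory below are placeholders.

# Hypothetical usage sketch: the URL and directory are placeholders.
import requests

def fetch_obo():
    r = requests.get("http://example.org/go-basic.obo")
    r.raise_for_status()
    return r.content

obo_fn = check_and_get_file("/path/to/go_dir", "obo", fetch_obo, force_update=False)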
Example #8
0
import glob
import os
import re

import pandas as pd

from load_data import loader
from rnaseq.general import ensembl_transcript_quant_to_gene
from settings import RNASEQ_DIR
from utils.log import get_console_logger

logger = get_console_logger(__name__)


INDEX_FIELDS = (
    'Approved Symbol',
    'Entrez Gene ID',
    'RefSeq IDs',
    'Ensembl Gene ID'
)

class RnaSeqFileLocations(object):
    def __init__(self, root_dir, alignment_subdir=None, batch_id=None, strandedness='r', tax_id=9606):
        self.root_dir = root_dir
        self.strandedness = strandedness
        self.alignment_subdir = alignment_subdir
        if batch_id is None:
            self.batch_id = os.path.split(self.root_dir)[-1]
        else:
            self.batch_id = batch_id
        self.tax_id = tax_id
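
A brief instantiation sketch; the directory path is a placeholder, and batch_id falls back to the final path component as coded above.

# Hypothetical usage sketch: the path is a placeholder.
locs = RnaSeqFileLocations('/path/to/wtchg_p12345', strandedness='r')
# batch_id defaults to the last path component, here 'wtchg_p12345'
print locs.batch_id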
Example #9
0
import csv
import multiprocessing as mp
import os

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.cluster import hierarchy as hc

from plotting import clustering
from rnaseq import gsea
from scripts.hgic_final import analyse_xcell_results
from scripts.hgic_final import consts
from settings import HGIC_LOCAL_DIR, GIT_LFS_DATA_DIR
from utils import log, output, reference_genomes

logger = log.get_console_logger()

XCELL_SIGNATURE_FN = os.path.join(GIT_LFS_DATA_DIR, 'xcell',
                                  'ESM3_signatures.xlsx')


def load_ipa_signatures(fn):
    res = {}
    with open(fn, 'rb') as f:
        c = csv.reader(f)
        for row in c:
            res[row[0]] = row[1:]
    return res
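
A usage sketch, assuming the IPA export is a headerless CSV whose first column is the pathway name and whose remaining columns are the member genes (a layout inferred from the parsing code above); the filename is a placeholder.

# Hypothetical usage sketch: the filename is a placeholder.
ipa_signatures = load_ipa_signatures('ipa_pathway_genes.csv')
for pathway, genes in ipa_signatures.items():
    logger.info("Pathway %s has %d genes.", pathway, len(genes))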


def simplify_tcga_names(data):
Example #10
0
import gzip
import os
import numpy as np
import pickle
import sys
import re
import pysam
import pybedtools
import subprocess
from matplotlib import pyplot as plt
import pandas as pd

sys.path.append(os.path.dirname(__file__) + '/../../')
from settings import DATA_DIR, LOCAL_DATA_DIR, GIT_LFS_DATA_DIR
from utils import log, genomics, output

logger = log.get_console_logger(__name__)


def get_motif_locations(fa_reader, motif, references):
    for c in references:
        this_ref = fa_reader[c]
        it = re.finditer(motif, this_ref)
        for t in it:
            yield (c, t.start())
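
A usage sketch for the generator above, mirroring how create_cpg_bed below constructs the FASTA reader; the FASTA path and chromosome name are placeholders.

# Hypothetical usage sketch: the FASTA path and reference name are placeholders.
fa = pysam.FastaFile('/path/to/genome.fa')
for chrom, pos in get_motif_locations(fa, 'CG', ['chr1']):
    logger.info("CpG at %s:%d", chrom, pos)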


def create_cpg_bed(fa_fn, outfn, references=None):
    fa_reader = pysam.FastaFile(fa_fn)
    if references is None:
        references = fa_reader.references
    # get location of every CpG
Example #11
0
    def __init__(self, loaders, intersection_only=True):
        """
        Class to combine multiple loader objects.
        Each loader represents a separate batch. Inputs can include multiple lane loaders.
        :param loaders: Iterable of loader objects.
        :param intersection_only: If True (default), reduce counts to the indices (e.g. genes) that are present in all
        loaders.
        """
        self.logger = log.get_console_logger(self.__class__.__name__)

        if len(loaders) < 2:
            raise ValueError("Must supply 2 or more loaders to use a MultipleBatchLoader.")

        # we can only claim the meta data is linked here if all loaders have this property
        self.meta_is_linked = True
        for l in loaders:
            if not l.meta_is_linked:
                self.meta_is_linked = False

        # set the batch column name, avoiding clashes
        batch_col = 'batch'
        meta_cols = sorted(setops.reduce_union(*[t.meta.columns for t in loaders if t.meta is not None]))

        if batch_col in meta_cols:
            i = 1
            while batch_col in meta_cols:
                batch_col = "batch_%d" % i
                i += 1
        meta_cols += [batch_col]

        # check attributes that must match in all loaders
        if len(set([t.tax_id for t in loaders])) > 1:
            raise AttributeError(
                "The tax_id of the samples differ between loaders: %s" % ', '.join([str(t.tax_id) for t in loaders])
            )
        else:
            self.tax_id = loaders[0].tax_id

        if len(set([t.row_indexed for t in loaders])) > 1:
            raise AttributeError("row_indexed bool must be the same in all loaders")
        else:
            self.row_indexed = loaders[0].row_indexed

        extra_df_attributes = {}

        if self.row_indexed:
            row_indexed_dat_arr = {}
        else:
            dat = {}

        meta_values = []
        meta_index = []
        blank_meta_row = dict([(k, None) for k in meta_cols])

        # we may need to append a number to sample names
        sample_appendix = 0
        auto_batch = 1
        meta_auto_idx = 0
        samples_seen = set()

        for l in loaders:
            this_batch = l.batch_id
            if not hasattr(this_batch, '__iter__'):
                if l.batch_id is None:
                    this_batch = auto_batch
                    auto_batch += 1
                this_batch = pd.Series(this_batch, index=l.meta.index)

            try:
                this_samples = l.input_files.index.tolist()
            except AttributeError:
                # occurs when we are loading a single file
                # FIXME: find a better catch - this is too general
                if hasattr(l, 'input_files'):
                    # this occurs if l is a single file loader
                    ## FIXME: single file loaders may contain multiple samples
                    ## in that case, this doesn't spot name clashes!!

                    # FIXME: here's a workaround for now: may not be bulletproof
                    this_samples = [l.input_files]
                    if len(this_samples) != len(l.meta.index):
                        this_samples = l.meta.index.tolist()
                else:
                    # this occurs if l is a batch loader
                    # FIXME: may not give us valid sample names?
                    this_samples = l.meta.index.tolist()

            # get a copy of the data
            if self.row_indexed:
                this_dat = l.data.copy()
            else:
                this_dat = copy.copy(l.data)

            # get a copy of meta
            if l.meta is not None:
                this_meta = l.meta.copy()

            # resolve any sample clashes in the data (NOT the meta data)
            clash_resolved = False
            new_names = []

            while len(samples_seen.intersection(this_samples)) > 0:
                sample_appendix += 1
                # find the clash
                clashes = samples_seen.intersection(this_samples)
                self.logger.warning(
                    "Found sample name clash(es): %s. Modifying names to avoid errors.",
                    ', '.join(clashes)
                )
                for c in clashes:
                    new_names.append([
                        this_samples[this_samples.index(c)],
                        this_samples[this_samples.index(c)] + "_%d" % sample_appendix
                    ])
                    this_samples[this_samples.index(c)] += "_%d" % sample_appendix
                clash_resolved = True
            samples_seen.update(this_samples)

            if clash_resolved:
                # relabel metadata if linked
                if l.meta_is_linked:
                    # reorder first to be sure it's the same as data
                    this_meta = this_meta.loc[this_dat.columns]
                    this_meta.index = this_samples

                # relabel the data
                if self.row_indexed:
                    this_dat.columns = this_samples
                else:
                    for prev, new in new_names:
                        this_dat[new] = this_dat.pop(prev)

                # relabel the batch IDs
                this_batch.index = this_samples
                # relabel any other DF data if present
                for fld in l.extra_df_attributes:
                    x = getattr(l, fld)
                    x.columns = this_samples

            # data
            if self.row_indexed:
                if isinstance(this_dat.columns, pd.MultiIndex):
                    col_list = this_dat.columns.levels[0].tolist()
                else:
                    col_list = this_dat.columns.tolist()
                for c in col_list:
                    row_indexed_dat_arr[c] = this_dat[[c]]

            else:
                dat.update(this_dat)

            # other df attributes
            for fld in l.extra_df_attributes:
                if fld not in extra_df_attributes:
                    extra_df_attributes[fld] = getattr(l, fld).copy()
                else:
                    extra_df_attributes[fld] = pd.concat((extra_df_attributes[fld], getattr(l, fld)), axis=1)

            # rebuild meta
            if l.meta is not None:
                for i in this_meta.index:
                    this_row = dict(blank_meta_row)
                    this_row.update(this_meta.loc[i].to_dict())
                    this_row[batch_col] = this_batch[i]
                    meta_values.append(this_row)
                    if l.meta_is_linked:
                        meta_index.append(i)
                    else:
                        meta_index.append(meta_auto_idx)
                        meta_auto_idx += 1
            else:
                for c in this_dat.columns:
                    this_row = dict(blank_meta_row)
                    this_row[batch_col] = this_batch[c]
                    meta_values.append(this_row)
                    meta_index.append(meta_auto_idx)
                    meta_auto_idx += 1

        self.meta = pd.DataFrame(meta_values, index=meta_index, columns=meta_cols)
        if intersection_only:
            join = 'inner'
        else:
            join = 'outer'

        if self.row_indexed:
            dat = pd.concat(
                [row_indexed_dat_arr[k] for k in self.meta.index],
                axis=1, sort=True, join=join
            )

        self.data = dat
        self.batch_id = self.meta.loc[:, batch_col]

        self.extra_df_attributes = tuple()
        for fld in extra_df_attributes:
            setattr(self, fld, extra_df_attributes[fld])
            self.extra_df_attributes += (fld,)
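
A usage sketch; loader_a and loader_b stand in for any two previously constructed loader objects (e.g. instances of the directory-based or single-file loaders shown earlier).

# Hypothetical usage sketch: loader_a and loader_b are pre-built loader objects.
combined = MultipleBatchLoader([loader_a, loader_b], intersection_only=True)
# one meta row per sample, with the originating batch recorded in the new batch column
print combined.batch_id.value_counts()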
Example #12
0
from settings import DAVID_WEB_SERVICES_CONFIG
from suds.client import Client
from utils import log

logger = log.get_console_logger('DAVID_web_services')

WSDL_URL = 'https://david-d.ncifcrf.gov/webservice/services/DAVIDWebService?wsdl'
SOAP_ENDPOINT = 'https://david-d.ncifcrf.gov/webservice/services/DAVIDWebService.DAVIDWebServiceHttpSoap11Endpoint/'


class WSDLApi(object):
    def __init__(self, url=WSDL_URL, user=DAVID_WEB_SERVICES_CONFIG['email']):
        self.user = user
        self.url = url
        self.client = None
        self.connect(endpoint=SOAP_ENDPOINT)

    def connect(self, endpoint):
        self.client = Client(self.url)
        self.client.wsdl.services[0].setlocation(endpoint)
        # authenticate user email
        self.client.service.authenticate(self.user)

    def introspection(self):
        """
        Print the service (introspection)
        :return:
        """
        print self.client
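
A usage sketch; the email comes from DAVID_WEB_SERVICES_CONFIG via the constructor default, and introspection simply prints the suds client description as defined above.

# Usage sketch: connects with the configured email and prints the WSDL description.
api = WSDLApi()
api.introspection()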
Example #13
0
import json
import os
import gzip
import multiprocessing as mp
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from utils import log, output
from settings import INTERMEDIATE_DIR
from StringIO import StringIO
clogger = log.get_console_logger("rrbs_coverage_analysis")


def parse_one_result(cpg, perms):
    if len(cpg) == 0:
        return

    tab = pd.read_csv(StringIO(cpg), sep='\t', header=0, dtype=int)
    cpg_chr = tab.iloc[0, 0]
    cpg_start = tab.iloc[0, 1]
    cpg_arr = tab.iloc[:, -1].values

    # permutations
    n_bp = len(cpg_arr)
    n_perm = len(perms)
    perm_arr = np.zeros((n_bp, n_perm))
    perm_loc = []
    for j in range(n_perm):
        if len(perms[j]) == 0:
            # no coverage at all - skip
Example #14
0
from rnaseq import gsea
import pandas as pd
from . import consts
from utils.log import get_console_logger
import logging

logger = get_console_logger("signature_classifier")


class ssGSEAClassifier(object):
    """
    Basic classifier that uses pre-defined signatures to score samples and assess classification.
    """
    def __init__(self, signature_dict, **ssgsea_kwds):
        """
        
        :param signature_dict: Dictionary. Keys are the class name, values are iterables of genes / probes
        or any other row index
        :param ssgsea_kwds: Any additional kwargs are passed directly to the ssgsea algorithm.
        """
        self.signatures = signature_dict
        self.ssgsea_kwds = ssgsea_kwds
        # it's useful to maintain a copy of all the signature IDs for validation purposes
        self.all_ids = reduce(lambda x, y: x.union(y),
                              self.signatures.values(), set())

    def score(self, sample_data, check_overlap=True):
        """
        
        :param sample_data: Pandas Series (single sample) or DataFrame (multiple samples) to be classified.
        :param check_overlap: If True (default), check whether all the signature IDs are in the data. If
Example #15
0
import requests
import pandas as pd
import os
from utils import output, log
from load_data import geo_repo, sra
import ftplib

from settings import RNASEQ_DIR
"""
AIM: retrieve metadata (combined SRA and GEO) for a number of GEO datasets in one go
"""

GEO_FTP = "ftp.ncbi.nlm.nih.gov"
GEO_BASE = "/geo/series/{stripped_id}/{full_id}/matrix/{full_id}_series_matrix.txt.gz"

logger = log.get_console_logger("sra_batch_get_metadata")

inputs = [
    ('GSE116124', 'SRP151040'),
    ('GSE97265', 'SRP102810'),
    ('GSE89056', 'SRP091957'),
    ('GSE107654', 'SRP126289'),
    ('GSE97904', 'SRP104149'),
    ('GSE97619', 'SRP103788'),
    ('GSE85839', 'SRP082406'),
    ('GSE53094', 'SRP033569'),
    ('GSE67915', 'SRP057205'),
    # ('GSE62772', 'SRP049340'),  # need to dl two geo files
    ('GSE73211', 'SRP063867'),
]
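
A sketch of how the GEO_BASE template might be combined with ftplib to pull one series matrix; the 'nnn' masking of the final three digits of the series ID and the output filename are assumptions here rather than something stated above.

# Hypothetical download sketch for a single (geo_id, sra_id) pair.
geo_id = inputs[0][0]                              # e.g. 'GSE116124'
stripped_id = geo_id[:-3] + 'nnn'                  # assumed GEO FTP directory convention
remote_path = GEO_BASE.format(stripped_id=stripped_id, full_id=geo_id)

ftp = ftplib.FTP(GEO_FTP)
ftp.login()
with open('%s_series_matrix.txt.gz' % geo_id, 'wb') as fout:
    ftp.retrbinary('RETR ' + remote_path, fout.write)
ftp.quit()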
Example #16
0
import pandas as pd
import os
import re
import collections
from rnaseq import gsea
from settings import OUTPUT_DIR
from utils import log, setops, output, excel
from plotting import venn
from matplotlib import pyplot as plt

logger = log.get_console_logger("process_GSEA_results")

if __name__ == "__main__":
    """
    Use the results generated by exporting data with prepare_data then running GSEA (see code comments in prepare_data).
    """
    pids = [
        '018', '019', '030', '031', '017', '050', '054', '061', '026', '052'
    ]
    refs = ['gibco_nsc', 'h9_nsc']
    top_n_pathways = 20
    units = 'tpm'
    fdr = 0.05  # set to None to skip filtering
    indir = os.path.join(OUTPUT_DIR, "gsea_data", units)
    outdir = output.unique_output_dir("gsea_data")

    subgroups = {
        'RTK I': ['018', '019', '030', '031'],
        'RTK II': ['017', '050', '054', '061'],
        'MES': ['026', '052']
    }