Exemplo n.º 1
0
    def __call__(self, track, slice=None):
        """Return TPM values for the configured genes of interest.

        Two queries are run through ``self.getDataFrame``: one joining
        ``sailfish_genes`` with ``samples`` (TPM per gene and sample) and
        one joining ``samples`` with ``factors`` (every factor except
        ``genome``).  Both results are merged on ``sample_name`` and then
        restricted to the gene ids listed under ``genes_of_interest`` in
        the parameters peeked from ``pipeline_rnaseqqc.py``.

        Parameters
        ----------
        track, slice
            Not read by this method.
            NOTE(review): presumably required by the tracker call
            interface -- confirm against the base class.

        Returns
        -------
        pandas.DataFrame
            The filtered rows with ``TPM`` cast to float, indexed by
            ``factor``.  The previous row index is retained as a regular
            column because ``reset_index()`` is called without ``drop``.
        """

        exp_statement = """
        SELECT TPM, gene_id, sample_name
        FROM sailfish_genes AS A
        JOIN samples AS B
        ON A.sample_id = B.id"""

        exp_df = self.getDataFrame(exp_statement)

        factors_statement = '''
        SELECT factor, factor_value, sample_name
        FROM samples AS A
        JOIN factors AS B
        ON A.id = B.sample_id
        WHERE factor != 'genome'
        '''

        factors_df = self.getDataFrame(factors_statement)

        merged_df = pd.merge(exp_df,
                             factors_df,
                             left_on="sample_name",
                             right_on="sample_name")

        genes = Pipeline.asList(
            Pipeline.peekParameters(
                ".", "pipeline_rnaseqqc.py")['genes_of_interest'])

        # Take an explicit copy of the filtered rows: assigning a column on
        # the result of a boolean selection otherwise triggers pandas'
        # SettingWithCopyWarning (chained assignment).
        interest_df = merged_df[merged_df['gene_id'].isin(genes)].copy()

        interest_df['TPM'] = interest_df['TPM'].astype(float)

        return interest_df.reset_index().set_index("factor")
Exemplo n.º 2
0

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))


# The file name patterns are raw strings: "\S" is not a valid Python string
# escape, so a plain literal raises an invalid-escape warning at compile time.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz")

TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS.loadFromDirectory(glob.glob("*.bam"),
                                  r"(\S+).bam")
# A one-element tuple needs the trailing comma; ("replicate") is a plain str.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate",))
CONDITION = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))

# if necessary, update the PARAMS dictionary in any modules file.
Exemplo n.º 3
0
import pysam

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# define some tracks if needed

# The pattern is a raw string: "\S" is not a valid Python string escape, so
# a plain literal raises an invalid-escape warning at compile time.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.ini"), r"(\S+).ini")


# --------------------------< utility functions >---------------------------- #

def connect():
    '''Connect to database.
       Use this method to connect to additional databases.
       Returns an sqlite3 database handle.
import CGAT.Stats as Stats
import pysam
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGATPipelines.Pipeline as P
# Candidate configuration files handed to P.getParameters; the first entry
# sits in a directory named after this script (its path minus the extension).
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS

# Parameters of the annotations pipeline located in PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc


class TracksVCF (PipelineTracks.Tracks):

    def load(self, filename, exclude=None):
        '''load tracks from a vcf file.'''
        tracks = []
import CGAT.Stats as Stats
import pysam
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGATPipelines.Pipeline as P
# Candidate configuration files handed to P.getParameters; the first entry
# sits in a directory named after this script (its path minus the extension).
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])

# Module-level alias for the parameter dictionary held by the Pipeline module.
PARAMS = P.PARAMS

# Parameters of the annotations pipeline located in PARAMS["annotations_dir"].
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc


class TracksVCF (PipelineTracks.Tracks):

    def load(self, filename, exclude=None):
        '''load tracks from a vcf file.'''
        tracks = []
Exemplo n.º 6
0
# Read the configuration files, supplying fallback values for the geneset
# options used by this pipeline.
P.getParameters(
    [
        "%s/pipeline.ini" % os.path.splitext(__file__)[0],
        "../pipeline.ini",
        "pipeline.ini",
    ],
    defaults=dict(
        annotations_dir="",
        genesets_abinitio_coding="pruned.gtf.gz",
        genesets_abinitio_lncrna="pruned.gtf.gz",
        genesets_reference="reference.gtf.gz",
        genesets_refcoding="refcoding.gtf.gz",
        genesets_previous="",
    ),
)

PARAMS = P.PARAMS

# Merge in the parameters of the annotations pipeline found in
# PARAMS["annotations_dir"]; they are requested with the "annotations_" prefix.
PARAMS.update(
    P.peekParameters(
        PARAMS["annotations_dir"],
        "pipeline_annotations.py",
        prefix="annotations_",
        update_interface=True,
    )
)

# The "genesets_previous" option converted with P.asList.
PREVIOUS = P.asList(PARAMS["genesets_previous"])


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
Exemplo n.º 7
0
###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# Read the configuration files; 'paired_end' falls back to False.
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"],
    defaults=dict(paired_end=False))

PARAMS = P.PARAMS

# Merge in the parameters of the annotations pipeline found in
# PARAMS["annotations_dir"]; they are requested with the "annotations_" prefix.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    prefix="annotations_",
    update_interface=True))

# Hand the same parameter dictionary to the helper modules.
PipelinePeakcalling.PARAMS = PARAMS
PipelineMotifs.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# determine the location of the input files (reads).
# Directory holding the input files; falls back to the current directory
# when the 'input' option is not set.
DATADIR = PARAMS.get('input', '.')
if not os.path.exists(DATADIR):
    # Interpolate the offending path: the message previously contained a
    # literal, unfilled "%s".
    raise OSError('data directory %s does not exist' % DATADIR)
Exemplo n.º 8
0
import CGATPipelines.Pipeline as Pipeline
import CGATPipelines.PipelineTracks as PipelineTracks
###################################################################
###################################################################
# parameterization

# Report settings: each "calling_*" option falls back to the generic report
# option and finally to a hard-coded default.
EXPORTDIR = P.get('calling_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('calling_datadir', P.get('datadir', '.'))
DATABASE = P.get('calling_backend',
                 P.get('report_sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

Sample = PipelineTracks.Sample3

# File name suffixes of the read files that define tracks.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# One Tracks collection per suffix, added onto an empty collection.
# The pattern is a raw string because "\S" is not a valid Python string
# escape.  Wrapping the list in itertools.chain() was a no-op and is dropped.
# NOTE(review): the "input" test is applied to the whole globbed path,
# including DATADIR -- confirm that is intended.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

ALL = PipelineTracks.Aggregate(TRACKS)
Exemplo n.º 9
0
from CGATReport.Utils import PARAMS as P
import CGATPipelines.Pipeline as Pipeline
import CGATPipelines.PipelineTracks as PipelineTracks
###################################################################
###################################################################
# parameterization

# Report settings: each "calling_*" option falls back to the generic report
# option and finally to a hard-coded default.
EXPORTDIR = P.get('calling_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('calling_datadir', P.get('datadir', '.'))
DATABASE = P.get('calling_backend',
                 P.get('report_sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".",
                                          "pipeline_chipseq.py")


Sample = PipelineTracks.Sample3

# File name suffixes of the read files that define tracks.
suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz"]

# One Tracks collection per suffix, added onto an empty collection.
# The pattern is a raw string because "\S" is not a valid Python string
# escape.  Wrapping the list in itertools.chain() was a no-op and is dropped.
# NOTE(review): the "input" test is applied to the whole globbed path,
# including DATADIR -- confirm that is intended.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))
Exemplo n.º 10
0
import CGAT.Database as Database
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"],
                                            "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# collect tracks from the *.gtf.gz files in the working directory, leaving
# out the derived repeats/introns/merged files.
# The pattern is a raw string because "\S" is not a valid Python string
# escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))
import CGAT.GTF as GTF
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

USECLUSTER = True

# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# collect tracks from the *.gtf.gz files in the working directory, leaving
# out the derived repeats/introns/merged files.
# The pattern is a raw string because "\S" is not a valid Python string
# escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))
Exemplo n.º 12
0
###################################################################
###################################################################
# Load options and annotations
###################################################################

# Read the pipeline configuration from the candidate ini files.
PARAMS = P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])

# Rebind to the Pipeline module's parameter dictionary, then merge in the
# parameters of the geneset pipeline located in PARAMS["annotations_dir"],
# requested with the "annotations_" prefix.
PARAMS = P.PARAMS
PARAMS.update(
    P.peekParameters(
        PARAMS["annotations_dir"],
        "pipeline_genesets.py",
        prefix="annotations_",
        update_interface=True,
        restrict_interface=True,
    )
)

# Ask R where the DEXSeq package keeps its "python_scripts" directory; the
# R result is converted with .tostring().
PYTHONSCRIPTSDIR = R('''
    f = function(){
    pythonScriptsDir = system.file("python_scripts", package="DEXSeq")
    }
    f()''').tostring()


###################################################################
###################################################################
###################################################################
# Utility functions
Exemplo n.º 13
0
# Read the pipeline configuration from the candidate ini files.
P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

PARAMS = P.PARAMS

# Merge in parameters peeked from pipeline_genesets.py in PARAMS["gtf_dir"].
# They are requested with the `annotations_` prefix and with
# restrict_interface=True (presumably limiting the result to the interface
# section -- confirm against P.peekParameters).
PARAMS.update(P.peekParameters(
    PARAMS["gtf_dir"],
    "pipeline_genesets.py",
    prefix="annotations_",
    update_interface=True,
    restrict_interface=True))

# -----------------------------------------------
# Utility functions


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.
    '''
Exemplo n.º 14
0
# Read the pipeline configuration from the candidate ini files.
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])

PARAMS = P.PARAMS

# Merge in parameters peeked from pipeline_genesets.py in PARAMS["gtf_dir"].
# They are requested with the `annotations_` prefix and with
# restrict_interface=True (presumably limiting the result to the interface
# section -- confirm against P.peekParameters).
PARAMS.update(
    P.peekParameters(
        PARAMS["gtf_dir"],
        "pipeline_genesets.py",
        prefix="annotations_",
        update_interface=True,
        restrict_interface=True,
    )
)

# -----------------------------------------------
# Utility functions


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.
    '''