Пример #1
0
import CGAT.Pipeline as P

P.getParameters(["%s.ini" % os.path.splitext(__file__)[0], "pipeline.ini"])
PARAMS = P.PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

#TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" )
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [x.replace("../", "")
     for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x],
    "(\S+).export.txt.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x],
        "(\S+).sra" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
         for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x],
        "(\S+).fastq.1.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x.replace("../", "")
Пример #2
0
###################################################################
###################################################################
## parameterization

EXPORTDIR = P.get('mapping_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('mapping_datadir', P.get('datadir', '.'))
DATABASE = P.get('mapping_backend', P.get('sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Load read files per suffix (sra, fastq.gz and fastq.1.gz under DATADIR,
# csfasta.gz from the current directory) and add the resulting track
# collections together.  The patterns are raw strings so that "\S" reaches
# the regex engine without relying on an invalid string escape.
# NOTE(review): the csfasta.gz glob/pattern ignores DATADIR, unlike the other
# three suffixes - confirm this is intentional.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.sra" % DATADIR),
        r"%s/(\S+).sra" % DATADIR) +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.gz" % DATADIR),
        r"%s/(\S+).fastq.gz" % DATADIR) +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.1.gz" % DATADIR),
        r"%s/(\S+).fastq.1.gz" % DATADIR) +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("*.csfasta.gz"),
        r"(\S+).csfasta.gz")
)


###########################################################################
## tracks for the gene sets
class GenesetTrack(PipelineTracks.Sample):
    # Sample subclass used for gene set tracks; it declares a single
    # attribute field named "geneset".
    # NOTE(review): presumably PipelineTracks.Sample uses `attributes` to
    # split a track name into fields - confirm against PipelineTracks.
    attributes = ("geneset", )


GENESET_TRACKS = PipelineTracks.Tracks(GenesetTrack).loadFromDirectory(
Пример #3
0
PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py",
                                      on_error_raise=__name__ == "__main__")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample

# Load tracks from every *.bed.gz file in the current directory.  The
# pattern is a raw string so "\S" is not parsed as a string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bed.gz"), r"(\S+).bed.gz")

# One "<track>.bed.gz" file name per loaded track.
TRACKS_BEDFILES = ["%s.bed.gz" % x for x in TRACKS]

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # execfile() no longer exists in Python 3.  Compiling and exec'ing the
    # source runs it in this module's namespace on both Python 2 and 3, and
    # the "with" block makes sure the file handle is closed.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", "exec"))

###################################################################
###################################################################
###################################################################
#
# parameterization

EXPORTDIR = P.get('rnaseqdiffexpression_exportdir',
                  P.get('exportdir', 'export'))
DATADIR = P.get('rnaseqdiffexpression_datadir', P.get('datadir', '.'))
DATABASE = P.get('rnaseqdiffexpression_backend',
                 P.get('sql_backend', 'sqlite:///./csvdb'))

DATABASE_ANNOTATIONS = P['annotations_database']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################

# Tracks are loaded from the *.bam files found under DATADIR.  Patterns are
# raw strings so that "\S" is not parsed as an (invalid) string escape.
# NOTE(review): the glob is prefixed with DATADIR but the pattern is not -
# confirm the resulting track names are the intended ones.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR), r"(\S+).bam")

# Aggregates built over TRACKS, with and without grouping labels.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# Gene sets, designs and methods are each loaded from files in the current
# directory: *.gtf.gz, design*.tsv and *_stats.tsv respectively.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

DESIGNS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("design*.tsv"), r"(\S+).tsv")

METHODS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*_stats.tsv"), r"(\S+)_stats.tsv")
Пример #5
0
import CGAT.Pipeline as Pipeline
PARAMS_PIPELINE = Pipeline.peekParameters( ".",
                                           "pipeline_chipseq.py" )

import CGATPipelines.PipelineTracks as PipelineTracks

Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz" ]

# For every suffix in `suffixes`, load the matching files under DATADIR
# (skipping any path that contains "input") and sum the per-suffix track
# collections, starting from a freshly constructed Tracks(Sample).  The
# pattern is a raw string so "\S" is not parsed as a string escape.
TRACKS = sum(
    itertools.chain([
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes]),
    PipelineTracks.Tracks(Sample))

Sample.setDefault( "asTable" )

ALL = PipelineTracks.Aggregate( TRACKS )
EXPERIMENTS = PipelineTracks.Aggregate( TRACKS, labels = ("condition", "tissue" ) )
CONDITIONS = PipelineTracks.Aggregate( TRACKS, labels = ("condition", ) )
TISSUES = PipelineTracks.Aggregate( TRACKS, labels = ("tissue", ) )

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
Пример #6
0
# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# define some tracks if needed

# Tracks are loaded from the *.ini files in the current directory; the
# pattern is a raw string so "\S" is not parsed as a string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.ini"), r"(\S+).ini")


# --------------------------< utility functions >---------------------------- #

def connect():
    '''Connect to database.
       Use this method to connect to additional databases.
       Returns an sqlite3 database handle.
    '''

    dbh = sqlite3.connect(PARAMS["database"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
Пример #7
0
###################################################################
###################################################################
## parameterization

EXPORTDIR=P['rnaseqtranscripts_exportdir']
DATADIR=P['rnaseqtranscripts_datadir']
DATABASE=P['rnaseqtranscripts_backend']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Tracks are loaded from the *.bam files under DATADIR; the pattern carries
# the same DATADIR prefix as the glob.  Patterns are raw strings so that
# "\S" is not parsed as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR), r"%s/(\S+).bam" % DATADIR)

# Aggregates built over TRACKS, with and without grouping labels.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

# Gene sets are loaded from the *.gtf.gz files in the current directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

###########################################################################

CUFFDIFF_LEVELS= ("gene", "isoform", "cds", "tss")

###########################################################################
## shorthand
Пример #8
0
###################################################

# load options from the config file
import CGAT.Pipeline as P
P.getParameters("pipeline.ini")

PARAMS = P.PARAMS

###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# collect fastq.gz tracks
# Tracks from *.fastq.gz and *.fastq.1.gz files in the current directory,
# added together.  Patterns are raw strings so that "\S" is not parsed as
# an (invalid) string escape.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.gz"), r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), r"(\S+).fastq.1.gz")
)

ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
## Global flags
###################################################################
# List of assemblers taken from the "general_assemblers" parameter; used
# directly below to derive the METAGENOME flag.
ASSEMBLERS = P.asList(PARAMS["general_assemblers"])
# True when any of the listed assembler names is configured.
# NOTE(review): "ibda" may be a misspelling of "idba" - confirm against the
# values accepted in the configuration file before changing it.
METAGENOME = "meta-velvet" in ASSEMBLERS or "ibda" in ASSEMBLERS or "cortex_var" in ASSEMBLERS

# NOTE(review): ASSEMBLERS is rebound here from a different key
# ("assemblers"), so METAGENOME above was computed from the previous value -
# confirm both keys are meant to be read.
ASSEMBLERS = P.asList(PARAMS["assemblers"])
Пример #9
0
###################################################
# Pipeline configuration
# load options from the config file
from CGATCore import Pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
# Gene sets come from *.gtf.gz files and tracks from *.bam files in the
# current directory.  Patterns are raw strings so that "\S" is not parsed
# as an (invalid) string escape.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")
TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), r"(\S+).bam")

# Aggregates over TRACKS using the given labels.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
Пример #10
0
PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

##########################################################################
##########################################################################
# Helper functions mapping tracks to conditions, etc
##########################################################################

import CGATPipelines.PipelineTracks as PipelineTracks

Sample = PipelineTracks.AutoSample

# define tracks based on all samples in .bamfile that are not input or index
# Load the *.bam files found in the directory named by
# PARAMS["location_bamfiles"]; the ".+input.+" expression is passed as an
# exclusion.  The pattern is a raw string so "\S" is not parsed as a string
# escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob(os.path.join(PARAMS["location_bamfiles"], "*.bam")),
    r"(\S+).bam",
    exclude=[".+input.+"])


@files(None, None)
def printTracks(infile, outfile):
    '''Print every entry of EXPERIMENTS to stdout.

    A warning banner is emitted through P.warn first; each entry is then
    printed on its own line, preceded by a line holding a single tab.

    Neither `infile` nor `outfile` is used by the body.
    '''
    P.warn("\n\n\n\nprinting tracks:")
    for track in EXPERIMENTS:
        # Single-argument print(...) yields identical output under the
        # Python 2 print statement and the Python 3 print function.
        print("\t")
        print(track)


def get_peak_caller_parameters(peak_caller_id):
    """
    Returns a dictionary of config file parameters for the chosen peak caller
    (an attempt to keep access to PARAMS out of associated pipeline script).
Пример #11
0
                    "*.sra",
                    "*.export.txt.gz",
                    "*.csfasta.gz",
                    "*.csfasta.F3.gz",
                    )

SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name)
                       for suffix_name in SEQUENCESUFFIXES])

SEQUENCEFILES_REGEX = regex(
    r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)")

Sample = PipelineTracks.AutoSample
Sample.attributes = ('tissue', 'condition', 'replicate')
# Load every file in the current directory that matches one of the
# SEQUENCESUFFIXES globs.  The pattern is a raw string so "\S" is not parsed
# as a string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    [y for x in SEQUENCESUFFIXES for y in glob.glob(x)],
    r"(\S+).(fastq.1.gz|fastq.gz|sra)")

EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))

#########################################################################
# summarise read 3'
#########################################################################


@follows(mkdir("sequence_characteristics.dir"))
@transform(SEQUENCEFILES,
           SEQUENCEFILES_REGEX,
           r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv")
Пример #12
0
])

PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

Sample = PipelineTracks.AutoSample

# collect .bam tracks
# Tracks are loaded from the *.bam files in the current directory.  Patterns
# are raw strings so that "\S" is not parsed as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bam"), r"(\S+).bam")

# group by experiment (assume that last field is a replicate identifier)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))

# Gene sets are loaded from the *.gtf.gz files in the current directory.
GENESETS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

###################################################################
###################################################################
###################################################################


def connect():
    '''connect to database.
Пример #13
0
## Pipeline configuration
import CGAT.Pipeline as P
P.getParameters("pipeline_capseq.ini")
PARAMS = P.PARAMS
USECLUSTER = True

###################################################################
###################################################################
###################################################################
## Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

# Load read files of each supported suffix from the current directory,
# skipping any file whose name contains PARAMS["tracks_control"], and add
# the per-suffix track collections together.
# The csfasta filter previously read PARAMS["track_control"]; it now uses
# the same "tracks_control" key as the other four filters.
# Patterns are raw strings so that "\S" is not parsed as a string escape.
TRACKS = (
    PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.export.txt.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).export.txt.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.sra")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).sra") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.1.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.1.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.csfasta.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).csfasta.gz")
)
for X in TRACKS:
    print "TRACK=", X, "\n"
Пример #14
0
PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

PipelineiCLIP.PARAMS = PARAMS
PipelineiCLIP.PARAMS_ANNOTATIONS = PARAMS_ANNOTATIONS
PARAMS["project_src"] = os.path.join(os.path.dirname(__file__), "..")

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# define some tracks if needed
# Build the track list from the third tab-separated column of every line in
# sample_table.tsv.  The file is opened in a "with" block so the handle is
# closed once all lines have been read.
# NOTE(review): no header line is skipped and the field is not stripped -
# confirm the table has no header and at least four columns.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3)
with IOTools.openFile("sample_table.tsv") as _sample_table:
    for line in _sample_table:
        track = line.split("\t")[2]
        TRACKS.tracks.append(PipelineTracks.Sample3(filename=track))


###################################################################
def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''

    dbh = sqlite3.connect(PARAMS["database"])
Пример #15
0
                                      "pipeline_annotations.py",
                                      on_error_raise=__name__ == "__main__")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"],
                                            "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# collect .gtf.gz tracks (control and merged gene sets are loaded separately)
# All *.gtf.gz files in the current directory, with the repeats, introns
# and merged gene sets passed as exclusions.  Patterns are raw strings so
# that "\S" is not parsed as an (invalid) string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))

# Control tracks: repeats and introns.
TRACKS_CONTROL = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(
        ("repeats.gtf.gz", "introns.gtf.gz"), r"(\S+).gtf.gz")

# Meta track: the merged gene set.
TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    ("merged.gtf.gz", ), r"(\S+).gtf.gz")

# Reference gene set track.
TRACKS_GENESETS = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(("genes.gtf.gz", ),
                                             r"(\S+).gtf.gz")

# collection of all tracks including controls
TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
Пример #16
0
###################################################################
###################################################################
###################################################################
##
###################################################################
# Optional local overrides: when pipeline_conf.py exists, its source is
# compiled and executed in this module's namespace.
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # Read through a "with" block so the file handle is closed promptly.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Load the *.gtf.gz files in the current directory, passing
# ".mapped.gtf.gz" as an exclusion.  The pattern is a raw string so "\S" is
# not parsed as a string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz", exclude=(".mapped.gtf.gz", ))

#####################################################################
#####################################################################
#####################################################################


@transform(TRACKS.getTracks("%s.gtf.gz"), suffix(".gtf.gz"), '.psl.gz')
def convertGtf2Psl(infile, outfile):
    """convert a gtf to a psl file.

    This method only takes features of type 'exon' and
    skips all contigs that are not in the genome sequence
    (for example the variant human chromosomes).
    """
Пример #17
0
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    The FastQC "Overrepresented_sequences" table(s) of the sample are
    queried through `dbh`. Every reported source that is also listed in
    `contaminants_file` is written to `outfile` together with the
    sequence given in that file. If the database yields no
    overrepresented sequences at all, `outfile` is only touched.

    Requires cutadapt >= 1.7.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    # Default: a single FastQC track named after the sample. This also
    # covers plain ".fastq.gz" input.
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, _fastq_format, _datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1", track + "_fastq_2"]

    found_contaminants = []
    # one cursor is enough for all queries
    cc = dbh.cursor()

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        if re.match(r"^\d+.*", table):
            table = "_" + table

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if not found_contaminants:
        P.touch(outfile)
        return

    # read contaminants from existing file: the last whitespace-separated
    # field of a line is the sequence, everything before it is the name
    known_contaminants = {}
    with IOTools.openFile(contaminants_file, "r") as inf:
        for line in inf:
            if line.startswith("#") or not line.strip():
                continue
            fields = line.split()
            known_contaminants[" ".join(fields[:-1])] = fields[-1]

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    with IOTools.openFile(outfile, "w") as outf:
        matched_contaminants = set()
        for found_source, _found_seq in found_contaminants:
            # drop the trailing " (...)" annotation of the reported source
            possible_source = found_source.split(" (")[0]
            if possible_source in known_contaminants:
                matched_contaminants.add(possible_source)

        # sorted so that the output file is reproducible between runs
        for match in sorted(matched_contaminants):
            outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                      known_contaminants[match]))
Пример #18
0
PipelineMedip.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

suffixes = ["export.txt.gz", "sra", "fastq.gz", "cfastq.1.gz", "csfasta.gz"]

# For every suffix in `suffixes`, load the matching files in the current
# directory (skipping names that contain PARAMS["tracks_control"]) and sum
# the per-suffix track collections, starting from a freshly constructed
# Tracks(Sample).  The pattern is a raw string so "\S" is not parsed as a
# string escape.
TRACKS = sum(
    itertools.chain([
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("*.%s" % s)
             if PARAMS["tracks_control"] not in x],
            r"(\S+).%s" % s)
        for s in suffixes]),
    PipelineTracks.Tracks(Sample))

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # The source is compiled and executed in this module's namespace; the
    # "with" block makes sure the file handle is closed promptly.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

###################################################################
###################################################################
###################################################################
# define aggregates
Пример #19
0
from CGATCore import Pipeline as P
import CGATPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % __file__[:-len(".py")], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

Sample = PipelineTracks.Sample
# Load every entry in the current directory whose name starts with
# "medip_".  The pattern is a raw string so "\S" is not parsed as a string
# escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("medip_*"), r"medip_(\S+)")


def connect():
    '''connect to database.

    This method also attaches to helper databases.
    '''

    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()
Пример #20
0
PipelineMotifs.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
# determine the location of the input files (reads).
# Directory holding the input files; defaults to the current directory when
# the "input" parameter is not set.
DATADIR = PARAMS.get('input', '.')
if not os.path.exists(DATADIR):
    # Interpolate the offending path: the message previously contained a
    # literal, unfilled "%s".
    raise OSError('data directory %s does not exists' % DATADIR)

Sample = PipelineTracks.Sample

# Load the *.bed.gz files found in DATADIR.  The pattern is a raw string so
# "\S" is not parsed as a string escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob(os.path.join(DATADIR, "*.bed.gz")), r"(\S+).bed.gz")

# One "<DATADIR>/<track>.bed.gz" path per track.  The file name is formatted
# before joining so a "%" character in DATADIR cannot break the formatting.
BEDFILES = [os.path.join(DATADIR, "%s.bed.gz" % x) for x in TRACKS]


# create an indicator target
@transform(BEDFILES, suffix(".gz"), ".gz")
def BedFiles(infile, outfile):
    '''Placeholder task over BEDFILES; the body does nothing.

    The decorator replaces the ".gz" suffix with ".gz", so the output name
    equals the input name.
    NOTE(review): presumably this exists so that other tasks can depend on
    the bed files by task name - confirm against the rest of the pipeline.
    '''
    pass


BAMFILES = glob.glob(os.path.join(DATADIR, "*.bam"))


def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.
Пример #21
0
###################################################################
###################################################################
# parameterization

EXPORTDIR = P.get('readqc_exportdir', P.get('exportdir', 'export'))
DATADIR = P.get('readqc_datadir', P.get('datadir', '.'))
DATABASE = P.get('readqc_backend', P.get('sql_backend', 'sqlite:///./csvdb'))

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Load read files per suffix (sra, fastq.gz and fastq.1.gz under DATADIR,
# csfasta.gz from the current directory) and add the resulting track
# collections together.  Patterns are raw strings so that "\S" is not
# parsed as an (invalid) string escape.
# NOTE(review): the globs are prefixed with DATADIR but the patterns are
# not, and the csfasta.gz glob ignores DATADIR - confirm both are intended.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.sra" % DATADIR), r"(\S+).sra") +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.gz" % DATADIR), r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("%s/*.fastq.1.gz" % DATADIR), r"(\S+).fastq.1.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
        glob.glob("*.csfasta.gz"), r"(\S+).csfasta.gz")
)

###########################################################################


class ReadqcTracker(TrackerSQL):
    '''Define convenience tracks for plots'''
    def __init__(self, *args, **kwargs):
        # Forward all arguments to TrackerSQL, always supplying the
        # module-level DATABASE as the backend.
        TrackerSQL.__init__(self, *args, backend=DATABASE, **kwargs)
Пример #22
0
    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    cc.execute(statement)
    cc.close()

    return dbh


class MySample(PipelineTracks.Sample):
    # Attribute names are taken from the comma-separated "attributes"
    # parameter, read once when the class is defined.
    attributes = tuple(PARAMS["attributes"].split(","))


# Tracks are loaded from the *.bam files in the current directory using
# MySample.  Patterns are raw strings so that "\S" is not parsed as an
# (invalid) string escape.
TRACKS = PipelineTracks.Tracks(MySample).loadFromDirectory(
    glob.glob("*.bam"), r"(\S+).bam")

# Designs are loaded from the *.design.tsv files in the current directory.
Sample = PipelineTracks.AutoSample
DESIGNS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.design.tsv"), r"(\S+).design.tsv")

###################################################################
###################################################################
###################################################################
# DEXSeq workflow
###################################################################


@mkdir("results.dir")
@files(PARAMS["annotations_interface_geneset_all_gtf"], "geneset_flat.gff")
def buildGff(infile, outfile):