import CGAT.Pipeline as P P.getParameters(["%s.ini" % os.path.splitext(__file__)[0], "pipeline.ini"]) PARAMS = P.PARAMS ################################################################### ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks Sample = PipelineTracks.Sample3 #TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x], "(\S+).export.txt.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x], "(\S+).sra" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.1.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "")
################################################################### ################################################################### ## parameterization EXPORTDIR = P.get('mapping_exportdir', P.get('exportdir', 'export')) DATADIR = P.get('mapping_datadir', P.get('datadir', '.')) DATABASE = P.get('mapping_backend', P.get('sql_backend', 'sqlite:///./csvdb')) ################################################################### # cf. pipeline_rnaseq.py # This should be automatically gleaned from pipeline_rnaseq.py ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.sra" % DATADIR), "%s/(\S+).sra" % DATADIR) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.fastq.gz" % DATADIR), "%s/(\S+).fastq.gz" % DATADIR ) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "%s/*.fastq.1.gz" % DATADIR), "%s/(\S+).fastq.1.gz" % DATADIR ) +\ PipelineTracks.Tracks( PipelineTracks.Sample ).loadFromDirectory( glob.glob( "*.csfasta.gz" ), "(\S+).csfasta.gz" ) ########################################################################### ## tracks for the gene sets class GenesetTrack(PipelineTracks.Sample): attributes = ("geneset", ) GENESET_TRACKS = PipelineTracks.Tracks(GenesetTrack).loadFromDirectory(
PARAMS = P.PARAMS

# Parameters of the annotations pipeline located in
# PARAMS["annotations_dir"]. ``on_error_raise`` is only true when this
# module is executed as a script, not when it is imported.
PARAMS_ANNOTATIONS = P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__")

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
Sample = PipelineTracks.Sample

# One track per *.bed.gz file in the working directory; the regex
# group captures the part of the file name in front of ".bed.gz".
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("*.bed.gz"),
    r"(\S+).bed.gz")

# File names rebuilt from the tracks that were found.
TRACKS_BEDFILES = ["%s.bed.gz" % x for x in TRACKS]

###################################################################
# If pipeline_conf.py exists, execute it in this namespace so that it
# can change the above assignments.
###################################################################
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # exec(compile(...)) behaves like the Python-2-only execfile() at
    # module level and also runs on Python 3; the ``with`` block makes
    # sure the file handle is closed.
    with open("pipeline_conf.py") as _conf:
        exec(compile(_conf.read(), "pipeline_conf.py", 'exec'))

###################################################################
#
# ---------------------------------------------------------------
# Parameterization: each setting is looked up under its
# "rnaseqdiffexpression_" key first, then under the generic key,
# and finally falls back to the literal default.
# ---------------------------------------------------------------
EXPORTDIR = P.get(
    'rnaseqdiffexpression_exportdir',
    P.get('exportdir', 'export'))
DATADIR = P.get(
    'rnaseqdiffexpression_datadir',
    P.get('datadir', '.'))
DATABASE = P.get(
    'rnaseqdiffexpression_backend',
    P.get('sql_backend', 'sqlite:///./csvdb'))
# No fallback here: the key has to be present in P.
DATABASE_ANNOTATIONS = P['annotations_database']

# ---------------------------------------------------------------
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
# ---------------------------------------------------------------

# One track per .bam file found in DATADIR.
# NOTE(review): the regex is not prefixed with DATADIR, so the captured
# name presumably keeps the directory part of the globbed path -
# confirm against PipelineTracks.loadFromDirectory.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR),
    "(\S+).bam")

# Aggregations of the tracks by different label combinations.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(
    TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(
    TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(
    TRACKS, labels=("tissue", ))

# Further track collections built from files in the working directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    "(\S+).gtf.gz")
DESIGNS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("design*.tsv"),
    "(\S+).tsv")
METHODS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*_stats.tsv"),
    "(\S+)_stats.tsv")
import CGAT.Pipeline as Pipeline
# Parameters of pipeline_chipseq.py as found in the current directory.
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

import CGATPipelines.PipelineTracks as PipelineTracks

Sample = PipelineTracks.Sample3

# File name suffixes that are recognised as read data.
suffixes = ["export.txt.gz",
            "sra",
            "fastq.gz",
            "fastq.1.gz",
            "csfasta.gz"]

# Collect the tracks of every suffix and add them up, starting from an
# empty track container. Paths containing "input" are left out.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
         if "input" not in x],
        "%s/(\S+).%s" % (DATADIR, s))
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

# Aggregations of the tracks by different label combinations.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
# add configuration values from associated pipelines # # 1. pipeline_annotations: any parameters will be added with the # prefix "annotations_". The interface will be updated with # "annotations_dir" to point to the absolute path names. PARAMS.update(P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py", on_error_raise=__name__ == "__main__", prefix="annotations_", update_interface=True)) # define some tracks if needed TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.ini"), "(\S+).ini") # --------------------------< utility functions >---------------------------- # def connect(): '''Connect to database. Use this method to connect to additional databases. Returns an sqlite3 database handle. ''' dbh = sqlite3.connect(PARAMS["database"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement)
###################################################################
# Parameterization: all three settings are mandatory keys of P.
###################################################################
EXPORTDIR = P['rnaseqtranscripts_exportdir']
DATADIR = P['rnaseqtranscripts_datadir']
DATABASE = P['rnaseqtranscripts_backend']

###################################################################
# cf. pipeline_rnaseq.py
# This should be automatically gleaned from pipeline_rnaseq.py
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# One track per .bam file in DATADIR; the regex is anchored on DATADIR
# so that only the file name stem is captured.
# NOTE(review): DATADIR is interpolated into the regex unescaped -
# confirm it never contains regex metacharacters.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob("%s/*.bam" % DATADIR),
    "%s/(\S+).bam" % DATADIR)

# Aggregations of the tracks by different label combinations.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(
    TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(
    TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(
    TRACKS, labels=("tissue", ))

# Gene sets: one track per *.gtf.gz file in the working directory.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    "(\S+).gtf.gz")

###########################################################################
CUFFDIFF_LEVELS = ("gene", "isoform", "cds", "tss")

###########################################################################
# shorthand
###################################################
# load options from the config file
import CGAT.Pipeline as P
P.getParameters("pipeline.ini")
PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import CGATPipelines.PipelineTracks as PipelineTracks

# Collect tracks from *.fastq.gz and *.fastq.1.gz files in the working
# directory.
TRACKS = (
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.gz"), r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), r"(\S+).fastq.1.gz"))

# NOTE(review): ALL is a bare Sample3 instance here, not an Aggregate of
# TRACKS like the collections below - confirm this is intended.
ALL = PipelineTracks.Sample3()
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

###################################################################
# Global flags
###################################################################
ASSEMBLERS = P.asList(PARAMS["general_assemblers"])

# True if any of the listed assemblers is configured. The flag used to
# test for "ibda" only, which looks like a misspelling of the IDBA
# assembler; "idba" is now recognised as well and the old spelling is
# kept so that existing configurations behave as before.
METAGENOME = any(
    name in ASSEMBLERS
    for name in ("meta-velvet", "idba", "ibda", "cortex_var"))

# The list is re-read from the un-prefixed key and that value is the one
# left in ASSEMBLERS; METAGENOME above was computed from
# PARAMS["general_assemblers"].
ASSEMBLERS = P.asList(PARAMS["assemblers"])
################################################### # Pipeline configuration # load options from the config file from CGATCore import Pipeline as P P.getParameters([ "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini", "pipeline.ini" ]) PARAMS = P.PARAMS ################################################################### # Helper functions mapping tracks to conditions, etc GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz") TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3) TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), "(\S+).bam") REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", )) TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue")) def connect(): '''connect to database. Use this method to connect to additional databases. Returns a database connection. ''' dbh = sqlite3.connect(PARAMS["database_name"])
PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") ########################################################################## ########################################################################## # Helper functions mapping tracks to conditions, etc ########################################################################## import CGATPipelines.PipelineTracks as PipelineTracks Sample = PipelineTracks.AutoSample # define tracks based on all samples in .bamfile that are not input or index TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob(os.path.join(PARAMS["location_bamfiles"], "*.bam")), "(\S+).bam", exclude=[".+input.+"]) @files(None, None) def printTracks(infile, outfile): P.warn("\n\n\n\nprinting tracks:") for track in EXPERIMENTS: print "\t" print track def get_peak_caller_parameters(peak_caller_id): """ Returns a dictionary of config file parameters for the chosen peak caller (an attempt to keep access to PARAMS out of associated pipeline script).
"*.sra", "*.export.txt.gz", "*.csfasta.gz", "*.csfasta.F3.gz", ) SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name) for suffix_name in SEQUENCESUFFIXES]) SEQUENCEFILES_REGEX = regex( r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)") Sample = PipelineTracks.AutoSample Sample.attributes = ('tissue', 'condition', 'replicate') TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [y for x in SEQUENCESUFFIXES for y in glob.glob(x)], "(\S+).(fastq.1.gz|fastq.gz|sra)") EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition")) CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", )) REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", )) ######################################################################### # summarise read 3' ######################################################################### @follows(mkdir("sequence_characteristics.dir")) @transform(SEQUENCEFILES, SEQUENCEFILES_REGEX, r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv")
]) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks Sample = PipelineTracks.AutoSample # collect sra nd fastq.gz tracks TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.bam"), "(\S+).bam") # group by experiment (assume that last field is a replicate identifier) EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue")) GENESETS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz") ################################################################### ################################################################### ################################################################### def connect(): '''connect to database.
# Pipeline configuration
import CGAT.Pipeline as P
P.getParameters("pipeline_capseq.ini")
PARAMS = P.PARAMS
USECLUSTER = True

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Load all tracks. Files whose name contains PARAMS["tracks_control"]
# (input/control tracks) are excluded for every supported suffix.
Sample = PipelineTracks.Sample3

# The last clause used to read PARAMS["track_control"] (missing "s").
# That lookup is only evaluated once a *.csfasta.gz file is present, so
# the misspelt key went unnoticed until then; it now uses the same key
# as the other clauses.
TRACKS = (
    PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.export.txt.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).export.txt.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.sra")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).sra") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.fastq.1.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).fastq.1.gz") +
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        [x for x in glob.glob("*.csfasta.gz")
         if PARAMS["tracks_control"] not in x],
        r"(\S+).csfasta.gz"))

# Report the tracks that were found. A single pre-formatted string is
# printed so that the statement emits the same text under Python 2
# and Python 3.
for X in TRACKS:
    print("TRACK= %s \n" % (X,))
PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") PipelineiCLIP.PARAMS = PARAMS PipelineiCLIP.PARAMS_ANNOTATIONS = PARAMS_ANNOTATIONS PARAMS["project_src"] = os.path.join(os.path.dirname(__file__), "..") ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks # define some tracks if needed TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3) for line in IOTools.openFile("sample_table.tsv"): track = line.split("\t")[2] TRACKS.tracks.append(PipelineTracks.Sample3(filename=track)) ################################################################### def connect(): '''connect to database. Use this method to connect to additional databases. Returns a database connection. ''' dbh = sqlite3.connect(PARAMS["database"])
"pipeline_annotations.py", on_error_raise=__name__ == "__main__") # link up with ancestral repeats PARAMS_ANCESTRAL_REPEATS = P.peekParameters(PARAMS["ancestral_repeats_dir"], "pipeline_ancestral_repeats.py") ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks # collect sra nd fastq.gz tracks TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz", exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz")) TRACKS_CONTROL = PipelineTracks.Tracks( PipelineTracks.Sample).loadFromDirectory( ("repeats.gtf.gz", "introns.gtf.gz"), "(\S+).gtf.gz") TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( ("merged.gtf.gz", ), "(\S+).gtf.gz") TRACKS_GENESETS = PipelineTracks.Tracks( PipelineTracks.Sample).loadFromDirectory(("genes.gtf.gz", ), "(\S+).gtf.gz") # collection of all tracks including controls TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
################################################################### ################################################################### ################################################################### ## ################################################################### if os.path.exists("pipeline_conf.py"): L.info("reading additional configuration from pipeline_conf.py") exec(compile(open("pipeline_conf.py").read(), "pipeline_conf.py", 'exec')) PARAMS = P.getParameters() ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.gtf.gz"), "(\S+).gtf.gz", exclude=(".mapped.gtf.gz", )) ##################################################################### ##################################################################### ##################################################################### @transform(TRACKS.getTracks("%s.gtf.gz"), suffix(".gtf.gz"), '.psl.gz') def convertGtf2Psl(infile, outfile): """convert a gtf to a psl file. This method only takes features of type 'exon' and skips all contigs that are not in the genome sequence (for example the variant human chromosomes). """
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    The overrepresented sequences are read from the
    ``<table>_fastqc_Overrepresented_sequences`` tables in `dbh`. Only
    sources that are also listed in `contaminants_file` are written,
    using the full sequence given in that file. If no overrepresented
    sequence is found in the database, `outfile` is touched and
    nothing else is done.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.
    '''
    # Single-end input (including plain .fastq.gz) maps to one table,
    # paired-end input to one table per read.
    tracks = [track]
    paired_tracks = [track + "_fastq_1", track + "_fastq_2"]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        fastq_files, _fastq_format, _datatype = Sra.peek(infile)
        if len(fastq_files) == 2:
            tracks = paired_tracks
    elif infile.endswith(".fastq.1.gz"):
        tracks = paired_tracks

    found_contaminants = []
    cc = dbh.cursor()
    try:
        for t in tracks:
            table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

            # if sample name starts with a number, sql table will have
            # prepended "_"
            if re.match(r"\d", table):
                table = "_" + table

            # The table name cannot be bound as a query parameter, hence
            # the string interpolation.
            query = '''SELECT Possible_Source, Sequence FROM
            %s_fastqc_Overrepresented_sequences;''' % table

            # if there is no contamination table for even a single sample
            # it will prevent the whole pipeline progressing
            try:
                found_contaminants.extend(cc.execute(query).fetchall())
            except sqlite3.OperationalError:
                E.warn("No table found for {}".format(t))
    finally:
        cc.close()

    if not found_contaminants:
        P.touch(outfile)
        return

    # read contaminants from existing file: the last whitespace separated
    # field of a line is the sequence, everything before it is the name
    with IOTools.openFile(contaminants_file, "r") as inf:
        fields = [line.split() for line in inf
                  if not line.startswith("#") and line.strip()]
    known_contaminants = {" ".join(x[:-1]): x[-1] for x in fields}

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!
    # The text in front of " (" in the reported source is the name that
    # is looked up.
    reported_sources = {source.split(" (")[0]
                        for source, _sequence in found_contaminants}
    matched_contaminants = {source for source in reported_sources
                            if source in known_contaminants}

    with IOTools.openFile(outfile, "w") as outf:
        # sorted, so that the output does not depend on set ordering
        for match in sorted(matched_contaminants):
            outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                      known_contaminants[match]))
PipelineMedip.PARAMS = PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Load all tracks. Files whose name contains PARAMS["tracks_control"]
# (input/control tracks) are excluded.
Sample = PipelineTracks.Sample3

# NOTE(review): "cfastq.1.gz" looks like a typo for "fastq.1.gz"; as it
# stands, *.fastq.1.gz files are not picked up as tracks. Left unchanged
# because adding paired-end tracks may affect downstream tasks - confirm.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "cfastq.1.gz", "csfasta.gz"]

# Collect the tracks of every suffix and add them up, starting from an
# empty track container.
TRACKS = sum(
    [PipelineTracks.Tracks(Sample).loadFromDirectory(
        [x for x in glob.glob("*.%s" % s)
         if PARAMS["tracks_control"] not in x],
        r"(\S+).%s" % s)
     for s in suffixes],
    PipelineTracks.Tracks(Sample))

###################################################################
# If pipeline_conf.py exists, execute it in this namespace so that it
# can change the above assignments.
###################################################################
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # read inside a ``with`` block so that the file handle is closed
    with open("pipeline_conf.py") as _conf:
        exec(compile(_conf.read(), "pipeline_conf.py", 'exec'))

###################################################################
# define aggregates
from CGATCore import Pipeline as P import CGATPipelines.PipelineTracks as PipelineTracks # load options from the config file P.getParameters([ "%s/pipeline.ini" % __file__[:-len(".py")], "../pipeline.ini", "pipeline.ini" ]) PARAMS = P.PARAMS PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"], "pipeline_annotations.py") Sample = PipelineTracks.Sample TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("medip_*"), "medip_(\S+)") def connect(): '''connect to database. This method also attaches to helper databases. ''' dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close()
PipelineMotifs.PARAMS = PARAMS ################################################################### ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks # determine the location of the input files (reads). DATADIR = PARAMS.get('input', '.') if not os.path.exists(DATADIR): raise OSError('data directory %s does not exists') Sample = PipelineTracks.Sample TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob(os.path.join(DATADIR, "*.bed.gz")), "(\S+).bed.gz") BEDFILES = [os.path.join(DATADIR, "%s.bed.gz") % x for x in TRACKS] # create an indicator target @transform(BEDFILES, suffix(".gz"), ".gz") def BedFiles(infile, outfile): pass BAMFILES = glob.glob(os.path.join(DATADIR, "*.bam")) def getAssociatedBAMFiles(track): '''return a list of BAM files associated with a track.
################################################################### ################################################################### # parameterization EXPORTDIR = P.get('readqc_exportdir', P.get('exportdir', 'export')) DATADIR = P.get('readqc_datadir', P.get('datadir', '.')) DATABASE = P.get('readqc_backend', P.get('sql_backend', 'sqlite:///./csvdb')) ################################################################### # cf. pipeline_rnaseq.py # This should be automatically gleaned from pipeline_rnaseq.py ################################################################### import CGATPipelines.PipelineTracks as PipelineTracks TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("%s/*.sra" % DATADIR), "(\S+).sra") +\ PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("%s/*.fastq.gz" % DATADIR), "(\S+).fastq.gz") +\ PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("%s/*.fastq.1.gz" % DATADIR), "(\S+).fastq.1.gz") +\ PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory( glob.glob("*.csfasta.gz"), "(\S+).csfasta.gz") ########################################################################### class ReadqcTracker(TrackerSQL): '''Define convenience tracks for plots''' def __init__(self, *args, **kwargs): TrackerSQL.__init__(self, *args, backend=DATABASE, **kwargs)
dbh = sqlite3.connect(PARAMS["database_name"]) statement = '''ATTACH DATABASE '%s' as annotations''' % ( PARAMS["annotations_database"]) cc = dbh.cursor() cc.execute(statement) cc.close() return dbh class MySample(PipelineTracks.Sample): attributes = tuple(PARAMS["attributes"].split(",")) TRACKS = PipelineTracks.Tracks(MySample).loadFromDirectory( glob.glob("*.bam"), "(\S+).bam") Sample = PipelineTracks.AutoSample DESIGNS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob("*.design.tsv"), "(\S+).design.tsv") ################################################################### ################################################################### ################################################################### # DEXSeq workflow ################################################################### @mkdir("results.dir") @files(PARAMS["annotations_interface_geneset_all_gtf"], "geneset_flat.gff") def buildGff(infile, outfile):