###################################################################
# cf. pipeline_chipseq.py
# This should be automatically gleaned from pipeline_chipseq.py
###################################################################
PARAMS_PIPELINE = Pipeline.peekParameters(".", "pipeline_chipseq.py")

# Sample class used for every Tracks collection built below.
Sample = PipelineTracks.Sample3

# File suffixes that are looked for inside DATADIR.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "fastq.1.gz", "csfasta.gz"]

# One Tracks collection is loaded per suffix and all of them are added onto
# an empty collection. Files with "input" in their path are left out.
# The pattern is a raw string so that "\S" reaches the regex engine without
# being an invalid string escape (SyntaxWarning on recent Python versions).
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("%s/*.%s" % (DATADIR, s))
             if "input" not in x],
            r"%s/(\S+).%s" % (DATADIR, s))
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

Sample.setDefault("asTable")

# Aggregates over TRACKS: without labels (ALL) and by the listed labels.
ALL = PipelineTracks.Aggregate(TRACKS)
EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))
CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", ))
TISSUES = PipelineTracks.Aggregate(TRACKS, labels=("tissue", ))

############################################################################
# The following need to be parameterized in a config file
# TISSUES=["GM00855", "GM00861" ]
# CONDITIONS=["D3", "unstim" ]
# REPLICATES=["R1", "R2" ]
import sqlite3

from cgatcore import pipeline as P
import cgatPipelines.PipelineTracks as PipelineTracks

# load options from the config file
P.getParameters(["%s/pipeline.ini" % __file__[:-len(".py")],
                 "../pipeline.ini",
                 "pipeline.ini"])
PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

Sample = PipelineTracks.Sample

# Tracks are derived from every path matching "medip_*" in the working
# directory. The pattern is a raw string so "\S" is not an invalid escape.
TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory(
    glob.glob("medip_*"), r"medip_(\S+)")


def connect():
    '''connect to database.

    This method also attaches to helper databases.

    Opens the sqlite database named by ``PARAMS["database_name"]`` and
    attaches the database at ``PARAMS["annotations_database"]`` under the
    schema name ``annotations``.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])
    statement = '''ATTACH DATABASE '%s' as annotations''' % (
        PARAMS["annotations_database"])
    cc = dbh.cursor()
    try:
        cc.execute(statement)
    finally:
        # release the cursor even if the ATTACH statement fails
        cc.close()
    # NOTE(review): the visible part of this function ends here; presumably
    # the connection `dbh` is meant to be returned - confirm.
PipelineMedip.PARAMS = PARAMS

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = PipelineTracks.Sample3

# NOTE(review): "cfastq.1.gz" may be a typo for "fastq.1.gz" - confirm.
suffixes = ["export.txt.gz", "sra", "fastq.gz", "cfastq.1.gz", "csfasta.gz"]

# One Tracks collection is loaded per suffix from the working directory and
# all of them are added onto an empty collection. Files whose name contains
# PARAMS["tracks_control"] are left out. The pattern is a raw string so that
# "\S" is not an invalid string escape.
TRACKS = sum(
    [
        PipelineTracks.Tracks(Sample).loadFromDirectory(
            [x for x in glob.glob("*.%s" % s)
             if PARAMS["tracks_control"] not in x],
            r"(\S+).%s" % s)
        for s in suffixes
    ],
    PipelineTracks.Tracks(Sample))

###################################################################
###################################################################
###################################################################
# if pipeline_conf.py exists: execute it to change the above assignments
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # NOTE: pipeline_conf.py is executed as Python code in this module's
    # namespace; it must come from a trusted source.
    # The context manager makes sure the file handle is closed again.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

###################################################################
###################################################################
###################################################################
# define aggregates
"*.sra", "*.export.txt.gz", "*.csfasta.gz", "*.csfasta.F3.gz", ) SEQUENCEFILES = tuple( [os.path.join(DATADIR, suffix_name) for suffix_name in SEQUENCESUFFIXES]) SEQUENCEFILES_REGEX = regex( r"(\S+)-(\S+)-(\S+).(?P<suffix>fastq.1.gz|fastq.gz|sra)") Sample = PipelineTracks.AutoSample Sample.attributes = ('tissue', 'condition', 'replicate') TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [y for x in SEQUENCESUFFIXES for y in glob.glob(x)], "(\S+).(fastq.1.gz|fastq.gz|sra)") EXPERIMENTS = PipelineTracks.Aggregate(TRACKS, labels=("tissue", "condition")) CONDITIONS = PipelineTracks.Aggregate(TRACKS, labels=("condition", )) REPLICATES = PipelineTracks.Aggregate(TRACKS, labels=("replicate", )) ######################################################################### # summarise read 3' ######################################################################### @follows(mkdir("sequence_characteristics.dir")) @transform(SEQUENCEFILES, SEQUENCEFILES_REGEX, r"sequence_characteristics.dir/\1-\2-\3.\g<suffix>_start.tsv") def summariseReadStart(infile, outfile):
###################################################################
###################################################################
###################################################################
##
###################################################################
if os.path.exists("pipeline_conf.py"):
    L.info("reading additional configuration from pipeline_conf.py")
    # NOTE: pipeline_conf.py is executed as Python code in this module's
    # namespace; it must come from a trusted source.
    # The context manager makes sure the file handle is closed again.
    with open("pipeline_conf.py") as _conf_file:
        exec(compile(_conf_file.read(), "pipeline_conf.py", 'exec'))

PARAMS = P.getParameters()

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Tracks come from the *.gtf.gz files in the working directory;
# ".mapped.gtf.gz" is passed as an exclusion. The pattern is a raw string
# so that "\S" is not an invalid string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=(".mapped.gtf.gz", ))


#####################################################################
#####################################################################
#####################################################################
@transform(TRACKS.getTracks("%s.gtf.gz"), suffix(".gtf.gz"), '.psl.gz')
def convertGtf2Psl(infile, outfile):
    """convert a gtf to a psl file.

    This method only takes features of type 'exon' and skips all
    contigs that are not in the genome sequence (for example the
    variant human chromosomes).
    """
# load options from the config file
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])
PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# Tracks come from the *.fastq.gz files in the working directory. The
# pattern is a raw string so that "\S" is not an invalid string escape.
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.fastq.gz"), r"(\S+).fastq.gz")

USECLUSTER = True


###################################################################
###################################################################
###################################################################
def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
###################################################
# Pipeline configuration
# load options from the config file
from cgatcore import pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini",
])
PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
# Gene sets come from the *.gtf.gz files and tracks from the *.bam files in
# the working directory. The patterns are raw strings so that "\S" is not
# an invalid string escape.
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), r"(\S+).gtf.gz")

TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), r"(\S+).bam")

# Aggregates over TRACKS by the listed labels.
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
USECLUSTER = True P.getParameters(["%s.ini" % os.path.splitext(__file__)[0], "pipeline.ini"]) PARAMS = P.PARAMS ################################################################### ################################################################### ################################################################### # Helper functions mapping tracks to conditions, etc ################################################################### # load all tracks - exclude input/control tracks Sample = PipelineTracks.Sample3 #TRACKS = PipelineTracks.Tracks( PipelineTracks.Sample3 ).loadFromDirectory( [x for x in glob.glob( "*.fastq.gz" ) if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.export.txt.gz") if PARAMS["tracks_control"] not in x], "(\S+).export.txt.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.sra") if PARAMS["tracks_control"] not in x], "(\S+).sra" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "") for x in glob.glob("*.fastq.1.gz") if PARAMS["tracks_control"] not in x], "(\S+).fastq.1.gz" ) +\ PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory( [x.replace("../", "")
# link up with annotations
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

# link up with ancestral repeats
PARAMS_ANCESTRAL_REPEATS = P.peekParameters(
    PARAMS["ancestral_repeats_dir"],
    "pipeline_ancestral_repeats.py")

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# All patterns below are raw strings so that "\S" is not an invalid string
# escape.

# collect the *.gtf.gz tracks, leaving out the control and merged files
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"),
    r"(\S+).gtf.gz",
    exclude=("repeats.gtf.gz", "introns.gtf.gz", "merged.gtf.gz"))

# control tracks: repeats and introns
TRACKS_CONTROL = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(
        ("repeats.gtf.gz", "introns.gtf.gz"),
        r"(\S+).gtf.gz")

# track built from merged.gtf.gz
TRACKS_META = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    ("merged.gtf.gz", ),
    r"(\S+).gtf.gz")

# track built from genes.gtf.gz
TRACKS_GENESETS = PipelineTracks.Tracks(
    PipelineTracks.Sample).loadFromDirectory(
        ("genes.gtf.gz", ),
        r"(\S+).gtf.gz")

# collection of all tracks including controls
TRACKS_WITH_CONTROLS = TRACKS + TRACKS_CONTROL
"pipeline.ini"]) PARAMS = P.PARAMS PARAMS.update(P.peekParameters( PARAMS["annotations_dir"], "pipeline_annotations.py", prefix="annotations_", update_interface=True)) # Helper functions mapping tracks to conditions, etc Sample = PipelineTracks.AutoSample # define tracks based on all samples in .bamfile that are not input or index TRACKS = PipelineTracks.Tracks(Sample).loadFromDirectory( glob.glob(os.path.join(PARAMS.get("location_bamfiles", ""), "*.bam")), "(\S+).bam", exclude=[".+input.+"]) @files(None, None) def printTracks(infile, outfile): P.warn("\n\n\n\nprinting tracks:") for track in EXPERIMENTS: print("\t") print(track) def get_peak_caller_parameters(peak_caller_id): """ Returns a dictionary of config file parameters for the chosen peak caller (an attempt to keep access to PARAMS out of associated pipeline script).