def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)

    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir,
                                         followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore first level (tools) (needs better check)
            depth = root[len(data_dir):].count(os.sep)
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as {len(infiles)} files reached")
                break

    E.info("found {} potential benchmark.info files to upload".format(
        len(infiles)))

    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # find all files of interest
    oldwd = os.getcwd()
    os.chdir(data_dir)
    upload_result(infiles, "results.commit", PARAMS)
    os.chdir(oldwd)

    E.stop()
def test_job_should_fail_if_cancelled(self):
    if not P.will_run_on_cluster(P.get_parameters()):
        return
    if QUEUE_MANAGER == "slurm":
        self.assertRaises(
            OSError,
            P.run,
            "scancel $SLURM_JOB_ID",
            to_cluster=self.to_cluster)
    elif QUEUE_MANAGER == "sge":
        self.assertRaises(
            OSError,
            P.run,
            "qdel $SGE_TASK_ID",
            to_cluster=self.to_cluster)
def test_job_should_fail_if_too_little_memory_required(self):
    outfile = os.path.join(self.work_dir, "out")
    if P.get_parameters()['os'] == 'Linux':
        self.assertRaises(
            OSError,
            P.run,
            "python -c 'import numpy; "
            "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
            "out = open(\"{outfile}\", \"w\"); "
            "out.write(str(len(a)) + \"\\n\"); "
            "out.close()'".format(
                memory=self.test_memory_size,
                outfile=outfile),
            to_cluster=self.to_cluster,
            cluster_memory_ulimit=True,
            job_memory="{}G".format(0.5 * self.test_memory_size / 10**9))
    else:
        pass
def test_job_should_fail_if_too_little_memory_required_in_second_statement(self):
    outfile = os.path.join(self.work_dir, "out")
    infile = "arv=by_id/glon1-4zz18-3cbje7tmr0nitut/study_list.txt"
    if P.get_parameters()['os'] == 'Linux':
        self.assertRaises(
            OSError,
            P.run,
            "hostname > {outfile}; "
            "python -c 'import numpy; "
            "a = numpy.array(numpy.arange(0, {memory}), numpy.int8); "
            "out = open(\"{outfile}\", \"w\"); "
            "out.write(str(len(a)) + \"\\n\"); "
            "out.close()'".format(
                memory=self.test_memory_size,
                infile=infile,
                outfile=outfile),
            to_cluster=self.to_cluster,
            cluster_memory_ulimit=True,
            job_memory="{}G".format(0.5 * self.test_memory_size / 10**9))
    else:
        pass
def count_genes(infile, outfile):
    # counts the number of lines; each line is one transcript
    statement = '''wc -l %(infile)s > %(outfile)s'''
    P.run(statement)
    # if on cbrg you have to specify the job queue. Runs and saves a single
    # count file per chromosome; alternatively len(chr1.gtf) would count the
    # number of lines in the file


@merge(count_genes, 'all.average')
def average(infiles, outfile):
    # each count file has 2 items, e.g. 100 (no. of transcripts) chr1.gtf (original file name)
    total_counts = {}  # dictionary: chromosome -> count
    for infile in infiles:
        with open(infile) as inf:
            # .read() reads the line, .strip() removes whitespace,
            # .split() gives the two fields
            count, chrom = inf.read().strip().split()
            total_counts[chrom] = int(count)  # key = chrom, value = count as integer
    median = statistics.median(total_counts.values())  # median of the integer counts
    with open(outfile, 'w') as count:
        for key, value in total_counts.items():
            count.write(f'{key}\t{value}\n')  # write the dictionary
        count.write(f'Median\t{median}\n')  # write the median


if __name__ == "__main__":
    sys.exit(P.main(sys.argv))

# With this block at the bottom the pipeline can be run from the command line,
# e.g. $ python workflow.py make <task> to run the full pipeline.
# To set the input file via a config file, create workflow.yml in the same directory with:
#   gtf: /ifs/obds-training/lingf/obds/devel/OBDS_Training_Sep_2019/genes.gtf.gz
# Then, after the import statements, call P.get_parameters('workflow.yml')
# and change the decorators, e.g. @split(P.PARAMS['gtf'], 'chr*.gtf').
# P.PARAMS is a dictionary: 'gtf' is the key and the file path is the value.
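# A minimal sketch of the config-file variant described in the comments above:
# workflow.yml holds "gtf: /path/to/genes.gtf.gz", P.get_parameters() loads it into
# P.PARAMS, and the decorator reads the path from that dictionary. The awk one-liner
# and the task name split_gtf are illustrative assumptions, not the author's code.

import sys
from ruffus import split
from cgatcore import pipeline as P

P.get_parameters('workflow.yml')  # populates P.PARAMS with keys from the YAML file


@split(P.PARAMS['gtf'], 'chr*.gtf')  # P.PARAMS['gtf'] is the file path set in workflow.yml
def split_gtf(infile, outfiles):
    # write one GTF per chromosome; assumes column 1 is the contig and skips header lines
    statement = '''zcat %(infile)s | awk '!/^#/ {print > ($1".gtf")}' '''
    P.run(statement)


if __name__ == "__main__":
    sys.exit(P.main(sys.argv))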
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex", action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories", action="append",
        help="directory with task functions. Will be added to the built-in "
        "library and the one specified in the DAISY_TASKLIBRARY environment "
        "variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount", action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp", action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "tasks", "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))

    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt, Loader=yaml.FullLoader)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory, raw_txt),
                                    Loader=yaml.FullLoader)

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()

    return failed
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 30 15:37:44 2019

@author: asmith
"""
import os
import sys

import numpy as np
import pandas as pd
from ruffus import *
from cgatcore import pipeline as P

# Put parameter YAML here
P.get_parameters('RNA_seq.yml')


@follows(mkdir('fastqc'))
@transform('*.fastq.gz', regex(r'(.*_.*).fastq.gz'), r'fastqc/\1_fastqc.zip')
def run_fastqc(infile, outfile):
    cmd = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc/'
    P.run(cmd,
          job_queue=P.PARAMS['queue'],
          job_threads=P.PARAMS['threads'])


@follows(mkdir('bam'))
@collate('*.fastq.gz', regex(r'(.*)_[12].fastq.gz'), r'bam/\1.bam')
def align_fastq_paired(infiles, outfile):
    if not P.PARAMS['hisat_options']:
        P.PARAMS['hisat_options'] = ''
def main(argv=None):

    # Parse the options
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-p", "--params", "--args", dest="params", type="string",
                      help="comma separated list of additional parameter strings")

    parser.add_option("-m", "--module", dest="module", type="string",
                      help="the full path to the module file", default=None)

    parser.add_option("-i", "--input", dest="input_filenames", type="string",
                      action="append", help="input filename")

    parser.add_option("-o", "--output-section", dest="output_filenames",
                      type="string", action="append", help="output filename")

    parser.add_option("-f", "--function", dest="function", type="string",
                      help="the module function", default=None)

    parser.set_defaults(input_filenames=[], output_filenames=[], params=None)

    (options, args) = E.start(parser)

    # Check that both a module and a function have been specified
    if not options.module or not options.function:
        raise ValueError("both a function and a module must be specified")

    # initialize defaults
    P.get_parameters()

    # If a full path was given, add this path to the system path
    location = os.path.dirname(options.module)
    if location != "":
        sys.path.append(location)

    # Establish the module name, accommodating cases where the
    # .py extension has been included in the module name
    module_name = os.path.basename(options.module)
    if module_name.endswith(".py"):
        module_base_name = module_name[:-3]
    else:
        module_base_name = module_name

    # Import the specified module and map the specified function
    E.info("importing module '%s' " % module_base_name)
    E.debug("sys.path is: %s" % sys.path)
    module = importlib.import_module(module_base_name)

    try:
        function = getattr(module, options.function)
    except AttributeError as msg:
        raise AttributeError(
            str(msg) + ": unknown function, available functions are: %s" %
            ",".join([x for x in dir(module) if not x.startswith("_")]))

    if options.input_filenames and not options.input_filenames == ["None"]:
        infiles = options.input_filenames
    else:
        infiles = False

    if options.output_filenames and not options.output_filenames == ["None"]:
        outfiles = options.output_filenames
    else:
        outfiles = False

    # Parse the parameters into an array
    if options.params:
        params = [param.strip() for param in options.params.split(",")]
    else:
        params = False

    # deal with the single-file case
    if infiles and len(infiles) == 1:
        infiles = infiles[0]
    if outfiles and len(outfiles) == 1:
        outfiles = outfiles[0]

    # Make the function call
    if infiles and outfiles and params:
        function(infiles, outfiles, params)
    elif infiles and outfiles and not params:
        function(infiles, outfiles)
    elif params:
        function(params)
    else:
        raise ValueError(
            "expecting infile+outfile+params or infile+outfile or params")

    E.stop()
from cgatcore import pipeline as P
import os
import sqlite3
import re
import pandas as pd
import gzip

# Pipeline configuration
P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"
])

PARAMS = P.PARAMS

db = PARAMS['database']['url'].split('./')[1]


#####################################################
####             Helper functions                ####
#####################################################

def isPaired(files):
    '''Check whether input files are single or paired end.

    Note: this is dependent on files having the correct suffix.'''
    paired = []
    for fastq in files:
        Fpair = re.findall(".*.fastq.1.gz", fastq)
        paired = paired + Fpair
# load modules
from ruffus import *

import os
import sys
import re
import subprocess

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################

# load options from the config file
import cgatcore.pipeline as P

P.get_parameters([
    "%s/pipeline.yml" % __file__[:-len(".py")],
    "../pipeline.yml",
    "pipeline.yml"
])

PARAMS = P.PARAMS

from pipeline_assembly import PipelineAssembly
from pipeline_annotate import PipelineAnnotate

# get all files within the directory to process
SEQUENCEFILES = ("*.fasta", "*.fasta.gz", "*.fasta.1.gz", "*.fasta.1",
                 "*.fna", "*.fna.gz", "*.fna.1.gz", "*.fna.1",
                 "*.fa", "*.fa.gz", "*.fa.1.gz", "*.fa.1",
                 "*.fastq", "*.fastq.gz", "*.fastq.1.gz", "*.fastq.1")

SEQUENCEFILES_REGEX = regex(
    r"(\S+).(fasta$|fasta.gz|fasta.1.gz|fasta.1|fna$|fna.gz|fna.1.gz|fna.1|"
    r"fa$|fa.gz|fa.1.gz|fa.1|fastq$|fastq.gz|fastq.1.gz|fastq.1)")
""" #Capture-C pipeline exercise - files required: read1.fastq and read2.fastq, fragment.txt Hba-1 mouse globin locus #fastqc code as before #trimming using trim-galore - need to use collate decorator as need to put in reads as pair for trimming #FLASH (Fast Length Adjustment of SHort reads) is a very fast and accurate software tool to merge paired-end reads from #next-generation sequencing experiments. FLASH is designed to merge pairs of reads when the original DNA fragments are shorter than #twice the length of reads. The resulting longer reads can significantly improve genome assemblies. #@collate is for collating pairs or groups; @merge is to merge all samples, irrespective of grouping import sys from cgatcore import pipeline as P from ruffus import * P.get_parameters('capturec_pipeline.yml') @follows(mkdir('fastqc')) @transform('*.fastq.gz', regex(r'(.*).fastq.gz'),r'fastqc/\1_fastqc.html') def qc_reads(infile, outfile): statement = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc' P.run(statement, job_queue = P.PARAMS['queue'], job_memory = P.PARAMS['memory'] job_threads = P.PARAMS['threads']) @follows (mkdir('trim')) @collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'trim/\1_1_val_1.fq.gz') def trim(infiles, outfile): ''' Trim fastq files''' fq1, fq2 = infiles
'''
run cellranger on 10X fastq files
'''
import gzip
import re

import pandas as pd
from ruffus import *
from cgatcore import pipeline as P
import sys

params = P.get_parameters("pipeline_cellranger.yml")  # define params here

samples = pd.read_csv("cellranger_samples.csv")
samples.set_index('name', inplace=True)
print(samples)


@follows(mkdir("count"))
@transform('data/*/.sample',
           regex(r'data/(.+)/.sample'),
           r'count/\1/outs/filtered_feature_bc_matrix.h5')
def cellranger_count(infile, outfile):
    # python re module looks for the regular expression; group(1) is equivalent to '\1'
    sampleid = re.search('data/(.+)/.sample', infile).group(1)
    print(sampleid)
    fastqs = samples['fastqs'][sampleid]
    cellnumber = samples['cells'][sampleid]
    chemistry = samples['chemistry'][sampleid]
    statement = '''cellranger count
                   --id=%(sampleid)s
                   --transcriptome=%(cellrangercount_transcriptome)s
                   --fastqs=%(fastqs)s
import cgatcore.experiment as E
import cgatcore.iotools as IOTools
import cgatcore.database as Database
import cgat.FastaIterator as FastaIterator
import numpy as np
from PipelinePrimerDesign import PrimerSet

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################

# load options from the config file
import cgatcore.pipeline as P

P.get_parameters(["pipeline.yml"])

PARAMS = P.PARAMS

###################################################
###################################################
###################################################


def readIdentifiers(identifiers):
    '''return a list of identifiers from a file'''
    ids = [x.strip("\n") for x in IOTools.open_file(identifiers).readlines()]
    return ids
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.ini in the current
# directory and the common one.
#
# PATH where code for pipelines is stored
pipelinesdir = os.path.dirname(cgatpipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(cgatpipelines.__file__),
                       'configuration', 'pipeline.yml')

PARAMS = P.get_parameters([inifile, "pipeline.yml"])

# Definition now part of cgatReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
################################################################
################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
    merge,
    originate,
    collate,
    regex,
    add_inputs,
    active_if,
)
from cgatcore.iotools import zap_file, touch_file
from utils import is_none, is_on

##################
# Pipeline setup #
##################

# Read in parameter file
P.get_parameters("config_rna.yml")

# Small edits to config to enable cluster usage
P.PARAMS["cluster_queue_manager"] = P.PARAMS.get("pipeline_cluster_queue_manager")
P.PARAMS["conda_env"] = os.path.basename(os.environ["CONDA_PREFIX"])

# Make sure that the params dict is typed correctly
for key in P.PARAMS:
    if is_none(P.PARAMS[key]):
        P.PARAMS[key] = None
    elif is_on(P.PARAMS[key]):
        P.PARAMS[key] = True

# Global variables
CREATE_BIGWIGS = P.PARAMS.get("run_options_bigwigs")
def main(argv=None):

    parser = get_option_parser()

    (options, args) = E.start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:
        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    P.get_parameters()
    P.start_session()

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        statements = [build_command(x) for x in data]
        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.stop()
            sys.exit(0)

        P.run(statements)
    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:
            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests,
                                              options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = iotools.open_file(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".png", ):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = iotools.open_file(options.output_pattern % filename,
                                            "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests),
            len(started_requests) - len(failed_requests),
            len(failed_requests),
            niterations))

    E.stop()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 10:16:38 2020

@author: sumeet
"""
from ruffus import *
from cgatcore import pipeline as P
import sys

# Import parameters
Params = P.get_parameters("pipeline_rna_seq.yml")


# This part takes the fastq files and generates the fastqc html files
@transform('*.fastq.gz', suffix('.fastq.gz'), '_fastqc.html')
def fastqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)


# Next we want to run multiqc to make a nice report containing the fastqc
# output from each fastq file.
# We want to merge the input from the fastqc files into a multiqc report:
# - first use the @follows decorator to make a directory for the output
# - use the output of fastqc as input to @merge; the output will be the multiqc.html report
# - define the multiqc function, which needs multiple infiles
# - run multiqc on . to look in the current directory
# - name the output file using -n and specify the output directory using -o
# (a hedged sketch of this step follows below)
@follows(mkdir('multiqc_reports'))
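# A minimal sketch of the multiqc step described in the comments above; the task
# name, inputs and exact statement are assumptions based on those comments, not
# the author's code.
@merge(fastqc, 'multiqc_reports/multiqc_report.html')
def multiqc(infiles, outfile):
    # search the current directory (.), name the report with -n and send it to
    # the output directory with -o, as described above
    statement = '''multiqc . -n multiqc_report.html -o multiqc_reports'''
    P.run(statement)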
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ChIP-seq pipeline
"""
# Copy files from shared to my directory. They are already symbolic links, so copy with:
# cp -d /ifs/obds-training/apr20/shared/week3/chipseq/* .
# The script will be rsynced:
# rsync -a /Users/rhodgson/GitHub/OBDS_Training_Apr_2020/chipseq_pipeline.* [email protected]:/ifs/obds-training/apr20/rose/pipelines/chipseqpipeline

import sys
from ruffus import *
from cgatcore import pipeline as P

params = P.get_parameters("chipseq_pipeline.yml")

# fastq input
# Run fastqc on the files and put the output in a new directory; name the file
# using the first part of the regex.
# --nogroup is a fastqc option: in the plots it does not average over bases,
# so you get a value for every base.
# params["q"] in the param file holds the queue name, so the pipeline can be
# moved from one queue to another.
@follows(mkdir("fastqc"))
@transform("*.fastq.gz", regex(r"(.*).fastq.gz"), r"fastqc/\1_fastqc.html")
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s > %(outfile)s.log"
    P.run(statement, job_queue=params["q"])

# Next bit - multiqc
.. glossary::

Code
====

"""
import sys
import os
import glob
from pathlib import Path
from ruffus import *
from cgatcore import pipeline as P

# load options from the config file
PARAMS = P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "pipeline.yml"])

# get all files within the directory to process
SEQUENCEFILES = ("*fastq.gz")
SEQUENCEFILES_REGEX = regex(r"(\S+).(fastq.gz)")

scriptsdir = os.path.dirname(os.path.abspath(__file__))
scriptsdir = P.snip(scriptsdir, "pipelines") + "scripts"
PARAMS["scriptsdir"] = scriptsdir

reportdir = os.path.dirname(os.path.abspath(__file__))
reportdir = os.path.join(reportdir, "pipeline_docs", "Rmd")
PARAMS["reportdir"] = reportdir

########################################################
##############################################
# Simple pipeline to run prokka on multiple
# assemblies
##############################################
##############################################
##############################################

from ruffus import *
import cgatcore.pipeline as P
import cgatcore.iotools as IOTools
import os
import sys
import collections
import cgatcore.experiment as E

PARAMS = P.get_parameters(filenames=["pipeline.yml"])

##############################################
##############################################
##############################################


@follows(mkdir("annotations.dir"))
@transform("*.fna.gz", regex(r"(\S+).fna.gz"), r"annotations.dir/\1/\1.tsv")
def runProkka(infile, outfile):
    '''run prokka annotations - very basic with no parameterisation'''
    newdirname = os.path.join("annotations.dir", P.snip(infile, ".fna.gz"))
    job_memory = PARAMS["prokka_memory"]
def setUp(self):
    BaseTest.setUp(self)
    P.get_parameters()
"""Salmon alevin""" import re import pandas as pd from ruffus import * from cgatcore import pipeline as P import sys import glob import os params = P.get_parameters("project_alevin.yml") # define params here samples = pd.read_csv("cellranger_samples.csv") samples.set_index('name', inplace=True) print(samples) def get_gex_fastq(dir): '''Docstring''' fastq1_pattern = params["pattern"]["fastq1"] fastq1_glob = f"{dir}/*{fastq1_pattern}*" fastq1 = glob.glob(fastq1_glob) if len(fastq1) == 0: raise OSError(f"No file matched pattern: {fastq1_glob}") fastq2 = [ file.replace(params["pattern"]["fastq1"], params["pattern"]["fastq2"]) for file in fastq1 ] for file in fastq2: if not os.path.exists(file): raise OSError(f"Paired file not found: {file}") return {'fastq1': fastq1, 'fastq2': fastq2}
def connect():
    '''connect to database.

    Use this method to connect to additional databases.

    Returns a database connection.
    '''
    dbh = sqlite3.connect(PARAMS["database_name"])
    return dbh


#########################################################################
P.get_parameters(
    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"],
    defaults={'paired_end': False},
    only_import=__name__ != "__main__")

PARAMS = P.PARAMS

mapping.PARAMS = PARAMS
mappingqc.PARAMS = PARAMS
exome.PARAMS = PARAMS

#########################################################################
#########################################################################
# Load manual annotations
#########################################################################
'''
RNAseq pipeline

process fastq files into count files/matrices
'''
from ruffus import *
from cgatcore import pipeline as P
import sys

params = P.get_parameters("rnaseq_pipeline.yml")


@follows(mkdir("fastqc"))
@transform("*.fastq.gz", regex(r'(.*).fastq.gz'), r'fastqc/\1_fastqc.html')
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s "
    P.run(statement,
          job_queue='all.q',
          job_threads=1,
          job_memory='2G',
          job_condaenv='obds-py3')


@merge(fastqc, r'fastqc/multiqc_report.html')
def multiqc(infiles, outfile):
    statement = "multiqc -f -n %(outfile)s fastqc"
    P.run(statement,
          job_queue='all.q',
          job_threads=1,
          job_memory='2G',
          job_condaenv='obds-py3')
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option(
        "-l", "--link", dest="link", action="store_true",
        help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = workflow.expand_globs(config["input"])
    input_combos = workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))

        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [x for x, y in list(counts.items())
                           if y == max_count]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write(
            "\t".join(map(str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
queue: all.q
threads: 12
memory: 8G
bowtie2:
    options:
    ref: /ifs/mirror/genomes/bowtie/mm10
picard:
    ref: /ifs/mirror/genomes/plain/mm10.fasta
"""
import sys
import gzip
from cgatcore import pipeline as P
from ruffus import *

P.get_parameters('chipseq_pipeline.yml')


@follows(mkdir('fastqc'))  # same as before, can run independently of other processes
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'), r'fastqc/\1_fastqc.html')
def qc_reads(infile, outfile):
    statement = 'fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc'
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_memory=P.PARAMS['memory'])


@follows(mkdir('sam'))
@collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'sam/\1.sam')
def align_reads(infiles, outfile):
import cgatcore.experiment as E
import cgatcore.iotools as iotools
import cgatpipelines.tasks.motifs as motifs
import cgatpipelines.tasks.tracks as tracks

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
from cgatcore import pipeline as P

P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml"
], defaults={'annotations_dir': ""})

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peek_parameters(PARAMS["annotations_dir"],
                                       "genesets")

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
# load all tracks - exclude input/control tracks
Sample = tracks.Sample
@author: rhodgson
"""
# This pipeline runs an RNA-seq workflow starting from the fastq files.
# I will create my github repository here.
# rsync -a /Users/rhodgson/GitHub/OBDS_Training_Apr_2020/rnaseq_pipeline.py [email protected]:/ifs/obds-training/apr20/rose/pipelines/rnaseqpipeline

# Import section
import sys
from ruffus import *
from cgatcore import pipeline as P
import gzip

# Read parameters
Params = P.get_parameters("pipeline.yml")


# First run fastqc on the fastq files.
# This creates fastqc.html and fastqc.zip files; everything is also fed into a fastqc.zip file.
@transform('*.fastq.gz', suffix('.fastq.gz'), '_fastqc.html')
def fastqqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)


# main - allows the pipeline to be run from cgat core; always goes at the end of the file
if __name__ == "__main__":
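    # (the guard body is cut off in the excerpt; the standard cgatcore idiom, as
    # used in the other pipelines in this collection, would be the following call)
    sys.exit(P.main(sys.argv))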
# Import everything we need.
# Working directory: /ifs/obds-training/apr20/rose/pipelines/pseudoalignment
# Make a directory where we want the files; a symbolic link was made to the fastq
# files from /ifs/obds-training/apr20/exercises/rnaseq/
# The script will be rsynced:
# rsync -a /Users/rhodgson/GitHub/obdsRNAseqpipeline/pseudoalignmentRH* [email protected]:/ifs/obds-training/apr20/rose/pipelines/pseudoalignment
# We will be using kallisto.

from ruffus import *
from cgatcore import pipeline as P
import sys

# The first thing we still want to do is the QC of the fastq files (including multiQC);
# see pipeline_rna_seq.py for notes on these functions.

# Import parameters - sure this will change
Params = P.get_parameters("pseudoalignmentRH.yml")


# Fastqc - just added an output folder here.
# As this is a @transform task, we need a regular expression to put the reports
# into a new folder.
@follows(mkdir('fastqc_reports'))
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'), r'fastqc_reports/\1_fastqc.html')
def fastqc(infile, outfile):
    statement = '''fastqc --outdir fastqc_reports %(infile)s > %(outfile)s.log'''
    P.run(statement)


# Multiqc
@follows(mkdir('multiqc_reports'))
fastq matches the characters fastq literally (case sensitive)
. matches any character (except for line terminators)
gz matches the characters gz literally (case sensitive)

Global pattern flags
g modifier: global. All matches (don't return after first match)
m modifier: multi line. Causes ^ and $ to match the begin/end of each line (not only begin/end of string)
"""
import gzip
from ruffus import *
from cgatcore import pipeline as P
import sys
import statistics

P.get_parameters('rnaseq_pipeline.yaml')  # .yml for cbrg, .yaml for cgat


@follows(mkdir('fastqc'))  # make the fastqc folder before running the code below
# find all fastq.gz files, save to the fastqc folder and name as <name>_fastqc.html
@transform('*.fastq.gz', regex(r'(.*).fastq.gz'), r'fastqc/\1_fastqc.html')
def fastqc(infile, outfile):
    # next time call the functions run_fastqc etc. so they aren't confused with the tools!
    statement = '''fastqc -q -t %(threads)s --nogroup %(infile)s --outdir fastqc'''  # need to direct output
    # job_queue etc. only set run parameters, e.g. memory, threads, queue
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_threads=P.PARAMS['threads'])


@follows(mkdir('sam'))  # make the sam folder before running the code below
# look for fastq.gz files with the same name ending in _1 or _2, output a sam file
@collate('*.fastq.gz', regex(r'(.*)_[1-2].fastq.gz'), r'sam/\1.sam')
def hisat2(infiles, outfile):
    read1, read2 = infiles  # 2 infiles
    # hisat options are in the yaml file (the comment must stay outside the
    # statement, otherwise the shell treats the rest of the command as a comment)
    statement = '''hisat2 -p %(threads)s %(hisat_option)s -x %(hisat_ref)s
                   -1 %(read1)s -2 %(read2)s -S %(outfile)s'''
    P.run(statement,
          job_queue=P.PARAMS['queue'],
          job_threads=P.PARAMS['threads'],
          job_memory='8G')