def findMesosBinary(self, name):
    """
    Resolve the given Mesos executable name to an absolute path.

    Searches the regular PATH first and falls back to /usr/local/sbin,
    raising RuntimeError if the binary cannot be located either way.
    """
    # Regular PATH lookup; which() yields matches lazily, so returning from
    # the loop is equivalent to next() on the generator.
    for match in which(name):
        return match
    # Special case for users of PyCharm on OS X. This is where Homebrew installs
    # it. It's hard to set PATH for PyCharm (or any GUI app) on OS X so let's
    # make it easy for those poor souls.
    for match in which(name, path=['/usr/local/sbin']):
        return match
    raise RuntimeError("Cannot find the '%s' binary. Make sure Mesos is installed "
                       "and it's 'bin' directory is present on the PATH." % name)
def needs_gridengine(test_item):
    """
    Decorator for test classes or methods that should only run when GridEngine
    is installed (detected via the presence of 'qsub' on the PATH).
    """
    test_item = _mark_test('gridengine', test_item)
    # `not` inverts the same truthiness test the original applied to the
    # first PATH hit (or None when there is none).
    if not next(which('qsub'), None):
        return unittest.skip("Install GridEngine to include this test.")(test_item)
    return test_item
def needs_parasol(test_item):
    """
    Decorator that skips the wrapped test unless Parasol is installed
    (detected via the presence of 'parasol' on the PATH).
    """
    test_item = _mark_test('parasol', test_item)
    binary = next(which('parasol'), None)
    if binary:
        return test_item
    return unittest.skip("Install Parasol to include this test.")(test_item)
def needs_slurm(test_item):
    """
    Decorator for test classes or methods that should only run when Slurm is
    installed (detected via the presence of 'squeue' on the PATH).
    """
    test_item = _mark_test('slurm', test_item)
    if not next(which('squeue'), None):
        return unittest.skip("Install Slurm to include this test.")(test_item)
    return test_item
def main():
    """
    This Toil pipeline aligns reads and performs alternative splicing analysis.

    Please read the README.md located in the same directory for run instructions.
    """
    # Define Parser object and add to toil
    url_prefix = 'https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/'
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--config', required=True,
                        help='Path to configuration file for samples, one per line. UUID,URL_to_bamfile. '
                             'The URL may be a standard "http://", a "file://<abs_path>", or "s3://<bucket>/<key>"')
    parser.add_argument('--gtf', help='URL to annotation GTF file',
                        default=url_prefix + 'rnaseq_cgl/gencode.v23.annotation.gtf')
    parser.add_argument('--gtf-pickle', help='Pickled GTF file',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.pickle')
    parser.add_argument('--gtf-m53', help='M53 preprocessing annotation table',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.m53')
    parser.add_argument('--positions', help='URL to SNP positions over genes file (TSV)',
                        default=url_prefix + 'spladder/positions_fixed.tsv')
    parser.add_argument('--genome', help='URL to Genome fasta',
                        default=url_prefix + 'rnaseq_cgl/hg38_no_alt.fa')
    parser.add_argument('--genome-index', help='Index file (fai) of genome',
                        default=url_prefix + 'spladder/hg38_no_alt.fa.fai')
    parser.add_argument('--ssec', default=None,
                        help='Path to master key used for downloading encrypted files.')
    parser.add_argument('--output-s3-dir', default=None,
                        help='S3 Directory of the form: s3://bucket/directory')
    parser.add_argument('--output-dir', default=None,
                        help='full path where final results will be output')
    parser.add_argument('--sudo', action='store_true', default=False,
                        help='Set flag if sudo is required to run Docker.')
    parser.add_argument('--star-index',
                        help='URL to download STAR Index built from HG38/gencodev23 annotation.',
                        default=url_prefix + 'rnaseq_cgl/starIndex_hg38_no_alt.tar.gz')
    parser.add_argument('--fwd-3pr-adapter', help="Sequence for the FWD 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    parser.add_argument('--rev-3pr-adapter', help="Sequence for the REV 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Sanity Checks
    if args.config:
        assert os.path.isfile(args.config), 'Config not found at: {}'.format(args.config)
    if args.ssec:
        # BUGFIX: the message previously interpolated args.config, so a missing
        # key file reported the config path instead of the key path.
        assert os.path.isfile(args.ssec), 'Encryption key not found at: {}'.format(args.ssec)
    if args.output_s3_dir:
        assert args.output_s3_dir.startswith('s3://'), 'Wrong format for output s3 directory'
    # Program checks
    for program in ['curl', 'docker']:
        # BUGFIX: which() returns a generator, which is always truthy, so the
        # original `assert which(program)` could never fail. Take the first
        # PATH hit (or None) so a missing program actually trips the assert.
        assert next(which(program), None), \
            'Program "{}" must be installed on every node.'.format(program)
    Job.Runner.startToil(Job.wrapJobFn(parse_input_samples, args), args)
def needs_appliance(test_item):
    """
    Decorator that skips the wrapped test unless Docker is installed and the
    Toil appliance image is present locally.
    """
    import json
    test_item = _mark_test('appliance', test_item)
    # No Docker at all: skip immediately.
    if not next(which('docker'), None):
        return unittest.skip('Install Docker to include this test.')(test_item)
    image = applianceSelf()
    try:
        inspect_output = check_output(['docker', 'inspect', image])
    except CalledProcessError:
        matches = []
    else:
        matches = {entry['Id']
                   for entry in json.loads(inspect_output)
                   if image in entry['RepoTags']}
    if len(matches) == 0:
        return unittest.skip("Cannot find appliance image %s. Be sure to run 'make docker' "
                             "prior to running this test." % image)(test_item)
    if len(matches) == 1:
        return test_item
    assert False, 'Expected `docker inspect` to return zero or one image.'
def __init__(self, config, maxCores, maxMemory, maxDisk):
    """
    Set up the Parasol batch system: resolve the parasol command, create the
    results directory, and start the background worker thread.

    :param config: Toil configuration; parasolCommand, parasolMaxBatches and
        jobStore are read from it here.
    :param maxCores: cap on concurrently used CPU cores (handled by superclass).
    :param maxMemory: memory cap; Parasol does not support this, so any value
        other than sys.maxint only produces a warning.
    :param maxDisk: disk cap (handled by superclass).
    """
    super(ParasolBatchSystem, self).__init__(config, maxCores, maxMemory, maxDisk)
    # NOTE(review): sys.maxint exists only on Python 2 — confirm this module
    # still targets Python 2 before porting.
    if maxMemory != sys.maxint:
        logger.warn('The Parasol batch system does not support maxMemory.')
    # Keep the name of the results file for the pstat2 command..
    command = config.parasolCommand
    # A bare command name (no path separator) is resolved against the PATH;
    # an explicit path is used as-is.
    if os.path.sep not in command:
        try:
            command = next(which(command))
        except StopIteration:
            raise RuntimeError("Can't find %s on PATH." % command)
    logger.info('Using Parasol at %s', command)
    self.parasolCommand = command
    # Per-batch results files are created under the job store directory.
    self.parasolResultsDir = tempfile.mkdtemp(dir=config.jobStore)
    # In Parasol, each results file corresponds to a separate batch, and all jobs in a batch
    # have the same cpu and memory requirements. The keys to this dictionary are the (cpu,
    # memory) tuples for each batch. A new batch is created whenever a job has a new unique
    # combination of cpu and memory requirements.
    self.resultsFiles = dict()
    self.maxBatches = config.parasolMaxBatches
    # Allows the worker process to send back the IDs of jobs that have finished, so the batch
    # system can decrease its used cpus counter
    self.cpuUsageQueue = Queue()
    # Also stores finished job IDs, but is read by getUpdatedJobIDs().
    self.updatedJobsQueue = Queue()
    # Use this to stop the worker when shutting down
    self.running = True
    # Daemon-like worker thread running updatedJobWorker (defined elsewhere
    # on this class) to collect finished-job notifications.
    self.worker = Thread(target=self.updatedJobWorker, args=())
    self.worker.start()
    self.usedCpus = 0
    self.jobIDsToCpu = {}
    # Set of jobs that have been issued but aren't known to have finished or been killed yet.
    # Jobs that end by themselves are removed in getUpdatedJob, and jobs that are killed are
    # removed in killBatchJobs.
    self.runningJobs = set()
def needs_appliance(test_item):
    """
    Decorator that skips the wrapped test unless Docker is installed and the
    Toil appliance image is available on the local Docker daemon.
    """
    import json
    test_item = _mark_test('appliance', test_item)
    if not next(which('docker'), None):
        return unittest.skip('Install Docker to include this test.')(test_item)
    image = applianceSelf()
    try:
        raw = subprocess.check_output(['docker', 'inspect', image])
    except subprocess.CalledProcessError:
        matching_ids = []
    else:
        matching_ids = {record['Id']
                        for record in json.loads(raw)
                        if image in record['RepoTags']}
    if len(matching_ids) == 0:
        return unittest.skip(
            "Cannot find appliance image %s. Be sure to run 'make docker' "
            "prior to running this test." % image)(test_item)
    if len(matching_ids) == 1:
        return test_item
    assert False, 'Expected `docker inspect` to return zero or one image.'
def main():
    """
    GATK germline pipeline with variant filtering and annotation.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config', required=True, type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest', type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample', default=None, nargs=2, type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir', default=None, help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix', default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only', action='store_true',
                            help='Only runs preprocessing steps')
    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()
    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # BUGFIX: next() without a default raises a bare StopIteration when
            # the program is missing; pass None so require() reports the message.
            # (The no-op `.format(program)` on a placeholder-free string was removed.)
            require(next(which(program), None),
                    program + ' must be installed on every node.')
        require(os.path.exists(options.config),
                '{} not found. Please run "generate-config"'.format(options.config))
        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))
        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))
        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')
        # Parse inputs
        inputs = {x.replace('-', '_'): y
                  for x, y in yaml.load(open(options.config).read()).iteritems()}
        required_fields = {'genome_fasta', 'output_dir', 'run_bwa', 'sorted',
                           'snp_filter_annotations', 'indel_filter_annotations',
                           'preprocess', 'preprocess_only', 'run_vqsr', 'joint_genotype',
                           'run_oncotator', 'cores', 'file_size', 'xmx', 'suffix'}
        input_fields = set(inputs.keys())
        # BUGFIX: use >= (superset) rather than > (strict superset), so a config
        # containing exactly the required fields is not rejected.
        require(input_fields >= required_fields,
                'Missing config parameters:\n{}'.format(
                    ', '.join(required_fields - input_fields)))
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir
        require(inputs['output_dir'] is not None, 'Missing output directory PATH/URL')
        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''
        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only
        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields >= vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(
                        ', '.join(vqsr_fields - input_fields)))
        # Check that hard filtering parameters are present. If only running preprocessing
        # steps, then we do not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields >= hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(
                        ', '.join(hard_filter_fields - input_fields)))
            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field],
                        'Missing %s value for hard filtering, '
                        'got %s.' % (hard_filter_field, inputs[hard_filter_field]))
        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])
        inputs['annotations'] = set(inputs['snp_filter_annotations'] +
                                    inputs['indel_filter_annotations'])
        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)
        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)
        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument('--config', default=DEFAULT_CONFIG_NAME, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default=DEFAULT_MANIFEST_NAME, type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(os.path.exists(args.config), '{} not found. Please run '
                '"toil-marginphase generate-config"'.format(args.config))
        require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                'run "toil-marginphase generate-manifest"'.format(args.manifest))
        # Parse config
        # Config keys are normalized from kebab-case to snake_case attributes.
        parsed_config = {x.replace('-', '_'): y
                         for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        # NOTE(review): this function mixes sys.maxsize and sys.maxint (Python 2
        # only) as "unlimited" sentinels — confirm the intended Python version.
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint
        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            # Drop a trailing 'B' (e.g. "4GB" -> "4G") before unit parsing.
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)
        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        # Local (non-S3) output: strip any file:// scheme and create the directory.
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size, "Configuration parameter partition-size is required")
        require(config.partition_margin, "Configuration parameter partition-margin is required")
        # Intermediate-file saving is only supported for local output directories.
        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError("Config parameter 'save_intermediate_files' cannot be used with s3 output directory")
        else:
            intermediate_location = os.path.join(config.output_dir, "intermediate",
                                                 datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location
        # Fill in defaults for optional Docker image/tag and output settings.
        if "margin_phase_image" not in config or len(config.margin_phase_image) == 0:
            config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT
        if "margin_phase_tag" not in config or len(config.margin_phase_tag) == 0:
            config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT
        if "cpecan_image" not in config or len(config.cpecan_image) == 0:
            config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT
        if "cpecan_tag" not in config or len(config.cpecan_tag) == 0:
            config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT
        if "unittest" not in config:
            config.unittest = False
        if "minimal_output" not in config:
            config.minimal_output = False
        if "minimal_cpecan_output" not in config:
            config.minimal_cpecan_output = False
        if "cpecan_probabilities" not in config:
            config.cpecan_probabilities = False
        # get samples
        samples = parse_samples(config, args.manifest)
        # Program checks
        for program in ['docker']:
            require(next(which(program), None),
                    program + ' must be installed on every node.'.format(program))
        # Start the workflow
        Job.Runner.startToil(Job.wrapJobFn(map_job, prepare_input, samples, config), args)
def main():
    """
    This Toil pipeline aligns reads and performs alternative splicing analysis.

    Please read the README.md located in the same directory for run instructions.
    """
    # Define Parser object and add to toil
    url_prefix = 'https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/'
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--config', required=True,
                        help='Path to configuration file for samples, one per line. UUID,URL_to_bamfile. '
                             'The URL may be a standard "http://", a "file://<abs_path>", or "s3://<bucket>/<key>"')
    parser.add_argument('--gtf', help='URL to annotation GTF file',
                        default=url_prefix + 'rnaseq_cgl/gencode.v23.annotation.gtf')
    parser.add_argument('--gtf-pickle', help='Pickled GTF file',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.pickle')
    parser.add_argument('--gtf-m53', help='M53 preprocessing annotation table',
                        default=url_prefix + 'spladder/gencode.v23.annotation.gtf.m53')
    parser.add_argument('--positions', help='URL to SNP positions over genes file (TSV)',
                        default=url_prefix + 'spladder/positions_fixed.tsv')
    parser.add_argument('--genome', help='URL to Genome fasta',
                        default=url_prefix + 'rnaseq_cgl/hg38_no_alt.fa')
    parser.add_argument('--genome-index', help='Index file (fai) of genome',
                        default=url_prefix + 'spladder/hg38_no_alt.fa.fai')
    parser.add_argument('--ssec', default=None,
                        help='Path to master key used for downloading encrypted files.')
    parser.add_argument('--output-s3-dir', default=None,
                        help='S3 Directory of the form: s3://bucket/directory')
    parser.add_argument('--output-dir', default=None,
                        help='full path where final results will be output')
    parser.add_argument('--sudo', action='store_true', default=False,
                        help='Set flag if sudo is required to run Docker.')
    parser.add_argument('--star-index',
                        help='URL to download STAR Index built from HG38/gencodev23 annotation.',
                        default=url_prefix + 'rnaseq_cgl/starIndex_hg38_no_alt.tar.gz')
    parser.add_argument('--fwd-3pr-adapter', help="Sequence for the FWD 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    parser.add_argument('--rev-3pr-adapter', help="Sequence for the REV 3' Read Adapter.",
                        default='AGATCGGAAGAG')
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Sanity Checks
    if args.config:
        assert os.path.isfile(args.config), 'Config not found at: {}'.format(args.config)
    if args.ssec:
        # BUGFIX: the message previously interpolated args.config, so a missing
        # key file reported the config path instead of the key path.
        assert os.path.isfile(args.ssec), 'Encryption key not found at: {}'.format(args.ssec)
    if args.output_s3_dir:
        assert args.output_s3_dir.startswith('s3://'), 'Wrong format for output s3 directory'
    # Program checks
    for program in ['curl', 'docker']:
        # BUGFIX: which() returns a generator, which is always truthy, so the
        # original `assert which(program)` could never fail. Take the first
        # PATH hit (or None) so a missing program actually trips the assert.
        assert next(which(program), None), \
            'Program "{}" must be installed on every node.'.format(program)
    Job.Runner.startToil(Job.wrapJobFn(parse_input_samples, args), args)
def main():
    """
    GATK germline pipeline with variant filtering and annotation.
    """
    # Define Parser object and add to jobTree
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    # Generate subparsers
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the GATK germline pipeline')
    parser_run.add_argument('--config', required=True, type=str,
                            help='Path to the (filled in) config file, generated with '
                                 '"generate-config".')
    parser_run.add_argument('--manifest', type=str,
                            help='Path to the (filled in) manifest file, generated with '
                                 '"generate-manifest".\nDefault value: "%(default)s".')
    parser_run.add_argument('--sample', default=None, nargs=2, type=str,
                            help='Input sample identifier and BAM file URL or local path')
    parser_run.add_argument('--output-dir', default=None, help='Path/URL to output directory')
    parser_run.add_argument('-s', '--suffix', default=None,
                            help='Additional suffix to add to the names of the output files')
    parser_run.add_argument('--preprocess-only', action='store_true',
                            help='Only runs preprocessing steps')
    Job.Runner.addToilOptions(parser_run)
    options = parser.parse_args()
    cwd = os.getcwd()
    if options.command == 'generate-config' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-germline.yaml'), generate_config)
    if options.command == 'generate-manifest' or options.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-germline.tsv'), generate_manifest)
    elif options.command == 'run':
        # Program checks
        for program in ['curl', 'docker']:
            # BUGFIX: next() without a default raises a bare StopIteration when
            # the program is missing; pass None so require() reports the message.
            # (The no-op `.format(program)` on a placeholder-free string was removed.)
            require(next(which(program), None),
                    program + ' must be installed on every node.')
        require(os.path.exists(options.config),
                '{} not found. Please run "generate-config"'.format(options.config))
        # Read sample manifest
        samples = []
        if options.manifest:
            samples.extend(parse_manifest(options.manifest))
        # Add BAM sample from command line
        if options.sample:
            uuid, url = options.sample
            # samples tuple: (uuid, url, paired_url, rg_line)
            # BAM samples should not have as paired URL or read group line
            samples.append(GermlineSample(uuid, url, None, None))
        require(len(samples) > 0,
                'No samples were detected in the manifest or on the command line')
        # Parse inputs
        inputs = {x.replace('-', '_'): y
                  for x, y in yaml.load(open(options.config).read()).iteritems()}
        required_fields = {'genome_fasta', 'output_dir', 'run_bwa', 'sorted',
                           'snp_filter_annotations', 'indel_filter_annotations',
                           'preprocess', 'preprocess_only', 'run_vqsr', 'joint_genotype',
                           'run_oncotator', 'cores', 'file_size', 'xmx', 'suffix'}
        input_fields = set(inputs.keys())
        # BUGFIX: use >= (superset) rather than > (strict superset), so a config
        # containing exactly the required fields is not rejected.
        require(input_fields >= required_fields,
                'Missing config parameters:\n{}'.format(
                    ', '.join(required_fields - input_fields)))
        if inputs['output_dir'] is None:
            inputs['output_dir'] = options.output_dir
        require(inputs['output_dir'] is not None, 'Missing output directory PATH/URL')
        if inputs['suffix'] is None:
            inputs['suffix'] = options.suffix if options.suffix else ''
        if inputs['preprocess_only'] is None:
            inputs['preprocess_only'] = options.preprocess_only
        if inputs['run_vqsr']:
            # Check that essential VQSR parameters are present
            vqsr_fields = {'g1k_snp', 'mills', 'dbsnp', 'hapmap', 'omni'}
            require(input_fields >= vqsr_fields,
                    'Missing parameters for VQSR:\n{}'.format(
                        ', '.join(vqsr_fields - input_fields)))
        # Check that hard filtering parameters are present. If only running preprocessing
        # steps, then we do not need filtering information.
        elif not inputs['preprocess_only']:
            hard_filter_fields = {'snp_filter_name', 'snp_filter_expression',
                                  'indel_filter_name', 'indel_filter_expression'}
            require(input_fields >= hard_filter_fields,
                    'Missing parameters for hard filtering:\n{}'.format(
                        ', '.join(hard_filter_fields - input_fields)))
            # Check for falsey hard filtering parameters
            for hard_filter_field in hard_filter_fields:
                require(inputs[hard_filter_field],
                        'Missing %s value for hard filtering, '
                        'got %s.' % (hard_filter_field, inputs[hard_filter_field]))
        # Set resource parameters
        inputs['xmx'] = human2bytes(inputs['xmx'])
        inputs['file_size'] = human2bytes(inputs['file_size'])
        inputs['cores'] = int(inputs['cores'])
        inputs['annotations'] = set(inputs['snp_filter_annotations'] +
                                    inputs['indel_filter_annotations'])
        # HaplotypeCaller test data for testing
        inputs['hc_output'] = inputs.get('hc_output', None)
        # It is a toil-scripts convention to store input parameters in a Namespace object
        config = argparse.Namespace(**inputs)
        root = Job.wrapJobFn(run_gatk_germline_pipeline, samples, config)
        Job.Runner.startToil(root, options)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 ------- 17
                    |       |        |
                    --- 16 -------
                    |  |
                    6  7
                    |  |
                    8  9
                    |  |
                   10  11
                    |  |
                   12  13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline')
    parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--uuid', default=None, type=str,
                            help='Provide the UUID of a sample when using the'
                                 '"--tumor" and "--normal" option')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # BUGFIX: the message previously told the user to run
        # "toil-rnaseq generate-config" — the wrong tool for this pipeline.
        require(os.path.exists(args.config), '{} not found. Please run '
                '"toil-exome generate-config"'.format(args.config))
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid,
                    '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config
        parsed_config = {x.replace('-', '_'): y
                         for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(config.reference and config.phase and config.mills and config.dbsnp,
                    'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Program checks
        for program in ['curl', 'docker']:
            # (The no-op `.format(program)` on a placeholder-free string was removed.)
            require(next(which(program), None),
                    program + ' must be installed on every node.')
        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General usage:
    1. Type "toil-rnaseq generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-rnaseq run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/rnaseq_cgl

    Structure of RNA-Seq Pipeline (per sample)

                  8
                  |
                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 7 -- 9
                |           |
                6 -----------

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = FastQC
    7 = Kallisto
    8 = BamQC (as specified by CKCC at UC Santa Cruz)
    9 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL, needed for upload to S3)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq pipeline')
    # --manifest and --samples are mutually exclusive ways of declaring input samples
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument('--config', default='config-toil-rnaseq.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default='manifest-toil-rnaseq.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    group.add_argument('--samples', default=None, nargs='+', type=str,
                       help='Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
                            'fastq files. URLs follow the format: http://foo.com/sample.tar, '
                            'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file.'
                            'Samples passed in this way will be assumed to be paired end, if using single-end data, '
                            'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        if not args.samples:
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
            samples = parse_samples(sample_urls=args.samples)
        # Parse config: YAML keys use '-', attribute names need '_'.
        # Fixed: open the config with a context manager instead of leaking the handle.
        # NOTE(review): yaml.load allows arbitrary object construction; consider yaml.safe_load.
        with open(args.config) as config_file:
            parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(config_file.read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(config.kallisto_index or config.star_index,
                'URLs not provided for Kallisto or STAR, so there is nothing to do!')
        if config.star_index or config.rsem_ref:
            require(config.star_index and config.rsem_ref,
                    'Input provided for STAR or RSEM but not both. STAR: '
                    '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Every provided reference URL must carry a supported scheme.
        # Loop variable renamed from `input`, which shadowed the builtin.
        for url in [x for x in [config.kallisto_index, config.star_index, config.rsem_ref] if x]:
            require(urlparse(url).scheme in schemes,
                    'Input in config must have the appropriate URL prefix: {}'.format(schemes))
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        # Program checks
        for program in ['curl', 'docker']:
            # Fixed: old code concatenated `program` AND called a no-op .format()
            # on a string without a placeholder.
            require(next(which(program), None), '{} must be installed on every node.'.format(program))
        # Start the workflow, calling map_job() to run the pipeline for each sample
        with Toil(args) as toil:
            toil.start(Job.wrapJobFn(map_job, download_sample, samples, config))
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq pipeline

    RNA-seq fastqs are combined, aligned, and quantified with 2 different methods (RSEM and Kallisto)

    General usage:
    1. Type "toil-rnaseq generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-rnaseq run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/rnaseq_cgl

    Structure of RNA-Seq Pipeline (per sample)

                  3 -- 4 -- 5
                 /          |
      0 -- 1 -- 2 ---- 6 -- 7

    0 = Download sample
    1 = Unpack/Merge fastqs
    2 = CutAdapt (adapter trimming)
    3 = STAR Alignment
    4 = RSEM Quantification
    5 = RSEM Post-processing
    6 = Kallisto
    7 = Consolidate output and upload to S3
    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq pipeline')
    # Exactly one of --manifest / --samples must be chosen as the sample source
    group = parser_run.add_mutually_exclusive_group(required=True)
    parser_run.add_argument('--config', default='config-toil-rnaseq.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default='manifest-toil-rnaseq.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    group.add_argument('--samples', default=None, nargs='+', type=str,
                       help='Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
                            'fastq files. URLs follow the format: http://foo.com/sample.tar, '
                            'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file.'
                            'Samples passed in this way will be assumed to be paired end, if using single-end data, '
                            'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        if not args.samples:
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
            samples = parse_samples(sample_urls=args.samples)
        # Parse config: YAML keys use '-', attribute names need '_'.
        # Fixed: open the config with a context manager instead of leaking the handle.
        # NOTE(review): yaml.load allows arbitrary object construction; consider yaml.safe_load.
        with open(args.config) as config_file:
            parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(config_file.read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(config.kallisto_index or config.star_index,
                'URLs not provided for Kallisto or STAR, so there is nothing to do!')
        if config.star_index or config.rsem_ref:
            require(config.star_index and config.rsem_ref,
                    'Input provided for STAR or RSEM but not both. STAR: '
                    '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
        require(config.output_dir or config.s3_output_dir,
                'output-dir AND/OR s3-output-dir need to be defined, '
                'otherwise sample output is not stored anywhere!')
        # Every provided reference URL must carry a supported scheme.
        # Loop variable renamed from `input`, which shadowed the builtin.
        for url in [x for x in [config.kallisto_index, config.star_index, config.rsem_ref] if x]:
            require(urlparse(url).scheme in schemes,
                    'Input in config must have the appropriate URL prefix: {}'.format(schemes))
        # Program checks
        for program in ['curl', 'docker']:
            # Fixed: old code concatenated `program` AND called a no-op .format()
            # on a string without a placeholder.
            require(next(which(program), None), '{} must be installed on every node.'.format(program))
        # Start the workflow by using map_job() to run the pipeline for each sample
        Job.Runner.startToil(Job.wrapJobFn(map_job, download_sample, samples, config), args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil exome pipeline

    Perform variant / indel analysis given a pair of tumor/normal BAM files.
    Samples are optionally preprocessed (indel realignment and base quality score recalibration)
    The output of this pipeline is a tarball containing results from MuTect, MuSe, and Pindel.

    General usage:
    1. Type "toil-exome generate" to create an editable manifest and config in the current working directory.
    2. Parameterize the pipeline by editing the config.
    3. Fill in the manifest with information pertaining to your samples.
    4. Type "toil-exome run [jobStore]" to execute the pipeline.

    Please read the README.md located in the source directory or at:
    https://github.com/BD2KGenomics/toil-scripts/tree/master/src/toil_scripts/exome_variant_pipeline

    Structure of variant pipeline (per sample)

           1 2 3 4          14 -------
           | | | |          |        |
        0 --------- 5 ----- 15 ------ 17
                    |       |        |
                    ---     16 -------
                      |
                    6   7
                    |   |
                    8   9
                    |   |
                   10   11
                    |   |
                   12   13

    0 = Start node
    1 = reference index
    2 = reference dict
    3 = normal bam index
    4 = tumor bam index
    5 = pre-processing node / DAG declaration
    6,7 = RealignerTargetCreator
    8,9 = IndelRealigner
    10,11 = BaseRecalibration
    12,13 = PrintReads
    14 = MuTect
    15 = Pindel
    16 = MuSe
    17 = Consolidate Output and move/upload results
    ==================================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the CGL exome pipeline')
    parser_run.add_argument('--config', default='config-toil-exome.yaml', type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    # Fixed help text: concatenation previously produced `the"--tumor"` (missing space).
    parser_run.add_argument('--uuid', default=None, type=str,
                            help='Provide the UUID of a sample when using the '
                                 '"--tumor" and "--normal" option')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # Fixed error message: this is the exome pipeline, not toil-rnaseq.
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-exome generate-config"'.format(args.config))
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid,
                    '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
        # Parse config: YAML keys use '-', attribute names need '_'.
        # Fixed: open the config with a context manager instead of leaking the handle.
        # NOTE(review): yaml.load allows arbitrary object construction; consider yaml.safe_load.
        with open(args.config) as config_file:
            parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(config_file.read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Exome pipeline sanity checks
        if config.preprocessing:
            require(config.reference and config.phase and config.mills and config.dbsnp,
                    'Missing inputs for preprocessing, check config file.')
        if config.run_mutect:
            require(config.reference and config.dbsnp and config.cosmic,
                    'Missing inputs for MuTect, check config file.')
        if config.run_pindel:
            require(config.reference, 'Missing input (reference) for Pindel.')
        if config.run_muse:
            require(config.reference and config.dbsnp,
                    'Missing inputs for MuSe, check config file.')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        # Program checks
        for program in ['curl', 'docker']:
            # Fixed: old code concatenated `program` AND called a no-op .format()
            # on a string without a placeholder.
            require(next(which(program), None), '{} must be installed on every node.'.format(program))
        # Launch Pipeline
        Job.Runner.startToil(Job.wrapJobFn(download_shared_files, samples, config), args)
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    Toil RNA-seq single cell pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    # Generate subparsers
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the RNA-seq single cell pipeline')
    # NOTE(review): the group has only one member (--manifest), so the mutual
    # exclusion is currently a no-op; kept for interface compatibility.
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument('--config', default=DEFAULT_CONFIG_NAME, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default=DEFAULT_MANIFEST_NAME, type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                               'run "toil-rnaseq generate-manifest"'.format(args.manifest))
        # get samples
        samples = parse_samples(path_to_manifest=args.manifest)
        # Parse config: YAML keys use '-', attribute names need '_'.
        # Fixed: open the config with a context manager instead of leaking the handle.
        # NOTE(review): yaml.load allows arbitrary object construction; consider yaml.safe_load.
        with open(args.config) as config_file:
            parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(config_file.read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxint
        # Config sanity checks
        require(config.kallisto_index,
                'URLs not provided for Kallisto index, so there is nothing to do!')
        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
        require(urlparse(config.kallisto_index).scheme in SCHEMES,
                'Kallisto index in config must have the appropriate URL prefix: {}'.format(SCHEMES))
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        # Program checks
        for program in ['curl', 'docker']:
            # Fixed: old code concatenated `program` AND called a no-op .format()
            # on a string without a placeholder.
            require(next(which(program), None), '{} must be installed on every node.'.format(program))
        # Start the workflow
        Job.Runner.startToil(Job.wrapJobFn(map_job, run_single_cell, samples, config), args)