def upload_data(master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Upload the file hdfs_name from HDFS to S3.
    """
    if mock_mode():
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(master_ip, hdfs_name, upload_name, memory=inputs['memory'])
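# A minimal usage sketch for upload_data (hypothetical values; in the real
# pipeline the arguments come from the surrounding Toil job and the parsed
# config, and `inputs` must carry a 'memory' entry):
#
#   upload_data(master_ip='10.0.0.1',
#               inputs={'memory': '200g'},
#               hdfs_name='hdfs://10.0.0.1:8020/uuid.processed.bam',
#               upload_name='s3://my-bucket/uuid.processed.bam',
#               spark_on_toil=True)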
def generate_config():
    if mock_mode():
        return generate_mock_config()
    return textwrap.dedent("""
        # ADAM/GATK Pipeline configuration file
        # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
        # Edit the values in this configuration file and then rerun the pipeline
        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
        ##############################################################################################################
        pipeline-to-run: both
        skip-alignment: False
        skip-preprocessing: False
        sequence-dir: sequence
        autoscale-cluster: False
        s3-bucket:            # S3 Bucket URI
        cpu-count:            # Optional:
        program-unit: 12345
        platform: ILLUMINA
        ref:                  # Required: Reference fasta file
        amb:                  # Required: Reference fasta file (amb)
        ann:                  # Required: Reference fasta file (ann)
        bwt:                  # Required: Reference fasta file (bwt)
        pac:                  # Required: Reference fasta file (pac)
        sa:                   # Required: Reference fasta file (sa)
        fai:                  # Required: Reference fasta file (fai)
        alt:                  # Optional: Alternate file for reference build (alt). Necessary for alt aware alignment.
        phase:                # Required: URL (1000G_phase1.indels.hg19.sites.fixed.vcf)
        mills:                # Required: URL (Mills_and_1000G_gold_standard.indels.hg19.sites.vcf)
        dbsnp:                # Required: URL (dbsnp_132_b37.leftAligned.vcf)
        hapmap:               # Required: URL (hapmap_3.3.b37.vcf)
        omni:                 # Required: URL (1000G_omni.5.b37.vcf)
        trim-adapters: False  # Trim adapters.
        num-nodes: 9          # Number of nodes to use. Do not set if providing master-ip.
        master-ip:            # Optional: IP or hostname of the host running the Spark master and HDFS namenode.
                              # Should be provided instead of num-nodes if pointing at a static (external or
                              # standalone) Spark cluster. The special value 'auto' indicates the master of
                              # an externally autoscaled cgcloud spark cluster, i.e. one that is managed by
                              # the uberscript.
        file-size: 100G       # Approximate input file size. Should be given as %d[TGMK], e.g.,
                              # for a 100 gigabyte file, use file-size: '100G'
        ssec:                 # Optional: (string) Path to Key File for SSE-C Encryption
        dir-suffix:           # Optional: Suffix to add to output directory names.
        memory:               # Required: Amount of available memory on each worker node.
    """[1:])
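# For reference, a filled-in (non-mock) config might start like the following.
# The bucket and reference URLs are placeholders, not real locations:
#
#   pipeline-to-run: both
#   s3-bucket: s3://my-output-bucket
#   ref: s3://my-refs/hg19.fa
#   amb: s3://my-refs/hg19.fa.amb
#   ...
#   num-nodes: 9
#   file-size: 100G
#   memory: 200g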
def generate_manifest():
    if mock_mode():
        return generate_mock_manifest()
    return textwrap.dedent("""
        # Edit this manifest to include information pertaining to each sample to be run.
        # There is a single column: UUID
        #
        # UUID      This should be a unique identifier for the sample to be processed that corresponds to
        #           the prefix of the filenames of the input fastq files.
        #
        # Example:
        # If your input fastq file pairs were input_file_name_1.illumina_1.fastq.gz, input_file_name_1.illumina_2.fastq.gz and
        # input_file_name_2.illumina_1.fastq.gz, input_file_name_2.illumina_2.fastq.gz, the manifest would be:
        #
        # input_file_name_1.illumina
        # input_file_name_2.illumina
        #
        # Input fastq files MUST be named according to the filename_1.fastq.gz, filename_2.fastq.gz convention
        #
        # Place your samples below, one per line.
        """[1:])
def launch_pipeline(params):
    """
    Launches pipeline in a screen session on toil-leader.

    :param argparse.Namespace params: parsed command line arguments and options
    """
    if not params.jobstore:
        jobstore = '{}-{}'.format(uuid4(), str(datetime.utcnow().date()))
    else:
        jobstore = params.jobstore
    restart = '--restart' if params.restart else ''
    log.info('Launching Pipeline and blocking. Check log.txt on leader for stderr and stdout')
    try:
        # Create screen session
        check_call(['cgcloud', 'ssh',
                    '--zone', params.zone,
                    '--cluster-name', params.cluster_name,
                    'toil-leader',
                    '-o', 'StrictHostKeyChecking=no',
                    'screen', '-dmS', params.cluster_name])

        if params.reference_genome == 'GRCh38':
            from toil_scripts.adam_uberscript.input_files import GRCh38_inputs as inputs
        elif params.reference_genome == 'hg19':
            from toil_scripts.adam_uberscript.input_files import hg19_inputs as inputs
        else:
            assert False, 'Invalid ref genome %s' % params.reference_genome

        # Assemble pipeline command to be stuffed into a screen session
        pipeline_command = ['PYTHONPATH=$PYTHONPATH:~/toil-scripts/src',
                            'python -m toil_scripts.adam_gatk_pipeline.align_and_call',
                            'aws:{region}:{j}',
                            '--autoscale_cluster',
                            '--retryCount 1',
                            '--use_bwakit',
                            '--driver_memory {m}',
                            '--executor_memory {m}',
                            '--batchSystem mesos',
                            '--mesosMaster $(hostname -i):5050',
                            '--workDir /var/lib/toil',
                            '--logInfo']

        if mock_mode():
            pipeline_command = ['ADAM_GATK_MOCK_MODE=1'] + \
                               pipeline_command + \
                               ['--dir_suffix /mock']
        else:
            pipeline_command += ['--s3_bucket {b}',
                                 '--bucket_region {region}',
                                 '--sequence_dir {sequence_dir}',
                                 '--dir_suffix /{genome}',
                                 '--uuid_manifest ~/manifest',
                                 '--ref {ref}',
                                 '--amb {amb}',
                                 '--ann {ann}',
                                 '--bwt {bwt}',
                                 '--pac {pac}',
                                 '--sa {sa}',
                                 '--fai {fai}',
                                 '--phase {phase}',
                                 '--mills {mills}',
                                 '--dbsnp {dbsnp}',
                                 '--omni {omni}',
                                 '--hapmap {hapmap}',
                                 '--file_size {fs}']
            if 'alt' in inputs:
                pipeline_command.append('--alt {alt}')

        # Do we have a defined master IP?
        if params.master_ip:
            pipeline_command.append('--master_ip %s' % params.master_ip)
        elif params.spark_nodes:
            pipeline_command.append('--num_nodes %s' % params.spark_nodes)

        pipeline_command.append('{r} 2>&1 | tee toil_output\n')

        pipeline_command = ' '.join(pipeline_command)
        pipeline_command = pipeline_command.format(j=jobstore,
                                                   b=params.bucket,
                                                   region=region_of_zone(params.zone),
                                                   m=params.memory,
                                                   fs=params.file_size,
                                                   r=restart,
                                                   sequence_dir=params.sequence_dir,
                                                   genome=params.reference_genome,
                                                   **inputs)

        # Stuff the command into the screen session in chunks, since a single
        # 'screen -X stuff' invocation has a limited input length
        chunk_size = 500
        for chunk in [pipeline_command[i:i + chunk_size]
                      for i in range(0, len(pipeline_command), chunk_size)]:
            check_call(['cgcloud', 'ssh',
                        '--zone', params.zone,
                        '--cluster-name', params.cluster_name,
                        'toil-leader',
                        '-o', 'StrictHostKeyChecking=no',
                        'screen', '-S', params.cluster_name,
                        '-X', 'stuff', quote(chunk)])
    except CalledProcessError as e:
        log.info('Pipeline exited with non-zero status code: {}'.format(e))
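# For illustration, after .format() the command stuffed into the screen session
# looks roughly like this (region, jobstore name, and memory are example values):
#
#   PYTHONPATH=$PYTHONPATH:~/toil-scripts/src \
#   python -m toil_scripts.adam_gatk_pipeline.align_and_call \
#       aws:us-west-2:<uuid>-2016-01-01 --autoscale_cluster --retryCount 1 \
#       --use_bwakit --driver_memory 200g --executor_memory 200g \
#       --batchSystem mesos --mesosMaster $(hostname -i):5050 \
#       --workDir /var/lib/toil --logInfo ... 2>&1 | tee toil_output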
def main():
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Launch cluster
    cluster_sp = subparsers.add_parser('launch-cluster',
                                       help='Launches EC2 cluster via CGCloud')
    cluster_sp.add_argument('-S', '--share', required=True,
                            help='Full path to directory containing pipeline script, launch script, and master key.')
    cluster_sp.add_argument('-T', '--leader-type', default='m3.medium',
                            help='Sets leader instance type.')
    cluster_sp.add_argument('-b', '--boto-path', default='/home/mesosbox/.boto', type=str,
                            help='Path to local .boto file to be placed on leader.')
    cluster_sp.add_argument('-M', '--manifest-path',
                            default=None if not mock_mode()
                            else "/home/ubuntu/toil-scripts/src/toil_scripts/adam_gatk_pipeline/mock_manifest",
                            required=not mock_mode(),
                            help='Path to manifest file.')

    # Launch pipeline
    pipeline_sp = subparsers.add_parser('launch-pipeline',
                                        help='Launches pipeline')
    pipeline_sp.add_argument('-j', '--jobstore', default=None,
                             help='Name of jobstore. Defaults to UUID-Date if not set')
    pipeline_sp.add_argument('--restart', default=None, action='store_true',
                             help='Attempts to restart pipeline, requires existing job store.')
    pipeline_sp.add_argument('--master_ip', default=None,
                             help="The address of an external Spark master or 'auto' when using a standalone Spark "
                                  "cluster managed by this script. In the latter case you must pass the "
                                  "--spark-sample-slots option to the launch-metrics command.")
    pipeline_sp.add_argument('-B', '--bucket',
                             help='The name of the destination bucket.')
    pipeline_sp.add_argument('-m', '--memory', default='200g' if not mock_mode() else '3g',
                             help='The amount of memory per worker node in GiB. Must match what EC2 provides on '
                                  'the specified worker instance type. Defaults to 3g in mock mode.')
    pipeline_sp.add_argument('-f', '--file_size', default='100G' if not mock_mode() else '10M',
                             help='Approximate size of the BAM files. Defaults to 10M in mock mode.')
    pipeline_sp.add_argument('-s', '--spark_nodes', type=int,
                             default=(8 if not mock_mode() else 2) + 1,
                             help="The number of Spark nodes, including the master, to allocate per sample. When "
                                  "running against a standalone Spark cluster managed by this script, the master "
                                  "will be shared by all samples and the actual number of workers allocated per "
                                  "sample will be one less than specified here. Otherwise, each sample's "
                                  "subcluster will get its own master node. Default 9 in production mode, "
                                  "3 in mock mode.")
    pipeline_sp.add_argument('-SD', '--sequence_dir', default='sequence',
                             help='Directory where raw sequences are.')
    pipeline_sp.add_argument('-R', '--reference_genome', default='GRCh38', choices=['GRCh38', 'hg19'],
                             help='Reference genome to align and call against. Choose between GRCh38 and hg19.')

    # Launch metric collection
    metric_sp = subparsers.add_parser('launch-metrics',
                                      help='Launches metric collection thread')
    metric_sp.add_argument('-j', '--jobstore', required=True,  # differs subtly from launch-pipeline's --jobstore
                           help='Name of jobstore')
    metric_sp.add_argument('--namespace', default=os.environ.get('CGCLOUD_NAMESPACE', '/'),
                           help='CGCloud NameSpace')
    metric_sp.add_argument('--spark-sample-slots', required=False, default=0, type=int,
                           help='The maximum number of samples to be computed concurrently on a standalone Spark '
                                'cluster managed by this script. To be used in conjunction with the --master_ip=auto '
                                'option of the launch-pipeline command. The default of 0 disables the standalone '
                                'Spark cluster.')

    # Common options
    cgcloud_zone = os.environ.get('CGCLOUD_ZONE')
    for sp in cluster_sp, pipeline_sp, metric_sp:
        sp.add_argument('-c', '--cluster-name', required=True,
                        help='The CGCloud cluster name for Toil leader and workers.')
        sp.add_argument('-z', '--zone', required=cgcloud_zone is None, default=cgcloud_zone,
                        help="The EC2 availability zone in which to place on-demand instances like the leaders of "
                             "the Toil and standalone Spark clusters. Also determines the region of the S3 bucket "
                             "and SDB domain for Toil's job store. The availability zone for spot instances may be "
                             "chosen independently from all zones in the region containing the specified zone.")
    for sp in cluster_sp, metric_sp:
        sp.add_argument('-t', '--instance-type', default='r3.8xlarge' if not mock_mode() else 'c3.large',
                        help='Worker instance type, e.g. m4.large or c3.8xlarge. Defaults to r3.8xlarge in '
                             'production mode. Will always use c3.large in mock mode, regardless of input value.')
        sp.add_argument('--spot-price', default=None, required=False,
                        help='Instance spot price if desired.')
    for sp in metric_sp, cluster_sp:
        sp.add_argument('-etc', '--add-to-etc-hosts', default=None, required=False,
                        help='Deprecated. Optional entry to add to /etc/hosts on Toil workers. This should *not* be '
                             'used to communicate the address of a standalone Spark master to driver jobs running '
                             'on Toil nodes. Use --master_ip=auto instead.')

    params = parser.parse_args()
    if params.command == 'launch-pipeline' and mock_mode() and params.master_ip:
        params.spark_sample_slots = 1

    if params.command == 'launch-cluster':
        launch_cluster(params)
        place_boto_on_leader(params)
    elif params.command == 'launch-pipeline':
        launch_pipeline(params)
    elif params.command == 'launch-metrics':
        manage_metrics_and_cluster_scaling(params)
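# A typical invocation sequence (hypothetical; the exact entry point depends on
# how the package is installed, shown here via `python -m`, and the cluster
# name, zone, share path, and bucket are example values):
#
#   python -m toil_scripts.adam_uberscript.adam_uberscript launch-cluster -c my-cluster -z us-west-2a -S ~/share
#   python -m toil_scripts.adam_uberscript.adam_uberscript launch-pipeline -c my-cluster -z us-west-2a -B my-bucket
#   python -m toil_scripts.adam_uberscript.adam_uberscript launch-metrics -c my-cluster -z us-west-2a -j <jobstore>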
def main(): """ This is a Toil pipeline used to perform alignment of fastqs. """ # Define Parser object and add to Toil if mock_mode(): usage_msg = 'You have the TOIL_SCRIPTS_MOCK_MODE environment variable set, so this pipeline ' \ 'will run in mock mode. To disable mock mode, set TOIL_SCRIPTS_MOCK_MODE=0' else: usage_msg = None parser = argparse.ArgumentParser(usage=usage_msg) subparsers = parser.add_subparsers(dest='command') subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.') subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.') subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.') # Run subparser parser_run = subparsers.add_parser('run', help='Runs the ADAM/GATK pipeline') default_config = 'adam-gatk-mock.config' if mock_mode() else 'adam-gatk.config' default_manifest = 'adam-gatk-mock-manifest.csv' if mock_mode() else 'adam-gatk-manifest.csv' parser_run.add_argument('--config', default=default_config, type=str, help='Path to the (filled in) config file, generated with "generate-config".') parser_run.add_argument('--manifest', default=default_manifest, type=str, help='Path to the (filled in) manifest file, generated with "generate-manifest". ' '\nDefault value: "%(default)s".') Job.Runner.addToilOptions(parser_run) args = parser.parse_args() cwd = os.getcwd() if args.command == 'generate-config' or args.command == 'generate': generate_file(os.path.join(cwd, default_config), generate_config) if args.command == 'generate-manifest' or args.command == 'generate': generate_file(os.path.join(cwd, default_manifest), generate_manifest) # Pipeline execution elif args.command == 'run': require(os.path.exists(args.config), '{} not found. Please run ' 'generate-config'.format(args.config)) if not hasattr(args, 'sample'): require(os.path.exists(args.manifest), '{} not found and no samples provided. Please ' 'run "generate-manifest"'.format(args.manifest)) # Parse config parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()} inputs = argparse.Namespace(**parsed_config) # Parse manifest file uuid_list = [] with open(args.manifest) as f_manifest: for line in f_manifest: if not line.isspace() and not line.startswith('#'): uuid_list.append(line.strip()) inputs.sort = False if not inputs.dir_suffix: inputs.dir_suffix = '' if not inputs.s3_bucket: inputs.s3_bucket = '' if inputs.master_ip and inputs.num_nodes: raise ValueError("Exactly one of master_ip (%s) and num_nodes (%d) must be provided." % (inputs.master_ip, inputs.num_nodes)) if not hasattr(inputs, 'master_ip') and inputs.num_nodes <= 1: raise ValueError('num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater ' 'than 1. %d was passed.' % inputs.num_nodes) if (inputs.pipeline_to_run != "adam" and inputs.pipeline_to_run != "gatk" and inputs.pipeline_to_run != "both"): raise ValueError("pipeline_to_run must be either 'adam', 'gatk', or 'both'. %s was passed." % inputs.pipeline_to_run) Job.Runner.startToil(Job.wrapJobFn(sample_loop, uuid_list, inputs), args)