def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Upload file hdfs_name from HDFS to S3.
    """
    if mock_mode():
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(job, master_ip, hdfs_name, upload_name, memory=inputs.memory)
    remove_file(master_ip, hdfs_name, spark_on_toil)

def generate_config():
    if mock_mode():
        return generate_mock_config()
    return textwrap.dedent("""
        # ADAM/GATK Pipeline configuration file
        # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
        # Edit the values in this configuration file and then rerun the pipeline
        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
        ##############################################################################################################
        pipeline-to-run: both
        skip-alignment: False
        skip-preprocessing: False
        sequence-dir: sequence
        autoscale-cluster: False
        s3-bucket:              # S3 Bucket URI
        cpu-count:              # Optional:
        program-unit: 12345
        platform: ILLUMINA
        ref:                    # Required: Reference fasta file
        amb:                    # Required: Reference fasta file (amb)
        ann:                    # Required: Reference fasta file (ann)
        bwt:                    # Required: Reference fasta file (bwt)
        pac:                    # Required: Reference fasta file (pac)
        sa:                     # Required: Reference fasta file (sa)
        fai:                    # Required: Reference fasta file (fai)
        alt:                    # Optional: Alternate file for reference build (alt). Necessary for alt aware alignment.
        phase:                  # Required: URL (1000G_phase1.indels.hg19.sites.fixed.vcf)
        mills:                  # Required: URL (Mills_and_1000G_gold_standard.indels.hg19.sites.vcf)
        dbsnp:                  # Required: URL (dbsnp_132_b37.leftAligned.vcf)
        hapmap:                 # Required: URL (hapmap_3.3.b37.vcf)
        omni:                   # Required: URL (1000G_omni.5.b37.vcf)
        trim-adapters: False    # Trim adapters.
        num-nodes: 9            # Number of nodes to use. Do not set if providing master_ip.
        master-ip:              # Optional: IP or hostname of host running for Spark master and HDFS namenode.
                                # Should be provided instead of num-nodes if pointing at a static (external or
                                # standalone) Spark cluster. The special value 'auto' indicates the master of
                                # an externally autoscaled cgcloud spark cluster, i.e. one that is managed by
                                # the uberscript.
        file-size: 100G         # Approximate input file size. Should be given as %d[TGMK], e.g.,
                                # for a 100 gigabyte file, use file_size: '100G'
        ssec:                   # Optional: (string) Path to Key File for SSE-C Encryption
        dir-suffix:             # Optional: suffix to add to output directory names.
        memory:                 # Required: Amount of available memory on each worker node.
    """[1:])
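
# For illustration only: a hypothetical, partially filled-in config as the 'run'
# subcommand below would consume it. The bucket and reference URLs are invented
# placeholders, not real locations, and the remaining required reference/VCF
# keys follow the same pattern as 'ref':
#
#   pipeline-to-run: both
#   s3-bucket: s3://my-output-bucket
#   program-unit: 12345
#   platform: ILLUMINA
#   ref: s3://my-ref-bucket/hg19.fa
#   num-nodes: 9
#   file-size: 100G
#   memory: 60g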

def generate_manifest():
    if mock_mode():
        return generate_mock_manifest()
    return textwrap.dedent("""
        # Edit this manifest to include information pertaining to each sample to be run.
        # There is a single column: UUID
        #
        # UUID      This should be a unique identifier for the sample to be processed that corresponds to
        #           the prefix of the filenames of the input fastq files.
        #
        # Example:
        # If your input fastq file pairs were input_file_name_1.illumina_1.fastq.gz, input_file_name_1.illumina_2.fastq.gz and
        # input_file_name_2.illumina_1.fastq.gz, input_file_name_2.illumina_2.fastq.gz, the manifest would be:
        #
        # input_file_name_1.illumina
        # input_file_name_2.illumina
        #
        # Input fastq files MUST be named according to the filename_1.fastq.gz, filename_2.fastq.gz convention
        #
        # Place your samples below, one per line.
    """[1:])

def main():
    """
    This is a Toil pipeline used to perform alignment of fastqs.
    """
    # Define Parser object and add to Toil
    if mock_mode():
        usage_msg = 'You have the TOIL_SCRIPTS_MOCK_MODE environment variable set, so this pipeline ' \
                    'will run in mock mode. To disable mock mode, set TOIL_SCRIPTS_MOCK_MODE=0'
    else:
        usage_msg = None

    parser = argparse.ArgumentParser(usage=usage_msg)
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the ADAM/GATK pipeline')
    default_config = 'adam-gatk-mock.config' if mock_mode() else 'adam-gatk.config'
    default_manifest = 'adam-gatk-mock-manifest.csv' if mock_mode() else 'adam-gatk-manifest.csv'
    parser_run.add_argument('--config', default=default_config, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    parser_run.add_argument('--manifest', default=default_manifest, type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s".')
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, default_config), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, default_manifest), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                '"generate-config"'.format(args.config))
        if not hasattr(args, 'sample'):
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                    'run "generate-manifest"'.format(args.manifest))
        # Parse config: YAML keys use dashes, attribute names use underscores
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)

        # Parse manifest file, skipping blank lines and comments
        uuid_list = []
        with open(args.manifest) as f_manifest:
            for line in f_manifest:
                if not line.isspace() and not line.startswith('#'):
                    uuid_list.append(line.strip())

        inputs.sort = False
        if not inputs.dir_suffix:
            inputs.dir_suffix = ''
        if not inputs.s3_bucket:
            inputs.s3_bucket = ''

        if inputs.master_ip and inputs.num_nodes:
            raise ValueError("Exactly one of master_ip (%s) and num_nodes (%d) must be provided." %
                             (inputs.master_ip, inputs.num_nodes))

        # A blank master-ip in the config parses to None, so test the value rather
        # than hasattr(): num_nodes only applies when we stand up our own cluster.
        if not inputs.master_ip and inputs.num_nodes <= 1:
            raise ValueError('num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater '
                             'than 1. %d was passed.' % inputs.num_nodes)

        if inputs.pipeline_to_run not in ('adam', 'gatk', 'both'):
            raise ValueError("pipeline_to_run must be either 'adam', 'gatk', or 'both'. %s was passed." %
                             inputs.pipeline_to_run)

        Job.Runner.startToil(Job.wrapJobFn(sample_loop, uuid_list, inputs), args)
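
# A typical invocation, for illustration (the script name and job store locator
# are hypothetical; the job store positional argument comes from
# Job.Runner.addToilOptions):
#
#   python align_and_call.py generate
#   # ... fill in adam-gatk.config and adam-gatk-manifest.csv ...
#   python align_and_call.py run \
#       --config adam-gatk.config --manifest adam-gatk-manifest.csv \
#       aws:us-west-2:my-jobstore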