def parse_samples(path_to_manifest):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param str path_to_manifest: Path to configuration file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if line.isspace() or line.startswith('#'):
                continue
            sample = line.strip().split('\t')
            if len(sample) != 2:
                raise UserError('Bad manifest format! Expected 2 tab-separated columns, got: {}'.format(sample))

            # If a directory is passed in, use all samples in that directory
            uuid, url = sample
            if urlparse(url).scheme == '':
                url = ['file://' + os.path.join(url, x) for x in os.listdir(url)]
            # If url is a tarball
            elif url.endswith('tar.gz') or url.endswith('tar'):
                require(urlparse(url).scheme in SCHEMES,
                        'URL "{}" not valid. Schemes: {}'.format(url, SCHEMES))
                url = [url]
            # If URL is a fastq or series of fastqs
            elif url.endswith('fastq.gz') or url.endswith('fastq') or url.endswith('fq.gz') or url.endswith('fq'):
                url = url.split(',')
                [require(urlparse(x).scheme in SCHEMES,
                         'URL "{}" not valid. Schemes: {}'.format(x, SCHEMES)) for x in url]
            else:
                raise UserError('URL does not have approved extension: .tar.gz, .tar, .fastq.gz, .fastq, .fq.gz, .fq')

            sample = [uuid, url]
            samples.append(sample)
    return samples
def _run_toil_marginPhase(self, identifier, partition_size, partition_margin):
    # prep
    jobStore = os.path.join(self.workdir, 'toil-jobstore')
    work_dir = os.path.join(self.workdir, 'toil-workdir')
    os.mkdir(work_dir)

    # run toil
    toil_command = ['toil-marginphase', 'run',
                    '--config', self._generate_config(partition_size, partition_margin),
                    '--manifest', self._generate_manifest(identifier),
                    '--workDir', work_dir,
                    jobStore]
    log.info('Running %r', toil_command)
    subprocess.check_call(toil_command)

    # validate output
    extract_command = ['tar', 'xvf', "{}.tar.gz".format(identifier)]
    subprocess.check_call(extract_command, cwd=self.toil_outputdir)
    output_vcf_name = MarginPhaseTest.OUT_TOIL_VCF_FORMAT.format(identifier)
    full_merged_vcf = os.path.join(self.toil_outputdir, output_vcf_name)
    if not os.path.isfile(full_merged_vcf):
        contents = subprocess.check_output(['ls', '-la'], cwd=self.toil_outputdir)
        raise UserError("toil output vcf '{}' not found in directory '{}' with contents:\n{}".format(
            output_vcf_name, self.toil_outputdir, contents))

    # save and return
    self.toil_full_merged_vcf = full_merged_vcf
    return full_merged_vcf
def parse_samples(config, path_to_manifest):
    """
    Parses samples, specified in either a manifest or listed with --samples

    :param Namespace config: pipeline configuration, supplies default contig, reference, and params
    :param str path_to_manifest: Path to configuration file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if line.isspace() or line.startswith('#'):
                continue
            sample = line.strip().split('\t')

            # validate structure
            if len(sample) < 2:
                raise UserError('Bad manifest format! Required at least 2 tab-separated columns, got: {}'
                                .format(sample))
            if len(sample) > 6:
                raise UserError('Bad manifest format! Required at most 6 tab-separated columns, got: {}'
                                .format(sample))

            # extract sample parts
            uuid = sample[0]
            url = sample[1]
            contig_name, reference_url, params_url = "", "", ""
            if len(sample) > 2: contig_name = sample[2]
            if len(sample) > 3: reference_url = sample[3]
            if len(sample) > 4: params_url = sample[4]

            # fill defaults
            if len(contig_name) == 0: contig_name = config.default_contig
            if len(reference_url) == 0: reference_url = config.default_reference
            if len(params_url) == 0: params_url = config.default_params

            sample = [uuid, url, contig_name, reference_url, params_url]
            samples.append(sample)
    return samples
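
# Hedged usage sketch for the config-aware parse_samples above. All UUIDs, paths, and the
# attribute values below are illustrative; only the attribute names (default_contig,
# default_reference, default_params) come from the generated config handled in main().
#
#   config = argparse.Namespace(default_contig='chr20',
#                               default_reference='file:///refs/hg38.chr20.fa',
#                               default_params='file:///refs/params.json')
#   # manifest.tsv contains one line: "UUID_0001\tfile:///data/np0001.bam"
#   samples = parse_samples(config, 'manifest.tsv')
#   # -> [['UUID_0001', 'file:///data/np0001.bam', 'chr20',
#   #      'file:///refs/hg38.chr20.fa', 'file:///refs/params.json']]
#
# A manifest row may also supply columns 3-5 explicitly, in which case the config defaults
# are not used for those fields.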
def _get_mount_path(self):
    """
    Returns the path of the mount point of the current container. If this method is invoked
    outside of a Docker container a NotInsideContainerError is raised. Likewise, if the Docker
    daemon is unreachable from inside the container a UserError is raised. This method is
    idempotent.
    """
    if self._mount_path is None:
        name = current_docker_container_id()
        if dockerd_is_reachable():
            # Get name of mounted volume
            blob = json.loads(subprocess.check_output(['docker', 'inspect', name]))
            mounts = blob[0]['Mounts']
            # Ensure docker.sock is mounted correctly
            sock_mnt = [x['Source'] == x['Destination'] for x in mounts if 'docker.sock' in x['Source']]
            require(len(sock_mnt) == 1,
                    'Missing socket mount. Requires the following: '
                    'docker run -v /var/run/docker.sock:/var/run/docker.sock')
            # Ensure formatting of command for 2 mount points
            if len(mounts) == 2:
                require(all(x['Source'] == x['Destination'] for x in mounts),
                        'Docker Src/Dst mount points, invoked with the -v argument, '
                        'must be the same if only using one mount point aside from the docker '
                        'socket.')
                work_mount = [x['Source'] for x in mounts if 'docker.sock' not in x['Source']]
            else:
                # Ensure only one mirror mount exists aside from docker.sock
                mirror_mounts = [x['Source'] for x in mounts if x['Source'] == x['Destination']]
                work_mount = [x for x in mirror_mounts if 'docker.sock' not in x]
                require(len(work_mount) == 1,
                        'Wrong number of mirror mounts provided, see documentation.')
            self._mount_path = work_mount[0]
            log.info('The work mount is: %s', self._mount_path)
        else:
            raise UserError('Docker daemon is not reachable, ensure Docker is being run with: '
                            '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument.')
    return self._mount_path
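
# Hedged usage sketch: the checks above expect the wrapper container to be started with the
# docker socket mirrored plus exactly one mirrored work mount. The image name and host paths
# below are illustrative only:
#
#   docker run \
#       -v /var/run/docker.sock:/var/run/docker.sock \
#       -v /home/user/run:/home/user/run \
#       <pipeline-wrapper-image> ...
#
# With that invocation, _get_mount_path() would resolve to '/home/user/run'.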
def require_docker_file_output(job, config, work_dir, output_filenames, function_id,
                               log_filename=None, max_directory_contents=None, max_log_lines=None):
    missing_filenames = list(filter(lambda x: not os.path.exists(os.path.join(work_dir, x)), output_filenames))
    if len(missing_filenames) > 0:
        # document missing
        log(job, "Missing files after docker call: ", config.uuid, function_id)
        for missing in missing_filenames:
            log(job, "\t{}".format(missing), config.uuid, function_id)

        # document contents (truncated to max_directory_contents if configured)
        directory_contents = os.listdir(work_dir)
        if max_directory_contents is not None and len(directory_contents) > max_directory_contents:
            total_contents = len(directory_contents)
            directory_contents = directory_contents[0:max_directory_contents]
            directory_contents.append("[{} items total]".format(total_contents))
        log(job, "Current files in work_dir: {}".format(work_dir), config.uuid, function_id)
        for item in directory_contents:
            log(job, "\t{}".format(item), config.uuid, function_id)

        # document log
        if log_filename is not None:
            log_location = os.path.join(work_dir, log_filename)
            if os.path.isfile(log_location):
                log(job, "Log file contents: {}".format(log_filename), config.uuid, function_id)
                log_lines = 0
                with open(log_location) as log_stream:
                    for ll in log_stream:
                        if max_log_lines is None or log_lines < max_log_lines:
                            log(job, "\t{}".format(ll.rstrip()), config.uuid, function_id)
                        log_lines += 1
                if max_log_lines is not None and log_lines <= max_log_lines:
                    log(job, "\t[{} lines total]".format(log_lines), config.uuid, function_id)
            else:
                log(job, "Log file {} was not found".format(log_filename), config.uuid, function_id)

        # die
        raise UserError("Missing files after running {} on {}: {}".format(
            function_id, config.uuid, missing_filenames))
def _run_docker_marginPhase(self, identifier):
    # prep
    shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_FA),
                os.path.join(self.exec_outputdir, MarginPhaseTest.IN_REF_FA))
    shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_VCF),
                os.path.join(self.exec_outputdir, MarginPhaseTest.IN_REF_VCF))
    shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_BAM),
                os.path.join(self.exec_outputdir, MarginPhaseTest.IN_BAM))
    shutil.copy(os.path.join(TOIL_TEST_DIR, MarginPhaseTest.IN_PARAMS),
                os.path.join(self.exec_outputdir, MarginPhaseTest.IN_PARAMS))

    # run docker
    docker_command = ['docker', 'run', '--rm',
                      '-v', "{}:/data".format(self.exec_outputdir),
                      # '-it',
                      MarginPhaseTest.DOCKER_MARGIN_PHASE,
                      "/data/{}".format(MarginPhaseTest.IN_BAM),
                      "/data/{}".format(MarginPhaseTest.IN_REF_FA),
                      "-p", "/data/{}".format(MarginPhaseTest.IN_PARAMS),
                      "-o", "/data/{}".format(identifier),
                      # "-r", "/data/{}".format(MarginPhaseTest.IN_REF_VCF),
                      "-a", "info",
                      "-v", "0"]
    log.info('Running %r', docker_command)
    subprocess.check_call(docker_command)

    # validate output
    output_vcf_name = MarginPhaseTest.OUT_EXEC_VCF_FORMAT.format(identifier)
    output_vcf = os.path.join(self.exec_outputdir, output_vcf_name)
    if not os.path.isfile(output_vcf):
        contents = subprocess.check_output(['ls', '-la'], cwd=self.exec_outputdir)
        raise UserError("exec output vcf '{}' not found in directory '{}' with contents:\n{}".format(
            output_vcf_name, self.exec_outputdir, contents))

    # save and return
    self.exec_output_vcf = output_vcf
    return output_vcf
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies

    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """

    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser('generate-config',
                          help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest',
                          help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate',
                          help='Generates a config and manifest in the current working directory.')

    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument('--config', default=DEFAULT_CONFIG_NAME, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config". '
                                 '\nDefault value: "%(default)s"')
    group.add_argument('--manifest', default=DEFAULT_MANIFEST_NAME, type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME), generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(os.path.exists(args.config), '{} not found. Please run '
                '"toil-marginphase generate-config"'.format(args.config))
        require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint
        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size, "Configuration parameter partition-size is required")
        require(config.partition_margin, "Configuration parameter partition-margin is required")

        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError("Config parameter 'save_intermediate_files' cannot be used with s3 output directory")
        else:
            intermediate_location = os.path.join(config.output_dir, "intermediate",
                                                 datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location
        if "margin_phase_image" not in config or len(config.margin_phase_image) == 0:
            config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT
        if "margin_phase_tag" not in config or len(config.margin_phase_tag) == 0:
            config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT
        if "cpecan_image" not in config or len(config.cpecan_image) == 0:
            config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT
        if "cpecan_tag" not in config or len(config.cpecan_tag) == 0:
            config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT
        if "unittest" not in config:
            config.unittest = False
        if "minimal_output" not in config:
            config.minimal_output = False
        if "minimal_cpecan_output" not in config:
            config.minimal_cpecan_output = False
        if "cpecan_probabilities" not in config:
            config.cpecan_probabilities = False

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            require(next(which(program), None),
                    '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(Job.wrapJobFn(map_job, prepare_input, samples, config), args)
def merge_chunks(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    log(job, "{}".format(datetime.datetime.now()), uuid, 'merge_chunks')
    log(job, "Merging {} chunks".format(len(chunk_infos)), uuid, 'merge_chunks')
    if config.minimal_output:
        log(job, "Minimal output is configured, will only save full chromosome vcf and merged BAMs",
            uuid, 'merge_chunks')

    # work directory for tar management
    # output files
    merged_chunks_directory = os.path.join(work_dir, ID_MERGED)
    os.mkdir(merged_chunks_directory)
    full_merged_vcf_file = os.path.join(merged_chunks_directory, "{}.merged.vcf".format(config.uuid))
    full_merged_sam_file = os.path.join(merged_chunks_directory, "{}.merged.sam".format(config.uuid))

    # sort by chunk index and validate
    chunk_infos.sort(key=(lambda x: x[CI_CHUNK_INDEX]))
    idx = 0
    missing_indices = []
    for ci in chunk_infos:
        while ci[CI_CHUNK_INDEX] > idx:
            missing_indices.append(idx)
            idx += 1
        idx += 1
    if len(missing_indices) > 0:
        log(job, "Found {} missing indices: {}".format(len(missing_indices), missing_indices),
            uuid, 'merge_chunks')

    # prep for iteration
    merge_decisions = dict()
    prev_chunk_workdir = ""
    prev_chunk_sam_file = None
    prev_chunk_vcf_file = None
    prev_chunk = {CI_CHUNK_INDEX: "start"}
    prev_written_reads = set()
    prev_vcf_split_pos = None
    prev_vcf_phase_action = None

    # iterate over all chunks
    for chunk in chunk_infos:

        # get current chunk info/files
        chunk_idx = chunk[CI_CHUNK_INDEX]
        chunk_boundary = chunk[CI_CHUNK_BOUNDARY_START]
        merging_step_identifier = "{}:{}-{}".format(config.uuid, prev_chunk[CI_CHUNK_INDEX], chunk[CI_CHUNK_INDEX])
        curr_chunk_workdir = os.path.join(work_dir, "tmp-{}".format(chunk_idx))
        curr_chunk_sam_file, curr_chunk_vcf_file = merge_chunks__extract_chunk_tarball(
            job, config, curr_chunk_workdir, chunk)
        log(job, "merging {} and {} across boundary {}".format(prev_chunk[CI_CHUNK_INDEX], chunk_idx,
                                                               chunk_boundary), uuid, 'merge_chunks')

        # error out if missing files
        if curr_chunk_sam_file is None or curr_chunk_vcf_file is None:
            error = "{}: Missing expected output file, sam:{}, vcf:{}, chunk_info:{}".format(
                chunk_idx, curr_chunk_sam_file, curr_chunk_vcf_file, chunk)
            log(job, error, uuid, 'merge_chunks')
            job.fileStore.logToMaster(error)
            if CONTINUE_AFTER_FAILURE:
                # prev chunk info is maintained, and will be written during next chunk
                continue
            raise UserError("{}:{}".format(uuid, error))

        # skip writing the first chunk
        if prev_chunk_sam_file is None:
            curr_written_reads = set()
            curr_vcf_split_pos = 0
            curr_vcf_phase_action = dict()

        # write the rest of the chunks
        else:
            # get chunk splitting
            prev_reads, curr_reads, curr_vcf_split_pos, curr_vcf_phase_action, decision_summary = \
                merge_chunks__determine_chunk_splitting(job, merging_step_identifier, prev_chunk_sam_file,
                                                        curr_chunk_sam_file, chunk_boundary)
            merge_decisions[decision_summary] = \
                merge_decisions[decision_summary] + 1 if decision_summary in merge_decisions else 1

            # write sam
            curr_written_reads = merge_chunks__append_sam_reads(job, merging_step_identifier, prev_chunk_sam_file,
                                                                full_merged_sam_file, prev_reads,
                                                                prev_written_reads)
            if len(curr_reads) > 0:
                curr_written_right_reads = merge_chunks__append_sam_reads(job, merging_step_identifier,
                                                                          curr_chunk_sam_file, full_merged_sam_file,
                                                                          curr_reads, curr_written_reads)
                curr_written_reads = curr_written_reads.union(curr_written_right_reads)

            # write vcf
            merge_chunks__append_vcf_calls(job, merging_step_identifier, prev_chunk_vcf_file, full_merged_vcf_file,
                                           prev_vcf_split_pos, curr_vcf_split_pos, prev_vcf_phase_action,
                                           mp_identifier=prev_chunk[CI_CHUNK_INDEX])

        # cleanup
        if os.path.isdir(prev_chunk_workdir):
            shutil.rmtree(prev_chunk_workdir)

        # iterate
        prev_chunk = chunk
        prev_chunk_workdir = curr_chunk_workdir
        prev_chunk_sam_file = curr_chunk_sam_file
        prev_chunk_vcf_file = curr_chunk_vcf_file
        prev_written_reads = curr_written_reads
        prev_vcf_split_pos = curr_vcf_split_pos
        prev_vcf_phase_action = curr_vcf_phase_action

    # write the final reads and calls
    merging_step_identifier = "{}:{}-{}".format(config.uuid, prev_chunk[CI_CHUNK_INDEX], "end")
    merge_chunks__append_sam_reads(job, merging_step_identifier, prev_chunk_sam_file, full_merged_sam_file,
                                   {None: None}, prev_written_reads)
    merge_chunks__append_vcf_calls(job, merging_step_identifier, prev_chunk_vcf_file, full_merged_vcf_file,
                                   prev_vcf_split_pos, sys.maxint, prev_vcf_phase_action,
                                   mp_identifier=prev_chunk[CI_CHUNK_INDEX])

    # log it
    log(job, "Finished merge with following matches:", uuid, 'merge_chunks')
    job.fileStore.logToMaster("{}:merge_chunks: ".format(config.uuid))
    for decision, count in merge_decisions.items():
        log(job, "\t\t{}: \t{}".format(decision, count), uuid, 'merge_chunks')

    # tarball the output and save
    log(job, "Output files for merge:", uuid, 'merge_chunks')
    output_file_locations = glob.glob(os.path.join(merged_chunks_directory, "*.*"))
    output_file_locations.sort()
    tmp = output_file_locations
    output_file_locations = list()
    for f in tmp:
        if os.path.isdir(f):
            log(job, "\t\t{} (skipped, directory)".format(os.path.basename(f)), uuid, 'merge_chunks')
        else:
            log(job, "\t\t{}".format(os.path.basename(f)), uuid, 'merge_chunks')
            output_file_locations.append(f)
    tarball_name = "{}.merged.tar.gz".format(config.uuid)
    tarball_files(tar_name=tarball_name, file_paths=output_file_locations, output_dir=work_dir)
    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))

    # we need to return the input list of chunk infos for consolidation
    chunk_infos.append({CI_UUID: config.uuid, CI_OUTPUT_FILE_ID: output_file_id, CI_CHUNK_INDEX: ID_MERGED})

    log_generic_job_debug(job, config.uuid, "merge_chunks", work_dir=work_dir)
    log_time(job, "merge_chunks", start, config.uuid)
    return chunk_infos
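
# Hedged sketch of what merge_chunks returns (keys are the CI_* constants; values are
# illustrative): the original per-chunk dicts plus one appended entry whose CI_CHUNK_INDEX
# is ID_MERGED and whose CI_OUTPUT_FILE_ID points at the merged tarball in the job store.
# consolidate_output consumes exactly this list.
#
#   [{CI_UUID: 'UUID_0001', CI_CHUNK_INDEX: 0,         CI_OUTPUT_FILE_ID: <FileID>, ...},
#    {CI_UUID: 'UUID_0001', CI_CHUNK_INDEX: 1,         CI_OUTPUT_FILE_ID: <FileID>, ...},
#    {CI_UUID: 'UUID_0001', CI_CHUNK_INDEX: ID_MERGED, CI_OUTPUT_FILE_ID: <FileID>}]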
def prepare_input(job, sample, config, enqueue_consolidation=True):

    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(job, "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}".format(
        url, contig_name, reference_url, params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL queries are so this function can be imported

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir, ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
                                     os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(pysam.AlignmentFile(workdir_bam_location,
                                     'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(start_idx, end_idx), uuid, 'prepare_input')

    # get reads from positions
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid, 'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:

        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name, chunk_start, chunk_end)
        bam_split_command = ["view", "-b", data_bam_location, chunk_position_description]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job, config, work_dir, bam_split_command, DOCKER_SAMTOOLS_IMG, DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir, chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(job, "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
            chunk_position_description, idx, chunk_size, int(chunk_size / 1024 / 1024), read_count),
            uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location], output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(min(int(chunk_size * MP_MEM_BAM_FACTOR + ref_genome_size * MP_MEM_REF_FACTOR),
                             config.maxMemory))
            mp_disk = int(min(int(chunk_size * MP_DSK_BAM_FACTOR + ref_genome_size * MP_DSK_REF_FACTOR +
                                  (0 if config.cpecan_probabilities else MP_DSK_CPECAN_FACTOR) * chunk_size),
                              config.maxDisk))
            log(job, "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem, int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase, config, chunk_fileid, ci,
                                                 memory=mp_mem, cores=mp_cores, disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output, config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
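
# Worked example of the chunking arithmetic in prepare_input (values illustrative): with
# partition_size=2000000, partition_margin=5000, and start_idx=1000000, consecutive chunk
# boundaries abut while the fetched regions overlap by twice the margin:
#
#   chunk 0: boundary [1000000, 3000000), fetched region [ 995000, 3005000)
#   chunk 1: boundary [3000000, 5000000), fetched region [2995000, 5005000)
#
# The margin overlap is what merge_chunks later reconciles when stitching reads and calls
# back together across chunk boundaries.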
def run_margin_phase(job, config, chunk_file_id, chunk_info):
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier, 'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(chunk_name, chunk_file_id))

    # download references

    #ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid, genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError("Failed to download genome reference {} from {}".format(
            os.path.basename(config.reference_url), config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params_url), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name, genome_reference_name)

    # run marginPhase
    params = [os.path.join("/data", chunk_name),
              os.path.join("/data", genome_reference_name),
              os.path.join("/data", params_name),
              "-o", os.path.join("/data", "{}.out".format(chunk_identifier)),
              '--tag', "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                                         chunk_info[CI_CHUNK_BOUNDARY_END])]
    if cpecan_prob_location is not None:
        params.extend(['--singleNuclProbDir', os.path.join("/data", cpecan_prob_location)])
    docker_call(job, config, work_dir, params, config.margin_phase_image, config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), chunk_identifier, 'margin_phase',
                          [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir, "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier, 'run_margin_phase')
    output_file_locations = glob.glob(os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier, 'run_margin_phase')
        if f.endswith(VCF_SUFFIX):
            found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX):
            found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.", chunk_identifier,
                'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))

        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier, 'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(job, "Found {} cpecan output tarballs: {}".format(len(cpecan_tarball), cpecan_tarball),
                chunk_identifier, 'run_margin_phase')
        else:
            log(job, "Saving cpecan output tarball: {}".format(cpecan_tarball[0]), chunk_identifier,
                'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name, file_paths=output_file_locations, output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
        if config.retry_attempts > MAX_RETRIES:
            log(job, "", chunk_identifier, 'run_margin_phase')
            error = "Failed to generate appropriate output files {} times".format(MAX_RETRIES)
            log(job, error, chunk_identifier, 'run_margin_phase')
            # this enables us to "recover" in the face of failure during a run
            if CONTINUE_AFTER_FAILURE:
                output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
                chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                return chunk_info
            raise UserError("{}:{}".format(chunk_identifier, error))

        log(job, "Missing output files. Attempting retry {}".format(config.retry_attempts),
            chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as log_input:
            for line in log_input:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier, 'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(run_margin_phase, config, chunk_file_id, chunk_info,
                                      memory=str(int(config.maxMemory / 1024)) + "K", cores=job.cores,
                                      disk=job.disk)
        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name), os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job, config.uuid, 'run_margin_phase', work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
def process_sample(job, config, input_tar=None, fastq_ids=None):
    """
    Converts sample.tar(.gz) into a fastq pair (or single fastq if single-ended.)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID input_tar: fileStoreID of the tarball (if applicable)
    :param list[FileID] fastq_ids: FileStoreIDs of fastq files
    :return: FileStoreID from Cutadapt or from fastqs directly if pipeline was run without Cutadapt option
    :rtype: tuple(FileID, FileID)
    """
    job.fileStore.logToMaster('Processing sample: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    processed_r1, processed_r2 = None, None
    # I/O
    if input_tar:
        job.fileStore.readGlobalFile(input_tar, os.path.join(work_dir, 'sample.tar'))
        tar_path = os.path.join(work_dir, 'sample.tar')
        # Untar File and concat
        subprocess.check_call(['tar', '-xvf', tar_path, '-C', work_dir], stderr=PIPE, stdout=PIPE)
        job.fileStore.deleteGlobalFile(input_tar)
    else:
        ext = '.fq.gz' if config.gz else '.fq'
        for i, fastq_id in enumerate(fastq_ids):
            if i % 2 == 0:
                job.fileStore.readGlobalFile(fastq_id, os.path.join(work_dir, 'Fastq_{}_R1{}'.format(i, ext)))
            else:
                job.fileStore.readGlobalFile(fastq_id, os.path.join(work_dir, 'Fastq_{}_R2{}'.format(i, ext)))
    fastqs = []
    for root, subdir, files in os.walk(work_dir):
        fastqs.extend([os.path.join(root, x) for x in files])
    if config.paired:
        r1, r2 = [], []
        # Pattern convention: Look for "R1" / "R2" in the filename, or "_1" / "_2" before the extension
        pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
        for fastq in sorted(fastqs):
            match = pattern.search(os.path.basename(fastq))
            if not match:
                raise UserError('FASTQ file name fails to meet required convention for paired reads '
                                '(see documentation). ' + fastq)
            elif '1' in match.group():
                r1.append(fastq)
            elif '2' in match.group():
                r2.append(fastq)
            else:
                assert False, match.group()
        require(len(r1) == len(r2),
                'Check fastq names, uneven number of pairs found.\nr1: {}\nr2: {}'.format(r1, r2))
        # Concatenate fastqs
        command = 'zcat' if r1[0].endswith('.gz') and r2[0].endswith('.gz') else 'cat'

        # If sample is already a single R1 / R2 fastq
        if command == 'cat' and len(fastqs) == 2:
            processed_r1 = fastqs[0]
            processed_r2 = fastqs[1]
        else:
            with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f1:
                p1 = subprocess.Popen([command] + r1, stdout=f1)
            with open(os.path.join(work_dir, 'R2.fastq'), 'w') as f2:
                p2 = subprocess.Popen([command] + r2, stdout=f2)
            p1.wait()
            p2.wait()
            processed_r1 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
            processed_r2 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fastq'))
        disk = 2 * (processed_r1.size + processed_r2.size)
    else:
        command = 'zcat' if fastqs[0].endswith('.gz') else 'cat'
        if command == 'cat' and len(fastqs) == 1:
            processed_r1 = fastqs[0]
        else:
            with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f:
                subprocess.check_call([command] + fastqs, stdout=f)
            processed_r1 = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
        disk = 2 * processed_r1.size
    # Start cutadapt step
    if config.cutadapt:
        return job.addChildJobFn(run_cutadapt, processed_r1, processed_r2,
                                 config.fwd_3pr_adapter, config.rev_3pr_adapter, disk=disk).rv()
    else:
        return processed_r1, processed_r2
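
# Hedged examples of paired FASTQ names accepted by the pattern in process_sample (file names
# illustrative). The regex r'(?:^|[._-])(R[12]|[12]\.f)' groups files like so:
#
#   sample_R1.fastq.gz / sample_R2.fastq.gz   -> r1 / r2
#   sample_1.fq.gz     / sample_2.fq.gz       -> r1 / r2
#
# A name with no R1/R2 or _1/_2 marker (e.g. 'sample.fastq.gz') produces no match and raises
# the UserError above when config.paired is set.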
def run(cls, name, desc):
    """
    Prepares and runs the pipeline. Note this method must be invoked both from inside a
    Docker container and while the docker daemon is reachable.

    :param str name: The name of the command to start the workflow.
    :param str desc: The description of the workflow.
    """
    wrapper = cls(name, desc)
    mount_path = wrapper._get_mount_path()
    # prepare parser
    arg_parser = wrapper._create_argument_parser()
    wrapper._extend_argument_parser(arg_parser)
    # prepare config file
    empty_config = wrapper.__get_empty_config()
    config_yaml = ruamel.yaml.load(empty_config)
    wrapper.__populate_parser_from_config(arg_parser, config_yaml)
    args = arg_parser.parse_args()
    for k, v in vars(args).items():
        k = k.replace('_', '-')
        if k in config_yaml:
            config_yaml[k] = v
    config_path = wrapper._get_config_path()
    with open(config_path, 'w') as writable:
        ruamel.yaml.dump(config_yaml, stream=writable)
    # prepare workdir
    workdir_path = os.path.join(mount_path, 'Toil-' + wrapper._name)
    if os.path.exists(workdir_path):
        if args.restart:
            log.info('Reusing temporary directory: %s', workdir_path)
        else:
            raise UserError('Temporary directory {} already exists. Run with --restart '
                            'option or remove directory.'.format(workdir_path))
    else:
        os.makedirs(workdir_path)
        log.info('Temporary directory created: %s', workdir_path)

    command = wrapper._create_pipeline_command(args, workdir_path, config_path)
    wrapper._extend_pipeline_command(command, args)
    # run command
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as e:
        print(e, file=sys.stderr)
    finally:
        stat = os.stat(mount_path)
        log.info('Pipeline terminated, changing ownership of output files in %s from root to '
                 'uid %s and gid %s.', mount_path, stat.st_uid, stat.st_gid)
        chown_command = ['chown', '-R', '%s:%s' % (stat.st_uid, stat.st_gid), mount_path]
        subprocess.check_call(chown_command)
        if args.no_clean:
            log.info('Flag "--no-clean" was used, therefore %s was not deleted.', workdir_path)
        else:
            log.info('Cleaning up temporary directory: %s', workdir_path)
            shutil.rmtree(workdir_path)
def _compare_output(self, work_dir, identifier, docker_vcf_name, toil_vcf_name):
    # prep - get required files
    shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_VCF),
                os.path.join(work_dir, MarginPhaseTest.IN_REF_VCF))
    shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_FA),
                os.path.join(work_dir, MarginPhaseTest.IN_REF_FA))
    reference_sdf_name = "SDF"

    #bgzip
    vcf_bgzip_command = ['docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
                         MarginPhaseTest.DOCKER_RTG_TOOLS, "bgzip",
                         "/data/{}".format(docker_vcf_name),
                         "/data/{}".format(toil_vcf_name),
                         "/data/{}".format(MarginPhaseTest.IN_REF_VCF)]
    log.info('Running %r', vcf_bgzip_command)
    subprocess.check_call(vcf_bgzip_command)

    #index
    vcf_index_command = ['docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
                         MarginPhaseTest.DOCKER_RTG_TOOLS, "index",
                         "/data/{}.gz".format(docker_vcf_name),
                         "/data/{}.gz".format(toil_vcf_name),
                         "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF)]
    log.info('Running %r', vcf_index_command)
    subprocess.check_call(vcf_index_command)

    #sdf
    ref_sdf_command = ['docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
                       MarginPhaseTest.DOCKER_RTG_TOOLS, "format",
                       "-o", "/data/{}".format(reference_sdf_name),
                       "/data/{}".format(MarginPhaseTest.IN_REF_FA)]
    log.info('Running %r', ref_sdf_command)
    subprocess.check_call(ref_sdf_command)

    # vcf eval prep
    toil_to_docker_eval_identifier = "{}-vcfeval_t2d".format(identifier)
    toil_to_ref_eval_identifier = "{}-vcfeval_t2r".format(identifier)
    docker_to_ref_eval_identifier = "{}-vcfeval_d2r".format(identifier)
    vcf_eval_base = ['docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
                     MarginPhaseTest.DOCKER_RTG_TOOLS, "vcfeval",
                     "-t", os.path.join("/data", reference_sdf_name)]

    # EVAL: toil to docker
    vcf_eval_command = list(vcf_eval_base)
    vcf_eval_command.extend(["-o", os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                                                toil_to_docker_eval_identifier),
                             "-b", "/data/{}.gz".format(docker_vcf_name),
                             "-c", "/data/{}.gz".format(toil_vcf_name)])
    log.info('Running %r', vcf_eval_command)
    t2d_vcf_eval_output = subprocess.check_output(vcf_eval_command)
    if MarginPhaseTest.DEBUG:
        shutil.copytree(os.path.join(work_dir, toil_to_docker_eval_identifier),
                        os.path.join(TOIL_TEST_STORAGE_DIR, toil_to_docker_eval_identifier))

    # EVAL: toil to reference
    vcf_eval_command = list(vcf_eval_base)
    vcf_eval_command.extend(["-o", os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                                                toil_to_ref_eval_identifier),
                             "-b", "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF),
                             "-c", "/data/{}.gz".format(toil_vcf_name)])
    log.info('Running %r', vcf_eval_command)
    t2r_vcf_eval_output = subprocess.check_output(vcf_eval_command)
    if MarginPhaseTest.DEBUG:
        shutil.copytree(os.path.join(work_dir, toil_to_ref_eval_identifier),
                        os.path.join(TOIL_TEST_STORAGE_DIR, toil_to_ref_eval_identifier))

    # EVAL: docker to reference
    vcf_eval_command = list(vcf_eval_base)
    vcf_eval_command.extend(["-o", os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                                                docker_to_ref_eval_identifier),
                             "-b", "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF),
                             "-c", "/data/{}.gz".format(docker_vcf_name)])
    log.info('Running %r', vcf_eval_command)
    d2r_vcf_eval_output = subprocess.check_output(vcf_eval_command)
    if MarginPhaseTest.DEBUG:
        shutil.copytree(os.path.join(work_dir, docker_to_ref_eval_identifier),
                        os.path.join(TOIL_TEST_STORAGE_DIR, docker_to_ref_eval_identifier))

    # now we analyze docker and toil as compared to the reference
    t2r_vcf_eval = t2r_vcf_eval_output.split("\n")
    d2r_vcf_eval = d2r_vcf_eval_output.split("\n")
    if len(t2r_vcf_eval) < 3 or len(d2r_vcf_eval) < 3:
        raise UserError("Incorrect format for vcf eval output: len {}/{} (expected at least 3)".format(
            len(t2r_vcf_eval), len(d2r_vcf_eval)))

    header = t2r_vcf_eval[0].split()
    precision_idx = None
    sensitivity_idx = None
    idx = 0
    while idx < len(header):
        if header[idx] == "Precision":
            precision_idx = idx
        if header[idx] == "Sensitivity":
            sensitivity_idx = idx
        idx += 1

    t2r_precision = float(t2r_vcf_eval[2].split()[precision_idx])
    t2r_sensitivity = float(t2r_vcf_eval[2].split()[sensitivity_idx])
    d2r_precision = float(d2r_vcf_eval[2].split()[precision_idx])
    d2r_sensitivity = float(d2r_vcf_eval[2].split()[sensitivity_idx])
    precision_diff = abs(t2r_precision - d2r_precision)
    sensitivity_diff = abs(t2r_sensitivity - d2r_sensitivity)

    if precision_diff > MarginPhaseTest.ACCEPTABLE_PRECISION_DIFFERENCE \
            or sensitivity_diff > MarginPhaseTest.ACCEPTABLE_SENSITIVITY_DIFFERENCE:
        self.fail(("Toil and Docker marginPhase runs have unacceptable difference when compared to the reference:\n"
                   "\tPRECISION \tToil:%5f\tDocker:%5f\tDiff:%5f\n"
                   "\tSENSITIVITY\tToil:%5f\tDocker:%5f\tDiff:%5f")
                  % (t2r_precision, d2r_precision, precision_diff,
                     t2r_sensitivity, d2r_sensitivity, sensitivity_diff))

    return "\nTOIL to DOCKER:\n{}\nTOIL to REFERENCE:\n{}\nDOCKER to REFERENCE:\n{}".format(
        t2d_vcf_eval_output, t2r_vcf_eval_output, d2r_vcf_eval_output)