def graphmap_join_workflow(job, options, config, vg_ids, hal_ids):
    """Wire up the clip -> join -> GFA -> index stages for a set of vg graphs.

    One ``clip_vg`` job is scheduled per (path, id) pair taken from
    ``options.vg`` / ``vg_ids``.  ``join_vg`` follows on once they are all
    done, then one ``vg_to_gfa`` job per graph, and finally ``vg_indexes``
    over all of the GFAs.  When ``hal_ids`` is non-empty a ``merge_hal`` job
    is attached directly under ``job``.

    Returns a 3-tuple: (result of the join job, result of the index job,
    result of the hal merge job or None).
    """
    workflow_root = Job()
    job.addChild(workflow_root)

    # run clip-vg on each input
    clip_results = [
        workflow_root.addChildJobFn(clip_vg, options, config, path, file_id,
                                    disk=file_id.size * 2,
                                    memory=file_id.size * 4).rv()
        for path, file_id in zip(options.vg, vg_ids)
    ]

    # join the ids
    join_job = workflow_root.addFollowOnJobFn(join_vg, options, config, clip_results,
                                              disk=sum(f.size for f in vg_ids))
    joined_vg_ids = join_job.rv()

    # make a gfa for each joined graph, hung off a stage job that runs after the join
    gfa_stage = Job()
    join_job.addFollowOn(gfa_stage)
    gfa_results = []
    for idx, path in enumerate(options.vg):
        gfa_job = gfa_stage.addChildJobFn(vg_to_gfa, options, config, path,
                                          join_job.rv(idx),
                                          disk=vg_ids[idx].size * 5)
        gfa_results.append(gfa_job.rv())

    # merge up the gfas and make the various vg indexes
    index_job = gfa_stage.addFollowOnJobFn(vg_indexes, options, config, gfa_results,
                                           cores=options.indexCores,
                                           disk=sum(f.size for f in vg_ids) * 5)

    # the hal merge does not depend on the graph stages, so it hangs off the caller's job
    merged_hal = None
    if hal_ids:
        merged_hal = job.addChildJobFn(merge_hal, options, hal_ids,
                                       disk=sum(f.size for f in hal_ids) * 2).rv()

    return joined_vg_ids, index_job.rv(), merged_hal
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """Remap the ambiguous sequences from gather_fas against the reference.

    Takes the output table from gather_fas, pulls out the ambiguous
    sequences and schedules one minimap_map job per event against a
    minimap index of the reference.

    Returns ``(paf_ids, ambiguous_seq_id_map)`` where ``paf_ids`` is a list
    of the mapping jobs' results and ``ambiguous_seq_id_map`` maps each
    event to ``(fasta path, ambiguous fasta id)``.  Returns ``(None, None)``
    when ``options.reference`` is unset or when ``output_id_map`` has no
    entry for the ambiguous name.
    """
    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None

    # Check for ambiguous sequences *before* touching the job graph, so that
    # no index job is scheduled when there is nothing to map.
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                 "ambiguousName", default="_AMBIGUOUS_")
    if amb_name not in output_id_map:
        logger.info("Skipping minmap2 fallback as no ambigious sequences found")
        return None, None

    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if ref_path.endswith('.gz'):
        # the memory request is scaled up 4x for a gzipped reference
        mm_mem *= 4

    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id,
                                     disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        event_fa_path = seqIDMap[event][0]
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id,
                                                event_fa_path,
                                                disk=ref_id.size * 3, memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (event_fa_path, fa_id)

    return paf_ids, ambiguous_seq_id_map
def run_all_bam_caller(job, context, fasta_file_id, bam_file_id, bam_idx_id, sample_name,
                       chroms, offsets, out_name, bam_caller, bam_caller_opts=None):
    """
    run freebayes or platypus on a set of chromosomal regions.  this is done by sending
    each region to a child job and farming off the entire input to each (ie not
    splitting the input)

    chroms must be non-empty.  offsets may be empty/None, in which case None is used
    as the offset for every chromosome.  bam_caller_opts defaults to an empty list.

    Returns the result of the run_concat_vcfs follow-on job.
    """
    # A fresh list per call: a literal [] default would be shared by every invocation
    if bam_caller_opts is None:
        bam_caller_opts = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    fb_vcf_ids = []
    fb_tbi_ids = []
    fb_timers = []
    assert chroms
    if not offsets:
        offsets = [None] * len(chroms)

    for chrom, offset in zip(chroms, offsets):
        fb_job = child_job.addChildJobFn(run_bam_caller, context, fasta_file_id, bam_file_id,
                                         bam_idx_id, sample_name, chrom, offset, out_name,
                                         bam_caller, bam_caller_opts,
                                         memory=context.config.calling_mem,
                                         disk=context.config.calling_disk)
        fb_vcf_ids.append(fb_job.rv(0))
        fb_tbi_ids.append(fb_job.rv(1))
        # each region contributes a one-element timer list
        fb_timers.append([fb_job.rv(2)])

    merge_vcf_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name,
                                               fb_vcf_ids, fb_tbi_ids,
                                               write_to_outstore=True,
                                               call_timers_lists=fb_timers)
    return merge_vcf_job.rv()
def testJobConcurrency(self):
    """
    Check core allocation for concurrent tasks: for each cores-per-job value,
    the peak concurrency recorded in the counter file must equal
    cpuCount divided by cores-per-job.
    """
    for coresPerJob in self.allocatedCores:
        workDir = self._createTempDir('testFiles')
        options = self.getOptions(workDir)
        counterPath = os.path.join(workDir, 'counter')

        # start from a clean counter file
        resetCounters(counterPath)
        current, peak = getCounters(counterPath)
        assert (current, peak) == (0, 0)

        root = Job()
        for _ in range(self.cpuCount):
            measuring = Job.wrapFn(measureConcurrency, counterPath, self.sleepTime,
                                   cores=coresPerJob, memory='1M', disk='1Mi')
            root.addFollowOn(measuring)
        Job.Runner.startToil(root, options)

        _, peak = getCounters(counterPath)
        expected = old_div(self.cpuCount, coresPerJob)
        self.assertEqual(peak, expected)
def testPromisedRequirementStatic(self):
    """
    Check that promised core requirements are honoured in a static DAG:
    the peak concurrency must equal cpuCount / coresPerJob.
    """
    for coresPerJob in self.allocatedCores:
        workDir = self._createTempDir('testFiles')
        counterPath = self.getCounterPath(workDir)

        root = Job()
        # two small children whose return values feed the promised requirements
        one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
        thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M')
        root.addChild(one)
        root.addChild(thirtyTwoMb)

        for _ in range(self.cpuCount):
            promisedCores = PromisedRequirement(lambda x: x * coresPerJob, one.rv())
            promisedMemory = PromisedRequirement(thirtyTwoMb.rv())
            root.addFollowOn(Job.wrapFn(batchSystemTest.measureConcurrency, counterPath,
                                        cores=promisedCores,
                                        memory=promisedMemory,
                                        disk='1M'))

        Job.Runner.startToil(root, self.getOptions(workDir))
        _, peak = batchSystemTest.getCounters(counterPath)
        self.assertEqual(peak, self.cpuCount / coresPerJob)
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """ top-level job: schedule one minigraph_map_one job per event in fa_id_map,
    then a merge_gafs_into_paf follow-on over all the results.  returns the
    result of that merge job
    """
    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_results = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(event, fa_id, gfa_id))
        map_job = top_job.addChildJobFn(minigraph_map_one, config, event, fa_id, gfa_id,
                                        cores=1,
                                        disk=5 * (fa_id.size + gfa_id.size))
        gaf_results.append(map_job.rv())

    # convert to paf
    return top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_results).rv()
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved, xg_file_id, paths):
    """ optionally split the input GAM into chunks (skipped when
    context.config.single_reads_chunk is set), then hand the chunks to
    run_whole_surject.  returns the result of that follow-on job
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    misc_resources = dict(cores=context.config.misc_cores,
                          memory=context.config.misc_mem,
                          disk=context.config.misc_disk)

    if context.config.single_reads_chunk:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        # same list-of-lists shape as the splitter's output, with a single chunk
        reads_chunk_ids = [[gam_input_reads_id]]
    else:
        split_job = child_job.addChildJobFn(run_split_reads, context, None, 'aln.gam', None,
                                            [gam_input_reads_id], **misc_resources)
        reads_chunk_ids = split_job.rv()

    surject_job = child_job.addFollowOnJobFn(run_whole_surject, context, reads_chunk_ids,
                                             output_name, interleaved, xg_file_id, paths,
                                             **misc_resources)
    return surject_job.rv()
def testNewJobsCanHandleOtherJobDeaths(self):
    """
    Create two empty non-local files and a workflow root -> (A, B), B -> C,
    where A, B and C wrap the _testNewJobsCanHandleOtherJobDeaths_{A,B,C}
    functions and all receive the list of both file paths.  C is
    additionally passed expectedResult=False.

    Retries are disabled, and a FailedJobsException from the workflow run
    is tolerated.
    """
    # There can be no retries
    self.options.retryCount = 0

    nonLocalDir = self._createTempDir(purpose='nonLocalDir')
    files = [os.path.join(nonLocalDir, str(uuid4())) for _ in range(2)]
    for path in files:
        # create the file empty
        with open(path, 'w'):
            pass
    for path in files:
        assert os.path.exists(path)

    root = Job()
    A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, files=files)
    B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B, files=files)
    C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C, files=files,
                      expectedResult=False)
    root.addChild(A)
    root.addChild(B)
    B.addChild(C)

    try:
        Job.Runner.startToil(root, self.options)
    except FailedJobsException:
        # a failed workflow is acceptable here, so the exception is swallowed
        pass
def testConcurrencyWithDisk(self):
    """
    Check disk allocation: two jobs that together ask for more than the
    available disk must never be observed running at the same time.
    """
    workDir = self._createTempDir('testFiles')
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = workDir
    from toil import physicalDisk
    availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
    options.batchSystem = self.batchSystemName

    counterPath = os.path.join(workDir, 'counter')
    resetCounters(counterPath)
    current, peak = getCounters(counterPath)
    assert (current, peak) == (0, 0)

    root = Job()
    # Physically, we're asking for 50% of disk and 50% of disk + 500bytes in the two jobs. The
    # batchsystem should not allow the 2 child jobs to run concurrently.
    halfDisk = old_div(availableDisk, 2)
    for diskRequest in (halfDisk, halfDisk + 500):
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime,
                                 cores=1, memory='1M', disk=diskRequest))

    Job.Runner.startToil(root, options)
    _, peak = getCounters(counterPath)
    self.assertEqual(peak, 1)
def makeWorkflow():
    """Build a job with three ToySerializableService services and one fnTest child.

    The child receives the list of values returned by addService, followed
    by ``outFile`` (taken from the enclosing scope).
    """
    job = Job()
    serviceResults = [job.addService(ToySerializableService(label))
                      for label in ("woot1", "woot2", "woot3")]
    job.addChildFn(fnTest, serviceResults, outFile)
    return job
def addJob(self, jobShape, preemptable=False):
    """
    Record a new job built from jobShape's memory/cores/disk under a fresh
    UUID and put that UUID on the job queue.
    """
    self.totalJobs += 1
    jobID = uuid.uuid4()
    issued = Job(memory=jobShape.memory,
                 cores=jobShape.cores,
                 disk=jobShape.disk,
                 preemptable=preemptable)
    self.jobBatchSystemIDToIssuedJob[jobID] = issued
    self.jobQueue.put(jobID)
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel.

    fa_id_map maps event -> (fasta path, fasta id).  returns a pair
    (paf result, gaf result); the gaf result is None unless keep_gaf is set,
    in which case it is the result of a compress_gafs follow-on
    """
    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    graphmap_node = findRequiredNode(config.xmlRoot, "graphmap")
    requested_cores = getOptionalAttrib(graphmap_node, "cpu", typeFn=int, default=1)
    # never ask for more cores than this machine reports
    mg_cores = min(requested_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    paf_per_genome = not getOptionalAttrib(graphmap_node, "universalMZFilter", float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}
    for event, entry in fa_id_map.items():
        fa_path, fa_id = entry[0], entry[1]
        map_job = top_job.addChildJobFn(minigraph_map_one, config, event, fa_path, fa_id, gfa_id,
                                        keep_gaf or not paf_per_genome, paf_per_genome,
                                        # todo: estimate RAM
                                        cores=mg_cores,
                                        disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = map_job.rv(0)
        paf_id_map[event] = map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_id_map)

    if keep_gaf:
        gaf_result = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()
    else:
        gaf_result = None

    return paf_job.rv(), gaf_result
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path,
                            ref_contigs, other_contig):
    """Chain the contig-splitting stages, including the minimap2 fallback pass.

    Stages, each a follow-on of the previous one: split_gfa -> split_fas ->
    gather_fas -> split_minimap_fallback -> split_gfa (fallback) ->
    split_fas (fallback) -> gather_fas (fallback) -> combine_splits.
    Gzipped gfa/paf inputs (detected by file extension) are unzipped first,
    and a mask bed is computed first when options.maskFilter is set.

    Returns (seqIDMap, combine_splits result, split log of the first
    split_gfa, split log of the fallback split_gfa).
    """
    root_job = Job()
    job.addChild(root_job)

    # get the sizes before the ids are replaced by unzip results below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input; the size estimate is
    # scaled by the same factor of 10 that is used for the unzip job's disk
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_size * 10).rv()
        paf_size *= 10

    mask_bed_id = (root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()
                   if options.maskFilter else None)

    split_disk = (gfa_size + paf_size) * 5

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, [paf_id], ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=split_disk)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv(0))

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(0),
                                                    split_fas_job.rv())

    # try splitting the ambiguous sequences using minimap2, which is more sensitive in some cases
    remap_job = gather_fas_job.addFollowOnJobFn(split_minimap_fallback, options, config, seqIDMap,
                                                gather_fas_job.rv())

    # partition these into fasta files
    fallback_split_gfa_job = remap_job.addFollowOnJobFn(split_gfa, config, None, remap_job.rv(0),
                                                        ref_contigs, other_contig,
                                                        options.reference, None,
                                                        disk=split_disk)

    # use the output of the above to split the ambiguous fastas
    fallback_split_fas_job = fallback_split_gfa_job.addFollowOnJobFn(
        split_fas, remap_job.rv(1), fallback_split_gfa_job.rv(0))

    # gather the fallback contigs into a table
    fallback_gather_job = fallback_split_fas_job.addFollowOnJobFn(
        gather_fas, remap_job.rv(1), fallback_split_gfa_job.rv(0), fallback_split_fas_job.rv())

    # combine the split sequences with the split ambiguous sequences
    combine_job = fallback_gather_job.addFollowOnJobFn(combine_splits, options, config, seqIDMap,
                                                       gather_fas_job.rv(),
                                                       fallback_gather_job.rv())

    # return all the files, as well as the 2 split logs
    return (seqIDMap, combine_job.rv(), split_gfa_job.rv(1), fallback_split_gfa_job.rv(1))
def run_analysis(job, context, ped_file_id, cohort_vcf_id, maternal_bam_id, maternal_bai_id,
                 paternal_bam_id, paternal_bai_id, sibling_bam_ids, sibling_bai_ids, sample_name,
                 maternal_name, paternal_name, sibling_names, sibling_genders, sibling_affected,
                 bypass, cadd_lines, chrom_dir, edit_dir, split_lines, genome_build,
                 cadd_data_dir):
    """ schedule two sibling jobs under one encapsulating job: run_detect_mosaicism
    on the cohort vcf, and run_vcftoshebang on the cohort vcf plus the family bams.

    returns a pair (run_vcftoshebang result, run_detect_mosaicism result)
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    config = context.config

    mosaicism_job = child_job.addChildJobFn(run_detect_mosaicism, context, ped_file_id,
                                            cohort_vcf_id, sample_name,
                                            cores=config.misc_cores,
                                            memory=config.alignment_mem,
                                            disk=config.misc_disk)

    shebang_job = child_job.addChildJobFn(run_vcftoshebang, context, cohort_vcf_id,
                                          maternal_bam_id, maternal_bai_id,
                                          paternal_bam_id, paternal_bai_id,
                                          sibling_bam_ids, sibling_bai_ids,
                                          sample_name, maternal_name, paternal_name,
                                          sibling_names, sibling_genders, sibling_affected,
                                          bypass, cadd_lines, chrom_dir, edit_dir, split_lines,
                                          genome_build, cadd_data_dir,
                                          cores=config.alignment_cores,
                                          memory=config.alignment_mem,
                                          disk=config.alignment_disk)

    return shebang_job.rv(), mosaicism_job.rv()
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path,
                            ref_contigs, other_contig):
    """Chain split_gfa -> split_fas -> gather_fas and return the gather result.

    Gzipped gfa/paf inputs (detected by file extension) are unzipped first,
    and a mask bed is computed first when options.maskFilter is set.
    """
    root_job = Job()
    job.addChild(root_job)

    # get the sizes before the ids are replaced by unzip results below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input; the size estimate is
    # scaled by the same factor of 10 that is used for the unzip job's disk
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_size * 10).rv()
        paf_size *= 10

    mask_bed_id = (root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()
                   if options.maskFilter else None)

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, paf_id, ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv())

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(),
                                                    split_fas_job.rv())

    # return all the files
    return gather_fas_job.rv()
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, checkpointInfo,
                     doRenaming, pafInput, pafSecondaries, doVG, doGFA, delay=0,
                     eventNameAsID=False):
    """Schedule the align workflow: id prepending, setup, hal export, optional vg/gfa export.

    Sleeps for ``delay`` seconds before scheduling anything.  When
    ``pafInput`` is set the alignments are first converted with
    paf_to_lastz (secondary alignments are only kept if ``pafSecondaries``).
    The ingroup coverage is recomputed when the incoming arguments carry no
    ingroupCoverageIDs.

    Returns (hal export result, vg file id, gfa file id); the last two are
    None unless doVG or doGFA is set.
    """
    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs at the same time
    time.sleep(delay)

    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        convert_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz,
                                             cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = convert_job.rv(0)
        if pafSecondaries:
            cactusWorkflowArguments.secondaryAlignmentsID = convert_job.rv(1)
        else:
            cactusWorkflowArguments.secondaryAlignmentsID = None

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project,
                                        doRenaming, eventNameAsID,
                                        # todo disk=
                                        )
    # must be read before cactusWorkflowArguments is replaced by the job result below
    needs_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if needs_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_job.addFollowOnJobFn(exportHal, prepare_job.rv(0),
                                                  event=prepare_job.rv(1),
                                                  checkpointInfo=checkpointInfo,
                                                  memory=configWrapper.getDefaultMemory(),
                                                  disk=configWrapper.getExportHalDisk(),
                                                  preemptable=False)

    # optionally create the VG
    vg_file_id, gfa_file_id = None, None
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(export_vg, hal_export_job.rv(),
                                                        configWrapper, doVG, doGFA,
                                                        checkpointInfo=checkpointInfo)
        vg_file_id = vg_export_job.rv(0)
        gfa_file_id = vg_export_job.rv(1)

    return hal_export_job.rv(), vg_file_id, gfa_file_id
def testServiceSerialization(self):
    """
    Run a job with a service and a child service whose constructor argument
    is the value returned by adding the parent service, to check that this
    does not produce a serialization error.
    """
    job = Job()

    # Add a first service to job
    parentService = ToySerializableService("woot")
    parentStartValue = job.addService(parentService)

    # Now create a child of that service that takes the start value promise
    # from the parent service
    childService = ToySerializableService(parentStartValue)
    job.addService(childService, parentService=parentService)

    # This should work if serialization on services is working correctly.
    self.runToil(job)
def testConcurrencyWithDisk(self):
    """
    Check disk allocation: two jobs that together ask for more than the
    available disk must never be observed running at the same time.
    """
    workDir = self._createTempDir('testFiles')
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = workDir
    from toil import physicalDisk
    availableDisk = physicalDisk(options.workDir)
    logger.info('Testing disk concurrency limits with %s disk space', availableDisk)
    # More disk might become available by the time Toil starts, so we limit it here
    options.maxDisk = availableDisk
    options.batchSystem = self.batchSystemName

    counterPath = os.path.join(workDir, 'counter')
    resetCounters(counterPath)
    current, peak = getCounters(counterPath)
    assert (current, peak) == (0, 0)

    half_disk = availableDisk // 2
    more_than_half_disk = half_disk + 500
    logger.info('Dividing into parts of %s and %s', half_disk, more_than_half_disk)

    root = Job()
    # Physically, we're asking for 50% of disk and 50% of disk + 500bytes in the two jobs. The
    # batchsystem should not allow the 2 child jobs to run concurrently.
    for diskRequest in (half_disk, more_than_half_disk):
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime,
                                 cores=1, memory='1M', disk=diskRequest))

    Job.Runner.startToil(root, options)
    _, peak = getCounters(counterPath)
    logger.info('After run: %s disk space', physicalDisk(options.workDir))
    self.assertEqual(peak, 1)
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved, xg_file_id, paths):
    """
    Surject all gam chunks in parallel.

    reads_chunk_ids is a list of lists; it is transposed with zip() so that
    one run_chunk_surject job is scheduled per chunk, named
    "<output_name>_chunk<i>".  A run_merge_bams follow-on then merges the
    per-chunk BAM results under output_name, and its result is returned.

    interleaved, xg_file_id and paths are passed through to each chunk job.
    """
    RealtimeLogger.info("Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    bam_chunk_file_ids = []
    # collected for parity with the chunk job's second return value; not consumed below
    bam_chunk_running_times = []

    for chunk_index, chunk_file_ids in enumerate(zip(*reads_chunk_ids)):
        # Run graph surject on each gam chunk
        chunk_name = '{}_chunk{}'.format(output_name, chunk_index)
        surject_job = child_job.addChildJobFn(run_chunk_surject, context, interleaved,
                                              xg_file_id, paths, chunk_file_ids, chunk_name,
                                              cores=context.config.alignment_cores,
                                              memory=context.config.alignment_mem,
                                              disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(surject_job.rv(0))
        bam_chunk_running_times.append(surject_job.rv(1))

    merge_job = child_job.addFollowOnJobFn(run_merge_bams, context, output_name,
                                           bam_chunk_file_ids,
                                           cores=context.config.misc_cores,
                                           memory=context.config.misc_mem,
                                           disk=context.config.misc_disk)
    return merge_job.rv()
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, doRenaming, pafInput):
    """Schedule the align workflow: id prepending, setup and hal export.

    When ``pafInput`` is set the alignments are first converted with
    paf_to_lastz into primary and secondary alignment ids.  The ingroup
    coverage is recomputed when the incoming arguments carry no
    ingroupCoverageIDs.  Returns the result of the exportHal job.
    """
    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        convert_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz,
                                             cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = convert_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = convert_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project,
                                        doRenaming
                                        # todo disk=
                                        )
    # must be read before cactusWorkflowArguments is replaced by the job result below
    needs_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if needs_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_job.addFollowOnJobFn(exportHal, prepare_job.rv(0),
                                                  event=prepare_job.rv(1),
                                                  memory=configWrapper.getDefaultMemory(),
                                                  disk=configWrapper.getExportHalDisk(),
                                                  preemptable=False)
    return hal_export_job.rv()
def createJobGraph(memory, cores, disk, preemptable, checkpoint):
    """Create a fake-ish Job and JobGraph pair, and return the jobGraph.

    ``self`` is taken from the enclosing scope: its jobGraphNumber counter
    names the graph (and is incremented), and its jobStore stores both the
    pickled Job and the resulting JobGraph.
    """
    graphName = 'jobGraph%d' % self.jobGraphNumber
    self.jobGraphNumber += 1

    fakeJob = Job()
    fakeJob.checkpoint = checkpoint
    with self.jobStore.writeFileStream() as (stream, fileStoreID):
        pickle.dump(fakeJob, stream, pickle.HIGHEST_PROTOCOL)

    # the command embeds the id of the file holding the pickled job
    graphArgs = dict(command='_toil %s fooCommand toil True' % fileStoreID,
                     memory=memory,
                     cores=cores,
                     disk=disk,
                     unitName=graphName,
                     jobName=graphName,
                     preemptable=preemptable,
                     jobStoreID=graphName,
                     remainingRetryCount=1,
                     predecessorNumber=1)
    return self.jobStore.create(JobGraph(**graphArgs))
def split_fas(job, seq_id_map, split_id_map):
    """ Schedule one split_fa_into_contigs job per event in seq_id_map (all in
    parallel), using the output of rgfa-split (split_id_map) as a guide.

    Returns a dict mapping event name to that event's job result,
    ex fa_contigs["CHM13"]["chr13"] = file_id """
    root_job = Job()
    job.addChild(root_job)

    # we do each fasta in parallel
    return {
        event: root_job.addChildJobFn(split_fa_into_contigs, event, fa_id, fa_path,
                                      split_id_map, disk=fa_id.size * 3).rv()
        for event, (fa_path, fa_id) in seq_id_map.items()
    }
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined
    by minigraph, the second from remapping the ambiguous contigs with minimap2.

    original_id_map is updated in place: the ambiguous entry is overwritten with the
    remapped one (or None if there is none), and every other contig present in both
    maps is replaced by the result of a combine_ref_contig_splits job.

    returns original_id_map unchanged when remap_id_map is None or empty; otherwise the
    result of a combine_paf_splits follow-on job.
    """
    # no ambiguous remappings, nothing to do.  this is checked before touching the job
    # graph so that no empty placeholder job gets scheduled
    if not remap_id_map:
        return original_id_map

    root_job = Job()
    job.addChild(root_job)

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                 "ambiguousName", default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                    "assemblyName", default="_MINIGRAPH_")

    # note: we're not handling case where 100% of a given reference contigs are ambiguous
    # (only values are reassigned in the loop, never keys, so iterating the dict is safe)
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            original_id_map[ref_contig] = remap_id_map.get(ref_contig)
        elif ref_contig in remap_id_map:
            original_fas = original_id_map[ref_contig]['fa']
            remap_fas = remap_id_map[ref_contig]['fa']
            # disk estimate: every original fasta plus its remapped counterpart, if any
            total_size = 0
            for event, fa_id in original_fas.items():
                total_size += fa_id.size
                if event in remap_fas:
                    total_size += remap_fas[event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(
                combine_ref_contig_splits,
                original_id_map[ref_contig],
                remap_id_map[ref_contig],
                disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map, original_id_map,
                                     remap_id_map, amb_name, graph_event).rv()
def preprocess_input_sequences(job, configWrapper, project, cactusWorkflowArguments,
                               pafMaskFilter=None, referenceEvent=None):
    """ schedule a preprocess_input_sequence job for every ingroup genome whose input path
    ends in .gz and/or which is subject to the pafMaskFilter (the graph event and
    referenceEvent are never filtered).

    returns a pair: the result of a resolve_id_promises follow-on (given the events and
    updated sequence ids of the gzipped inputs plus the workflow arguments), and a dict
    mapping each filtered genome to its mask bed id.
    """
    head_job = Job()
    job.addChild(head_job)

    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"),
                                    "assemblyName", default="_MINIGRAPH_")

    exp = cactusWorkflowArguments.experimentWrapper
    ingroups_with_ids = [(genome, exp.getSequenceID(genome))
                         for genome in exp.getGenomesWithSequence()
                         if genome not in exp.getOutgroupGenomes()]

    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for genome, seq_id in ingroups_with_ids:
        seq_path = project.inputSequenceMap[genome]
        is_zipped = seq_path.endswith('.gz')
        wants_filter = bool(pafMaskFilter) and genome not in (graph_event, referenceEvent)
        if not (is_zipped or wants_filter):
            continue

        preprocess_job = head_job.addChildJobFn(preprocess_input_sequence, genome, seq_id,
                                                seq_path, pafMaskFilter)
        updated_seq_id = preprocess_job.rv(0)
        mask_bed_id = preprocess_job.rv(1)
        if is_zipped:
            events.append(genome)
            updated_seq_ids.append(updated_seq_id)
        if wants_filter:
            mask_bed_ids[genome] = mask_bed_id

    resolve_job = head_job.addFollowOnJobFn(resolve_id_promises, events, updated_seq_ids,
                                            cactusWorkflowArguments)
    return resolve_job.rv(), mask_bed_ids
def testNestedResourcesDoNotBlock(self):
    """
    Resources are requested in the order Memory > Cpu > Disk.
    Test that inavailability of cpus for one job that is scheduled does not block another job
    that can run.

    Every job appends its writeVal to a shared file; the test asserts on the
    order in which those values appear.
    """
    tempDir = self._createTempDir('testFiles')
    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    options.maxCores = 4
    from toil import physicalMemory
    availableMemory = physicalMemory()
    options.batchSystem = self.batchSystemName

    outFile = os.path.join(tempDir, 'counter')
    open(outFile, 'w').close()

    root = Job()

    blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                         cores=2, memory='1M', disk='1M')
    firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                          cores=1, memory='1M', disk='1M')
    secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                           writeVal='sJ', cores=1, memory='1M', disk='1M')

    # Should block off 50% of memory while waiting for it's 3 cores
    firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                               writeVal='fJC', cores=3, memory=int(old_div(availableMemory, 2)),
                               disk='1M')

    # These two shouldn't be able to run before B because there should be only
    # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
    secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                writeVal='sJC', cores=2,
                                memory=int(old_div(availableMemory, 1.5)), disk='1M')
    secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                     writeVal='sJGC', cores=2,
                                     memory=int(old_div(availableMemory, 1.5)), disk='1M')

    root.addChild(blocker)
    root.addChild(firstJob)
    root.addChild(secondJob)

    firstJob.addChild(firstJobChild)
    secondJob.addChild(secondJobChild)

    secondJobChild.addChild(secondJobGrandChild)

    # The tree is:
    #
    #            root
    #          /  |  \
    #         b  fJ  sJ
    #             |   |
    #           fJC  sJC
    #                 |
    #               sJGC
    #
    # But the order of execution should be
    #     root > b, fJ, sJ > sJC > sJGC > fJC
    # since fJC cannot run till b finishes but sJC and sJGC can (fJC blocked by disk).
    # If the resource acquisition is written properly, then fJC which is scheduled
    # before sJC and sJGC should not block them, and should only run after they finish.
    #
    # NOTE(review): the wording "blocked by disk" is kept from the original note, but
    # every job here asks for disk='1M' while fJC asks for 3 of the 4 cores -- it
    # looks like fJC is really waiting on cores; confirm.
    Job.Runner.startToil(root, options)
    with open(outFile) as oFH:
        outString = oFH.read()
    # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
    # time. We look for all possible permutations.
    possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
    assert outString.startswith(possibleStarts)
    assert outString.endswith('sJCsJGCfJC')
def run_chunked_augmenting(job, context,
                           graph_id,
                           graph_basename,
                           gam_id,
                           gam_basename,
                           batch_input=None,
                           all_path_components=False,
                           chunk_paths=[],
                           connected_component_chunking=False,
                           output_format=None,
                           augment_gam=False,
                           min_augment_coverage=None,
                           expected_coverage=None,
                           min_mapq=None,
                           min_baseq=None,
                           to_outstore=False):
    """
    Run a chunking job (if desired), then augment the results.

    When ``batch_input`` is None this is the top-level call: either a
    ``run_chunking`` child is scheduled and this function is re-scheduled as
    its follow-on with the chunker's promised output as ``batch_input`` (in
    which case the follow-on's promise is returned), or a one-entry
    ``batch_input`` named ``'all'`` is synthesised from the given graph/GAM.

    With a ``batch_input`` mapping of chunk name -> sequence
    ``[graph_id, graph_basename, gam_id, gam_basename]`` (the GAM entries may
    be absent), one ``run_augmenting`` child job is scheduled per chunk.

    Returns a list of ``(chunk_name, promise)`` pairs, one per augment job,
    or the promise of that list when chunking was scheduled.

    Note: the ``chunk_paths`` default list is only read here, never mutated.
    """
    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if all_path_components or connected_component_chunking or len(chunk_paths) > 1:
            child_job = Job()
            job.addChild(child_job)
            chunk_job = child_job.addChildJobFn(
                run_chunking, context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks: the promise is only resolved once it is
            # passed as an argument to a job that runs after the chunker
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_augmenting, context,
                graph_id=None,
                graph_basename=None,
                gam_id=None,
                gam_basename=None,
                batch_input=batch_input,
                all_path_components=all_path_components,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                augment_gam=augment_gam,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                to_outstore=to_outstore)
            return recurse_job.rv()
        else:
            # phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the augmenting on each chunk
    assert batch_input
    augment_results = []
    for chunk_name, chunk_results in batch_input.items():
        # A chunk without a GAM only carries the two graph entries; reading
        # index 2/3 unconditionally used to raise IndexError in that case.
        chunk_gam_id = chunk_results[2] if len(chunk_results) > 2 else None
        chunk_gam_basename = chunk_results[3] if len(chunk_results) > 3 else None
        augment_job = job.addChildJobFn(
            run_augmenting, context,
            graph_id=chunk_results[0],
            graph_basename=chunk_results[1],
            gam_id=chunk_gam_id,
            gam_basename=chunk_gam_basename,
            augment_gam=augment_gam,
            min_augment_coverage=min_augment_coverage,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            min_baseq=min_baseq,
            to_outstore=to_outstore,
            cores=context.config.augment_cores,
            memory=context.config.augment_mem,
            disk=context.config.augment_disk)
        augment_results.append((chunk_name, augment_job.rv()))

    return augment_results
def run_all_calling2(job, context, xg_file_id, chr_gam_ids, chr_gam_idx_ids, chroms, path_sizes,
                     vcf_offsets, sample_name, genotype=False, out_name=None, recall=False,
                     alt_gam_id=None, alt_gai_id=None, genotype_vcf_id=None, genotype_tbi_id=None,
                     id_ranges_id=None, snarls_id=None, pack_support=False, old_call=False):
    """
    Call all the chromosomes and return a merged up vcf/tbi pair.

    For every GAM in ``chr_gam_ids`` a ``run_chunking`` job is scheduled,
    followed by a ``run_chunked_calling`` job.  Once all of them are done a
    ``run_concat_vcfs`` follow-on merges the per-GAM results; its promise is
    the return value.
    """
    # we make a child job so that all calling is encapsulated in a top-level job
    child_job = Job()
    job.addChild(child_job)

    vcf_ids, tbi_ids, call_timers_lists = [], [], []

    assert len(chr_gam_ids) > 0
    if not chr_gam_idx_ids:
        # no indexes supplied: one placeholder per GAM
        chr_gam_idx_ids = [None] * len(chr_gam_ids)
    if not chroms:
        # default to every path with a positive size
        chroms = [name for name, size in path_sizes.items() if size > 0]
    assert len(chr_gam_ids) == len(chr_gam_idx_ids)

    # id ranges deactivates path chunking
    if id_ranges_id:
        context.config.call_chunk_size = (2 << 30) - 1
        context.config.overlap = 0

    # more than one GAM means one GAM per chromosome
    one_gam_per_chrom = len(chr_gam_ids) > 1

    for i, alignment_file_id in enumerate(chr_gam_ids):
        alignment_index_id = chr_gam_idx_ids[i]
        if one_gam_per_chrom:
            chr_label = [chroms[i]]
            if vcf_offsets:
                chr_offset = [vcf_offsets[i]]
            else:
                chr_offset = [0]
        else:
            # single gam with one or more chromosomes
            chr_label = chroms
            if vcf_offsets:
                chr_offset = vcf_offsets
            else:
                chr_offset = [0] * len(chroms)

        chunk_job = child_job.addChildJobFn(
            run_chunking, context, xg_file_id,
            alignment_file_id, alignment_index_id, chr_label, chr_offset, path_sizes,
            sample_name, genotype=genotype, recall=recall,
            alt_gam_id=alt_gam_id, alt_gai_id=alt_gai_id,
            genotype_vcf_id=genotype_vcf_id,
            genotype_tbi_id=genotype_tbi_id,
            id_ranges_id=id_ranges_id,
            cores=context.config.call_chunk_cores,
            memory=context.config.call_chunk_mem,
            disk=context.config.call_chunk_disk)

        # the chunker's two return values feed the calling job as promises
        chunk_infos = chunk_job.rv(0)
        chunk_timers = chunk_job.rv(1)
        call_job = chunk_job.addFollowOnJobFn(
            run_chunked_calling, context, chunk_infos,
            genotype, recall, snarls_id, pack_support, old_call, chunk_timers,
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk)

        vcf_ids.append(call_job.rv(0))
        tbi_ids.append(call_job.rv(1))
        call_timers_lists.append(call_job.rv(2))

    # fall back to the sample name when no output name was given
    out_name = out_name or sample_name
    concat_job = child_job.addFollowOnJobFn(
        run_concat_vcfs, context, out_name, vcf_ids, tbi_ids,
        write_to_outstore=True,
        call_timers_lists=call_timers_lists,
        cores=context.config.call_chunk_cores,
        memory=context.config.call_chunk_mem,
        disk=context.config.call_chunk_disk)
    return concat_job.rv()
def run_whole_alignment(job, context, fastq, gam_input_reads, bam_input_reads, sample_name,
                        interleaved, mapper, indexes, reads_chunk_ids,
                        bam_output=False, surject=False, gbwt_penalty=None, validate=False,
                        fasta_dict_id=None):
    """
    Align all read chunks in parallel.

    ``indexes`` is a dict from index type to index file ID; this function
    itself reads the 'id_ranges', 'xg-surject' and 'xg' entries and hands the
    whole dict to each chunk alignment job.

    Returns a 3-tuple ``(gam_chrom_ids, gam_chunk_time, bam_chrom_ids)``:

    * without ``bam_output``: the two promises of the GAM merge job and an
      empty BAM list;
    * with ``bam_output``: an empty GAM list, ``None`` for the time, and the
      promise of the split-BAM job;
    * with ``surject``: the BAM entry is replaced by a one-element list
      holding the promise of the surject job.
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    # gam_chunk_file_ids[i] belongs to the gam chunk generated from read
    # shard i; bam_chunk_file_ids is filled instead when bam_output is set
    gam_chunk_file_ids = []
    bam_chunk_file_ids = []
    gam_chunk_running_times = []
    chunk_output_ids = bam_chunk_file_ids if bam_output else gam_chunk_file_ids

    # run graph alignment on each read chunk
    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        chunk_alignment_job = child_job.addChildJobFn(
            run_chunk_alignment, context, gam_input_reads, bam_input_reads,
            sample_name,
            interleaved, mapper, chunk_filename_ids, chunk_id,
            indexes,
            bam_output=bam_output,
            gbwt_penalty=gbwt_penalty,
            validate=validate,
            fasta_dict_id=fasta_dict_id,
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        chunk_output_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))

    if bam_output:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(
            run_merge_bams, context, sample_name, bam_chunk_file_ids,
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk)
        split_bams_job = merge_bams_job.addFollowOnJobFn(
            split_bam_into_chroms, context, indexes.get('id_ranges'),
            merge_bams_job.rv(),
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chrom_ids = split_bams_job.rv()
    else:
        merge_gams_job = child_job.addFollowOnJobFn(
            run_merge_gams, context, sample_name, indexes.get('id_ranges'),
            gam_chunk_file_ids,
            gam_chunk_running_times,
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []

    if surject:
        # two fastq files are treated as paired, i.e. interleaved, input
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context, gam_chunk_file_ids)
        # prefer the dedicated surjection xg when one was provided
        if 'xg-surject' in indexes:
            xg_id = indexes['xg-surject']
        else:
            xg_id = indexes['xg']
        surject_job = zip_job.addFollowOnJobFn(
            run_whole_surject, context, zip_job.rv(), sample_name + '-surject',
            interleaved_surject, xg_id, [])
        bam_chrom_ids = [surject_job.rv()]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
def get_plan(options, project, inSeqFile, outSeqFile, toil):
    """
    Build the textual execution plan and, in toil mode, run it.

    Depending on ``options.wdl`` / ``options.toil`` the plan is emitted as a
    WDL workflow, scheduled as a graph of Toil jobs (started through
    ``toil.start`` before returning), or written out as plain shell commands.
    The stages are: preprocessing of the leaves, alignment in rounds of
    mutually independent events, and HAL merging.

    Returns the plan text.  When ``options.preprocessOnly`` is set, the plan
    is returned right after the preprocessing section (followed by a single
    ``cactus`` command) and nothing else is scheduled.  Calls ``sys.exit(1)``
    if the schedule cannot be resolved into rounds.
    """
    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}

    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            # NOTE(review): only leaves[i] gets a job here, not the rest of
            # pre_batch -- confirm toil mode always runs with a batch size of 1.
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(
                toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                cores=options.preprocessCores,
                memory=options.preprocessMemory,
                disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule up the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps[fo] = event

    def get_deps(event):
        """Return the set of event names that must be resolved before event."""
        deps = set(schedule.deps(event))
        if event in follow_on_deps:
            # the stored value is a single event name; set(name) would split
            # it into individual characters, so add it as one element
            deps.add(follow_on_deps[event])
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            has_name = outSeqFile.tree.getNodeId(event) is not None
        except Exception:
            # events unknown to the tree are treated as having no children
            has_name = False
        if has_name:
            for node in outSeqFile.tree.getChildren(outSeqFile.tree.getNodeId(event)):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while len(batch) > 0:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds.  where all jobs of round i can be run in parallel
    groups = []
    while len(events_and_virtuals) > 0:
        group = []
        to_remove = []
        added = 0
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                # virtual events are resolved but never emitted into a round
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
                added += 1
        if added == 0:
            # nothing could be resolved this pass: the dependencies are cyclic
            # or refer to something that never resolves
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        """Return the HAL output path of event (the root uses options.outHal)."""
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')

    def cigarPath(event):
        """Return the cigar output path of event."""
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event),
                                       halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments
                # to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = ([job_idx[("preprocess", dep)].rv() for dep in leaf_deps]
                               + [job_idx[("align", dep)].rv(0) for dep in anc_deps])
                # NOTE(review): the blast job requests options.preprocessDisk --
                # confirm this is intended rather than a blast-specific setting.
                job_idx[("blast", event)] = parent_job.addChildJobFn(
                    toil_call_blast, options, outSeqFile, project, event,
                    cigarPath(event), leaf_deps + anc_deps, *fa_promises,
                    cores=options.blastCores,
                    memory=options.blastMemory,
                    disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(
                    toil_call_align, options, outSeqFile, project, event,
                    cigarPath(event), halPath(event), outSeqFile.pathMap[event],
                    job_idx[("blast", event)].rv(), leaf_deps + anc_deps, *fa_promises,
                    cores=options.alignCores,
                    memory=options.alignMemory,
                    disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but
                # cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event),
                    event, options.cactusOptions, get_toil_resource_opts(options, 'align'),
                    options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(
                    halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())

    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    # walk the rounds backwards so later rounds are appended before earlier ones
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        job_idx['hal_append'] = parent_job.addChildJobFn(
            toil_call_hal_append_subtrees,
            options,
            project,
            root,
            job_idx[('align', root)].rv(1),
            event_list,
            *[job_idx[('align', e)].rv(1) for e in event_list],
            cores=1,
            memory=options.alignMemory,
            disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        # run the job graph assembled above and report the wall-clock time
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))

    return plan
def run_chunked_calling(job, context, chunk_infos, genotype, recall, snarls_id, pack_support,
                        old_call, call_timers):
    """
    Spawn a calling job for each chunk then merge them together.

    One ``run_vg_call`` job is scheduled per entry of ``chunk_infos`` and a
    ``run_concat_vcfs`` follow-on merges their outputs.  Each call job's timer
    promise is appended to the ``call_timers`` list that was passed in.

    Returns ``(vcf promise, tbi promise, call_timers)``.
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    def shared_call_kwargs(info):
        # keyword arguments common to the augment-only job and the call jobs
        return dict(
            xg_id=info['xg_id'],
            path_names=[info['chrom']],
            seq_names=[info['chrom']],
            seq_offsets=[info['chunk_start'] + info['offset']],
            seq_lengths=[info['path_size']],
            chunk_name='chunk_{}_{}'.format(info['chrom'], info['chunk_start']),
            genotype=genotype,
            recall=recall,
            clip_info=info,
            pack_support=pack_support,
            alt_gam_id=info['alt_gam_id'],
            old_call=old_call,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem,
            disk=context.config.calling_disk)

    path_names = set()

    # If no chunking and many paths, we augment once first and not before calling
    # so we don't waste resources augmenting the same graph again and again
    # Note: should only do this when len(chunk_infos) > 1, but leaving as is so the tests hit it!
    augment_results = None
    if context.config.call_chunk_size == 0:
        first_info = chunk_infos[0]
        augment_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            first_info['sample'],
            first_info['vg_id'],
            first_info['gam_id'],
            augment_only=True,
            **shared_call_kwargs(first_info))
        augment_results = augment_job.rv()

        # every call job has to wait for the augmentation, so hang them off a
        # job that follows it
        next_job = Job()
        augment_job.addFollowOn(next_job)
        child_job = next_job

    clip_file_ids = []
    for chunk_info in chunk_infos:
        path_names.add(chunk_info['chrom'])

        # Run vg call
        call_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            chunk_info['sample'],
            chunk_info['vg_id'],
            chunk_info['gam_id'],
            genotype_vcf_id=chunk_info['genotype_vcf_id'],
            genotype_tbi_id=chunk_info['genotype_tbi_id'],
            snarls_id=snarls_id,
            augment_results=augment_results,
            **shared_call_kwargs(chunk_info))

        clip_file_ids.append(call_job.rv(0))
        call_timers.append(call_job.rv(1))

    # name the merged output after the chromosome when there is exactly one
    if len(path_names) == 1:
        tag = next(iter(path_names))
    else:
        tag = 'chroms'

    merge_job = child_job.addFollowOnJobFn(
        run_concat_vcfs, context, tag,
        clip_file_ids,
        cores=context.config.call_chunk_cores,
        memory=context.config.call_chunk_mem,
        disk=context.config.call_chunk_disk)

    return merge_job.rv(0), merge_job.rv(1), call_timers