def testPromisedRequirementStatic(self):
    """
    Asserts that promised core resources are allocated properly using a static DAG
    """
    for coresPerJob in self.allocatedCores:
        tempDir = self._createTempDir('testFiles')
        counterPath = self.getCounterPath(tempDir)

        root = Job()
        one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
        thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M')
        root.addChild(one)
        root.addChild(thirtyTwoMb)
        for _ in range(self.cpuCount):
            root.addFollowOn(Job.wrapFn(batchSystemTest.measureConcurrency, counterPath,
                                        cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()),
                                        memory=PromisedRequirement(thirtyTwoMb.rv()),
                                        disk='1M'))
        Job.Runner.startToil(root, self.getOptions(tempDir))
        _, maxValue = batchSystemTest.getCounters(counterPath)
        self.assertEqual(maxValue, self.cpuCount / coresPerJob)
def testConcurrencyWithDisk(self):
    """
    Tests that the batch system is allocating disk resources properly
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    from toil import physicalDisk
    availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
    options.batchSystem = self.batchSystemName

    counterPath = os.path.join(tempDir, 'counter')
    resetCounters(counterPath)
    value, maxValue = getCounters(counterPath)
    assert (value, maxValue) == (0, 0)

    root = Job()
    # Physically, we're asking for 50% of disk and 50% of disk + 500 bytes in the two jobs.
    # The batch system should not allow the 2 child jobs to run concurrently.
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=old_div(availableDisk, 2)))
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=old_div(availableDisk, 2) + 500))
    Job.Runner.startToil(root, options)
    _, maxValue = getCounters(counterPath)
    self.assertEqual(maxValue, 1)
def testConcurrencyWithDisk(self):
    """
    Tests that the batch system is allocating disk resources properly
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    from toil import physicalDisk
    availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
    options.batchSystem = self.batchSystemName

    counterPath = os.path.join(tempDir, 'counter')
    resetCounters(counterPath)
    value, maxValue = getCounters(counterPath)
    assert (value, maxValue) == (0, 0)

    root = Job()
    # Physically, we're asking for 50% of disk and 50% of disk + 500 bytes in the two jobs.
    # The batch system should not allow the 2 child jobs to run concurrently.
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=availableDisk / 2))
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=(availableDisk / 2) + 500))
    Job.Runner.startToil(root, options)
    _, maxValue = getCounters(counterPath)
    self.assertEqual(maxValue, 1)
def testNewJobsCanHandleOtherJobDeaths(self):
    """
    Create 2 non-local files and then create 2 jobs. The first job registers a deferred
    function to delete the second non-local file, deletes the first non-local file, and then
    kills itself. The second job waits for the first file to be deleted, sleeps for a few
    seconds, and then spawns a child. The child of the second job does nothing. However,
    starting it should handle the untimely demise of the first job and run the registered
    deferred function that deletes the second file. We assert the absence of the two files
    at the end of the run.
    """
    # There can be no retries
    self.options.retryCount = 0
    workdir = self._createTempDir(purpose='nonLocalDir')
    nonLocalFile1 = os.path.join(workdir, str(uuid4()))
    nonLocalFile2 = os.path.join(workdir, str(uuid4()))
    open(nonLocalFile1, 'w').close()
    open(nonLocalFile2, 'w').close()
    assert os.path.exists(nonLocalFile1)
    assert os.path.exists(nonLocalFile2)
    files = [nonLocalFile1, nonLocalFile2]
    root = Job()
    A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, files=files)
    B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B, files=files)
    C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C, files=files,
                      expectedResult=False)
    root.addChild(A)
    root.addChild(B)
    B.addChild(C)
    try:
        Job.Runner.startToil(root, self.options)
    except FailedJobsException:
        pass
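# The helper job functions referenced above (_testNewJobsCanHandleOtherJobDeaths_A/B/C) are
# defined elsewhere in the test module and are not shown here. As a rough, hypothetical sketch
# of the pattern the first job relies on (assuming Toil's job.defer() API for registering
# deferred functions), it could look something like this:
import os
import signal


def _exampleDeferredDeleteJob(job, files):
    """Hypothetical stand-in for _testNewJobsCanHandleOtherJobDeaths_A."""
    # Register a deferred function; it survives this job's death and is run later by
    # whichever worker next notices the dead job.
    job.defer(os.remove, files[1])
    # Delete the first file normally, signalling the second job that we have started.
    os.remove(files[0])
    # Die abruptly, before the deferred function can be run locally.
    os.kill(os.getpid(), signal.SIGKILL)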
def testConcurrencyWithDisk(self):
    """
    Tests that the batch system is allocating disk resources properly
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    from toil import physicalDisk
    availableDisk = physicalDisk(options.workDir)
    logger.info('Testing disk concurrency limits with %s disk space', availableDisk)
    # More disk might become available by the time Toil starts, so we limit it here
    options.maxDisk = availableDisk
    options.batchSystem = self.batchSystemName

    counterPath = os.path.join(tempDir, 'counter')
    resetCounters(counterPath)
    value, maxValue = getCounters(counterPath)
    assert (value, maxValue) == (0, 0)

    half_disk = availableDisk // 2
    more_than_half_disk = half_disk + 500
    logger.info('Dividing into parts of %s and %s', half_disk, more_than_half_disk)

    root = Job()
    # Physically, we're asking for 50% of disk and 50% of disk + 500 bytes in the two jobs.
    # The batch system should not allow the 2 child jobs to run concurrently.
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=half_disk))
    root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                             memory='1M', disk=more_than_half_disk))
    Job.Runner.startToil(root, options)
    _, maxValue = getCounters(counterPath)

    logger.info('After run: %s disk space', physicalDisk(options.workDir))

    self.assertEqual(maxValue, 1)
def testPromisedRequirementStatic(self):
    """
    Asserts that promised core resources are allocated properly using a static DAG
    """
    for coresPerJob in self.allocatedCores:
        tempDir = self._createTempDir('testFiles')
        counterPath = self.getCounterPath(tempDir)

        root = Job()
        one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
        thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M')
        root.addChild(one)
        root.addChild(thirtyTwoMb)
        for _ in range(self.cpuCount):
            root.addFollowOn(Job.wrapFn(batchSystemTest.measureConcurrency, counterPath,
                                        cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()),
                                        memory=PromisedRequirement(thirtyTwoMb.rv()),
                                        disk='1M'))
        Job.Runner.startToil(root, self.getOptions(tempDir))
        _, maxValue = batchSystemTest.getCounters(counterPath)
        self.assertEqual(maxValue, self.cpuCount / coresPerJob)
def testNestedResourcesDoNotBlock(self):
    """
    Resources are requested in the order Memory > Cpu > Disk.
    Test that unavailability of CPUs for one job that is scheduled does not block another
    job that can run.
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    options.maxCores = 4
    from toil import physicalMemory
    availableMemory = physicalMemory()
    options.batchSystem = self.batchSystemName

    outFile = os.path.join(tempDir, 'counter')
    open(outFile, 'w').close()

    root = Job()

    blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                         cores=2, memory='1M', disk='1M')
    firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                          cores=1, memory='1M', disk='1M')
    secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10, writeVal='sJ',
                           cores=1, memory='1M', disk='1M')

    # Should block off 50% of memory while waiting for its 3 cores
    firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                               writeVal='fJC', cores=3,
                               memory=int(old_div(availableMemory, 2)), disk='1M')

    # These two shouldn't be able to run before B because there should be only
    # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
    secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                writeVal='sJC', cores=2,
                                memory=int(old_div(availableMemory, 1.5)), disk='1M')
    secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                     writeVal='sJGC', cores=2,
                                     memory=int(old_div(availableMemory, 1.5)), disk='1M')

    root.addChild(blocker)
    root.addChild(firstJob)
    root.addChild(secondJob)
    firstJob.addChild(firstJobChild)
    secondJob.addChild(secondJobChild)
    secondJobChild.addChild(secondJobGrandChild)
    """
    The tree is:
                root
              /  |  \
             b   fJ  sJ
                 |    |
                 fJC  sJC
                      |
                      sJGC
    But the order of execution should be
                root > b, fJ, sJ > sJC > sJGC > fJC
    since fJC cannot run till the blocker finishes but sJC and sJGC can (fJC is blocked
    waiting for cores). If the resource acquisition is written properly, then fJC, which is
    scheduled before sJC and sJGC, should not block them, and should only run after they
    finish.
    """
    Job.Runner.startToil(root, options)
    with open(outFile) as oFH:
        outString = oFH.read()
    # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
    # time. We look for all possible permutations.
    possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
    assert outString.startswith(possibleStarts)
    assert outString.endswith('sJCsJGCfJC')
def testNestedResourcesDoNotBlock(self):
    """
    Resources are requested in the order Memory > Cpu > Disk.
    Test that unavailability of CPUs for one job that is scheduled does not block another
    job that can run.
    """
    tempDir = self._createTempDir('testFiles')

    options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
    options.workDir = tempDir
    options.maxCores = 4
    from toil import physicalMemory
    availableMemory = physicalMemory()
    options.batchSystem = self.batchSystemName

    outFile = os.path.join(tempDir, 'counter')
    open(outFile, 'w').close()

    root = Job()

    blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                         cores=2, memory='1M', disk='1M')
    firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                          cores=1, memory='1M', disk='1M')
    secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10, writeVal='sJ',
                           cores=1, memory='1M', disk='1M')

    # Should block off 50% of memory while waiting for its 3 cores
    firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                               writeVal='fJC', cores=3,
                               memory=int(availableMemory / 2), disk='1M')

    # These two shouldn't be able to run before B because there should be only
    # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
    secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                writeVal='sJC', cores=2,
                                memory=int(availableMemory / 1.5), disk='1M')
    secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                     writeVal='sJGC', cores=2,
                                     memory=int(availableMemory / 1.5), disk='1M')

    root.addChild(blocker)
    root.addChild(firstJob)
    root.addChild(secondJob)
    firstJob.addChild(firstJobChild)
    secondJob.addChild(secondJobChild)
    secondJobChild.addChild(secondJobGrandChild)
    """
    The tree is:
                root
              /  |  \
             b   fJ  sJ
                 |    |
                 fJC  sJC
                      |
                      sJGC
    But the order of execution should be
                root > b, fJ, sJ > sJC > sJGC > fJC
    since fJC cannot run till the blocker finishes but sJC and sJGC can (fJC is blocked
    waiting for cores). If the resource acquisition is written properly, then fJC, which is
    scheduled before sJC and sJGC, should not block them, and should only run after they
    finish.
    """
    Job.Runner.startToil(root, options)
    with open(outFile) as oFH:
        outString = oFH.read()
    # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
    # time. We look for all possible permutations.
    possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
    assert outString.startswith(possibleStarts)
    assert outString.endswith('sJCsJGCfJC')
def run_calleval(job, context, xg_ids, gam_ids, gam_idx_ids, bam_ids, bam_idx_ids, gam_names, bam_names,
                 vcfeval_baseline_id, vcfeval_baseline_tbi_id, caller_fasta_id, vcfeval_fasta_id,
                 bed_id, clip_only, call, sample_name, chroms, vcf_offsets, vcfeval_score_field,
                 plot_sets, surject, interleaved, freebayes, platypus, happy, sveval, recall,
                 min_sv_len, max_sv_len, sv_overlap, sv_region_overlap, normalize,
                 ins_ref_len, del_min_rol, ins_seq_comp,
                 min_mapq=0, min_baseq=0, min_augment_coverage=0):
    """
    Top-level call-eval function. Runs the caller on every gam, and freebayes on every bam. The
    resulting vcfs are put through vcfeval and the accuracies are tabulated in the output.

    Returns the output of run_calleval_results, a list of condition names, a list of corresponding
    called VCF.gz and index ID pairs, and dicts of vcfeval and happy result dicts, by condition
    name and clipped/unclipped status.

    plot_sets is a data structure of collections of conditions to plot against each other, as
    produced by parse_plot_sets.
    """

    # We store the name of each condition we run
    names = []
    # And we build up these result lists in sync with the name list
    vcf_tbi_id_pairs = []
    timing_results = []

    # Here we accumulate vcf_eval comparison results in a dict by condition name, then clipping
    # status ("clipped", "unclipped"). Each contained dict is the output dict from run_vcfeval
    eval_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_happy
    happy_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_sveval
    sveval_results = collections.defaultdict(dict)

    # Some prep work (surjection and truth extraction) will happen under this head job
    head_job = Job()
    job.addChild(head_job)

    # Most of our work will run under this child job
    child_job = Job()
    head_job.addFollowOn(child_job)

    # We always extract a single-sample VCF from the truth, to save time
    # picking through all its samples multiple times over later. This should
    # also save memory. TODO: should we define a separate disk/memory requirement set?
    sample_extract_job = head_job.addChildJobFn(run_make_control_vcfs, context,
                                                vcfeval_baseline_id, 'baseline.vcf.gz',
                                                vcfeval_baseline_tbi_id, sample_name,
                                                pos_only=True,
                                                no_filter_if_sample_not_found=True,
                                                cores=context.config.vcfeval_cores,
                                                memory=context.config.vcfeval_mem,
                                                disk=context.config.vcfeval_disk)
    truth_vcf_id = sample_extract_job.rv(0)
    truth_vcf_tbi_id = sample_extract_job.rv(1)

    if not gam_idx_ids:
        gam_idx_ids = [None] * len(gam_ids)
    assert len(gam_idx_ids) == len(gam_ids)

    if surject:
        # optionally surject all the gams into bams
        for xg_id, gam_name, gam_id in zip(xg_ids, gam_names, gam_ids):
            surject_job = head_job.addChildJobFn(run_surjecting, context, gam_id,
                                                 gam_name + '-surject', interleaved, xg_id, chroms,
                                                 cores=context.config.misc_cores,
                                                 memory=context.config.misc_mem,
                                                 disk=context.config.misc_disk)
            bam_ids.append(surject_job.rv())
            bam_idx_ids.append(None)
            bam_names.append(gam_name + '-surject')

    if bam_ids:
        for bam_id, bam_idx_id, bam_name in zip(bam_ids, bam_idx_ids, bam_names):
            if not bam_idx_id:
                bam_index_job = child_job.addChildJobFn(run_bam_index, context, bam_id, bam_name,
                                                        cores=context.config.calling_cores,
                                                        memory=context.config.calling_mem,
                                                        disk=context.config.calling_disk)
                sorted_bam_id = bam_index_job.rv(0)
                sorted_bam_idx_id = bam_index_job.rv(1)
            else:
                bam_index_job = Job()
                child_job.addChild(bam_index_job)
                sorted_bam_id = bam_id
                sorted_bam_idx_id = bam_idx_id

            bam_caller_infos = []
            if freebayes:
                bam_caller_infos.append(('freebayes', ['--genotype-qualities'], '-fb'))
            if platypus:
                bam_caller_infos.append(('platypus', ['--mergeClusteredVariants=1'], '-plat'))

            for bam_caller, bam_caller_opts, bam_caller_tag in bam_caller_infos:
                bam_caller_out_name = '{}{}'.format(bam_name, bam_caller_tag)
                bam_caller_job = bam_index_job.addFollowOnJobFn(
                    run_all_bam_caller, context, caller_fasta_id,
                    sorted_bam_id, sorted_bam_idx_id,
                    sample_name, chroms, vcf_offsets,
                    out_name=bam_caller_out_name,
                    bam_caller=bam_caller,
                    bam_caller_opts=bam_caller_opts,
                    cores=context.config.misc_cores,
                    memory=context.config.misc_mem,
                    disk=context.config.misc_disk)
                bam_caller_vcf_tbi_id_pair = (bam_caller_job.rv(0), bam_caller_job.rv(1))
                timing_result = bam_caller_job.rv(2)

                if bed_id:
                    eval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                            out_name=bam_caller_out_name, score_field='GQ',
                            cores=context.config.vcfeval_cores,
                            memory=context.config.vcfeval_mem,
                            disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["clipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id,
                                bed_id, out_name=bam_caller_out_name,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()
                    if sveval:
                        sveval_results[bam_caller_out_name]["clipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=bed_id, ins_ref_len=ins_ref_len,
                                del_min_rol=del_min_rol, ins_seq_comp=ins_seq_comp,
                                out_name=bam_caller_out_name,
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                            out_name=bam_caller_out_name if not bed_id
                            else bam_caller_out_name + '-unclipped',
                            score_field='GQ',
                            cores=context.config.vcfeval_cores,
                            memory=context.config.vcfeval_mem,
                            disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["unclipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                                out_name=bam_caller_out_name if not bed_id
                                else bam_caller_out_name + '-unclipped',
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()
                    if sveval:
                        sveval_results[bam_caller_out_name]["unclipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=None, ins_ref_len=ins_ref_len,
                                del_min_rol=del_min_rol, ins_seq_comp=ins_seq_comp,
                                out_name=bam_caller_out_name if not bed_id
                                else bam_caller_out_name + '-unclipped',
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()

                vcf_tbi_id_pairs.append(bam_caller_vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(bam_caller_out_name)

    if gam_ids:
        for gam_id, gam_idx_id, gam_name, xg_id in zip(gam_ids, gam_idx_ids, gam_names, xg_ids):
            if call:
                out_name = '{}{}'.format(gam_name, '-call')

                if context.config.filter_opts:
                    filter_job = Job.wrapJobFn(run_filtering, context,
                                               graph_id=xg_id, graph_basename='graph.xg',
                                               gam_id=gam_id, gam_basename='aln.gam',
                                               filter_opts=context.config.filter_opts,
                                               cores=context.config.calling_cores,
                                               memory=context.config.calling_mem,
                                               disk=context.config.calling_disk)
                    gam_id = filter_job.rv()

                call_job = Job.wrapJobFn(run_chunked_calling, context,
                                         graph_id=xg_id, graph_basename='graph.xg',
                                         gam_id=gam_id, gam_basename='aln.gam',
                                         batch_input=None, snarls_id=None,
                                         genotype_vcf_id=None, genotype_tbi_id=None,
                                         sample=sample_name, augment=not recall,
                                         connected_component_chunking=False,
                                         output_format='pg',
                                         min_augment_coverage=min_augment_coverage,
                                         expected_coverage=None,
                                         min_mapq=min_mapq, min_baseq=min_baseq,
                                         ref_paths=chroms, ref_path_chunking=False,
                                         min_call_support=None, vcf_offsets=vcf_offsets,
                                         cores=context.config.misc_cores,
                                         memory=context.config.misc_mem,
                                         disk=context.config.misc_disk)

                if context.config.filter_opts:
                    child_job.addChild(filter_job)
                    filter_job.addFollowOn(call_job)
                else:
                    child_job.addChild(call_job)

                vcf_tbi_id_pair = (call_job.rv(0), call_job.rv(1))
                # timing_result = call_job.rv(2)
                timing_result = TimeTracker()

                if not vcfeval_score_field:
                    score_field = 'QUAL'
                else:
                    score_field = vcfeval_score_field

                if bed_id:
                    eval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                            out_name=out_name, score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["clipped"] = \
                            call_job.addFollowOnJobFn(
                                run_happy, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id,
                                bed_id, out_name=out_name).rv()
                    if sveval:
                        sveval_results[out_name]["clipped"] = \
                            call_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                ins_ref_len=ins_ref_len, del_min_rol=del_min_rol,
                                ins_seq_comp=ins_seq_comp, bed_id=bed_id, out_name=out_name,
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                            out_name=out_name if not bed_id else out_name + '-unclipped',
                            score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["unclipped"] = \
                            call_job.addFollowOnJobFn(
                                run_happy, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                                out_name=out_name if not bed_id else out_name + '-unclipped').rv()
                    if sveval:
                        sveval_results[out_name]["unclipped"] = \
                            call_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=None, ins_ref_len=ins_ref_len,
                                del_min_rol=del_min_rol, ins_seq_comp=ins_seq_comp,
                                out_name=out_name if not bed_id else out_name + '-unclipped',
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize).rv()

                vcf_tbi_id_pairs.append(vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(out_name)

    calleval_results = child_job.addFollowOnJobFn(run_calleval_results, context, names,
                                                  vcf_tbi_id_pairs, eval_results, happy_results,
                                                  sveval_results, timing_results, plot_sets,
                                                  cores=context.config.misc_cores,
                                                  memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()

    return calleval_results, names, vcf_tbi_id_pairs, eval_results
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # A is a job with children and follow-ons, for example:
    A = Job()
    A.addChild(Job())
    A.addFollowOn(Job())

    # B is a job which needs to run after A and its successors
    B = Job()

    # The way to do this without encapsulation is to make a parent of A, Ap,
    # and make B a follow-on of Ap.
    Ap = Job()
    Ap.addChild(A)
    Ap.addFollowOn(B)

    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as toil:
        print(toil.start(Ap))
def run_chunked_calling(job, context, graph_id, graph_basename, gam_id, gam_basename,
                        batch_input=None, snarls_id=None, genotype_vcf_id=None,
                        genotype_tbi_id=None, sample=None, augment=False,
                        connected_component_chunking=False, output_format=None,
                        min_augment_coverage=None, expected_coverage=None, min_mapq=None,
                        min_baseq=None, ref_paths=[], ref_path_chunking=True,
                        min_call_support=None, vcf_offsets={}, gam_chunking=False):

    # simple way to keep follow-ons down the tree
    child_job = Job()
    job.addChild(child_job)

    out_vcf_name = remove_ext(graph_basename)
    if sample:
        out_vcf_name += '_' + sample

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if connected_component_chunking or ref_path_chunking:
            chunk_job = child_job.addChildJobFn(
                run_chunking, context,
                graph_id=graph_id, graph_basename=graph_basename,
                chunk_paths=ref_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id if gam_chunking else None,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_calling, context,
                graph_id=None, graph_basename=graph_basename,
                gam_id=gam_id, gam_basename=gam_basename,
                batch_input=batch_input,
                snarls_id=snarls_id,
                genotype_vcf_id=genotype_vcf_id,
                genotype_tbi_id=genotype_tbi_id,
                sample=sample,
                augment=augment,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq, min_baseq=min_baseq,
                ref_paths=ref_paths,
                ref_path_chunking=ref_path_chunking,
                min_call_support=min_call_support,
                vcf_offsets=vcf_offsets,
                gam_chunking=gam_chunking)
            return recurse_job.rv()
        else:
            # convert if we're augmenting and not chunking
            if augment and os.path.splitext(graph_basename)[1] != '.' + output_format:
                convert_job = child_job.addChildJobFn(
                    run_convert, context,
                    graph_id=graph_id, graph_basename=graph_basename,
                    output_format=output_format,
                    disk=context.config.calling_disk)
                graph_id = convert_job.rv()
                graph_basename = os.path.splitext(graph_basename)[0] + '.' + output_format

                # todo: clean up
                next_job = Job()
                child_job.addFollowOn(next_job)
                child_job = next_job

            # phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the calling on each chunk
    assert batch_input

    call_results = []

    in_gam_id = gam_id
    in_gam_basename = gam_basename

    for chunk_name, chunk_results in list(batch_input.items()):
        calling_root_job = Job()
        child_job.addChild(calling_root_job)

        graph_id = chunk_results[0]
        graph_basename = chunk_results[1]
        if gam_chunking:
            gam_id = chunk_results[2]
            gam_basename = chunk_results[3]
        else:
            gam_id = in_gam_id
            gam_basename = in_gam_basename

        if augment:
            augment_job = calling_root_job.addChildJobFn(
                run_augmenting, context,
                graph_id=graph_id, graph_basename=graph_basename,
                gam_id=gam_id, gam_basename=gam_basename,
                augment_gam=True,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq, min_baseq=min_baseq,
                to_outstore=True,
                cores=context.config.augment_cores,
                memory=context.config.augment_mem,
                disk=context.config.augment_disk)
            graph_id = augment_job.rv(0)
            graph_basename = os.path.splitext(graph_basename)[0] + '-aug' + \
                os.path.splitext(graph_basename)[1]
            gam_id = augment_job.rv(1)
            gam_basename = os.path.splitext(gam_basename)[0] + '-aug' + \
                os.path.splitext(gam_basename)[1]

        # When path chunking, we subset our reference paths down to the current path
        if ref_path_chunking:
            ref_path = [chunk_name]
        else:
            ref_path = ref_paths

        calling_job = calling_root_job.addFollowOnJobFn(
            run_calling, context,
            graph_id=graph_id, graph_basename=graph_basename,
            gam_id=gam_id, gam_basename=gam_basename,
            snarls_id=snarls_id,
            genotype_vcf_id=genotype_vcf_id,
            genotype_tbi_id=genotype_tbi_id,
            sample=sample,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            ref_paths=ref_path,
            min_call_support=min_call_support,
            vcf_offsets=vcf_offsets,
            to_outstore=False,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem,
            disk=context.config.calling_disk)

        call_results.append((chunk_name, calling_job.rv()))

    concat_job = child_job.addFollowOnJobFn(run_concat_vcfs, context,
                                            out_name=out_vcf_name,
                                            vcf_ids=None, tbi_ids=None,
                                            write_to_outstore=True,
                                            call_timers_lists=[],
                                            batch_data=call_results)

    return concat_job.rv()
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # A
    A = Job()
    A.addChild(Job())
    A.addFollowOn(Job())

    # Encapsulate A
    A = A.encapsulate()

    # B is a job which needs to run after A and its successors
    B = Job()

    # With encapsulation A and its successor subgraph appear to be a single job, hence:
    A.addChild(B)

    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as toil:
        print(toil.start(A))