Example #1
        def testPromisedRequirementStatic(self):
            """
            Asserts that promised core resources are allocated properly using a static DAG
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job()
                one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
                thirtyTwoMb = Job.wrapFn(getThirtyTwoMb,
                                         cores=0.1,
                                         memory='32M',
                                         disk='1M')
                root.addChild(one)
                root.addChild(thirtyTwoMb)
                for _ in range(self.cpuCount):
                    root.addFollowOn(
                        Job.wrapFn(batchSystemTest.measureConcurrency,
                                   counterPath,
                                   cores=PromisedRequirement(
                                       lambda x: x * coresPerJob, one.rv()),
                                   memory=PromisedRequirement(
                                       thirtyTwoMb.rv()),
                                   disk='1M'))
                Job.Runner.startToil(root, self.getOptions(tempDir))
                _, maxValue = batchSystemTest.getCounters(counterPath)
                self.assertEqual(maxValue, self.cpuCount / coresPerJob)
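A minimal, self-contained sketch of the PromisedRequirement pattern exercised above (the job store path and function names are illustrative, not from the test suite): a child job returns a value, and a follow-on's core requirement is computed from that promise once it resolves.

from toil.job import Job, PromisedRequirement

def get_cores():
    # Value produced at run time; the follow-on's requirement depends on it.
    return 1

def work(name):
    return 'ran ' + name

if __name__ == "__main__":
    root = Job()
    producer = Job.wrapFn(get_cores, cores=0.1, memory='32M', disk='1M')
    root.addChild(producer)
    # cores is evaluated as lambda(producer's return value) once producer has run.
    root.addFollowOn(Job.wrapFn(work, 'consumer',
                                cores=PromisedRequirement(lambda n: n * 2, producer.rv()),
                                memory='32M', disk='1M'))
    options = Job.Runner.getDefaultOptions('./promisedRequirementStore')
    Job.Runner.startToil(root, options)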
Example #2
    def testConcurrencyWithDisk(self):
        """
        Tests that the batch system is allocating disk resources properly
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        from toil import physicalDisk
        availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
        options.batchSystem = self.batchSystemName

        counterPath = os.path.join(tempDir, 'counter')
        resetCounters(counterPath)
        value, maxValue = getCounters(counterPath)
        assert (value, maxValue) == (0, 0)

        root = Job()
        # We ask for 50% of the disk in one job and 50% of the disk + 500 bytes in the other. The
        # batch system should not allow the 2 child jobs to run concurrently.
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                 memory='1M', disk=old_div(availableDisk, 2)))
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                 memory='1M', disk=old_div(availableDisk, 2) + 500))
        Job.Runner.startToil(root, options)
        _, maxValue = getCounters(counterPath)
        self.assertEqual(maxValue, 1)
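The helpers resetCounters, getCounters, and measureConcurrency come from Toil's batch system test module and are not shown here. A hypothetical sketch of the idea behind them (not the repo's actual implementation): each job bumps a shared counter file on entry, records the running peak, and decrements on exit, so maxValue reports how many jobs ever overlapped.

import fcntl
import time

def reset_counters(counter_path):
    with open(counter_path, 'w') as f:
        f.write('0,0')

def bump(counter_path, delta):
    # Atomically update the "current,peak" pair stored in the counter file.
    with open(counter_path, 'r+') as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        current, peak = (int(v) for v in f.read().split(','))
        current += delta
        peak = max(peak, current)
        f.seek(0)
        f.truncate()
        f.write('%d,%d' % (current, peak))
        fcntl.flock(f, fcntl.LOCK_UN)

def measure_concurrency(counter_path, sleep_time=10):
    bump(counter_path, +1)
    time.sleep(sleep_time)  # hold the job's resources long enough for overlaps to show up
    bump(counter_path, -1)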
Example #3
    def testConcurrencyWithDisk(self):
        """
        Tests that the batch system is allocating disk resources properly
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        from toil import physicalDisk
        availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
        options.batchSystem = self.batchSystemName

        counterPath = os.path.join(tempDir, 'counter')
        resetCounters(counterPath)
        value, maxValue = getCounters(counterPath)
        assert (value, maxValue) == (0, 0)

        root = Job()
        # We ask for 50% of the disk in one job and 50% of the disk + 500 bytes in the other. The
        # batch system should not allow the 2 child jobs to run concurrently.
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                 memory='1M', disk=availableDisk // 2))
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                 memory='1M', disk=availableDisk // 2 + 500))
        Job.Runner.startToil(root, options)
        _, maxValue = getCounters(counterPath)
        self.assertEqual(maxValue, 1)
Example #4
 def testNewJobsCanHandleOtherJobDeaths(self):
     """
     Create 2 non-local files and then create 2 jobs. The first job registers a deferred
     function to delete the second non-local file, deletes the first non-local file, and then
     kills itself.  The second job waits for the first file to be deleted, then sleeps for a
     few seconds and spawns a child. The child of the second job does nothing. However,
     starting it should handle the untimely demise of the first job and run the registered
     deferred function that deletes the second file.  We assert the absence of the two files
     at the end of the run.
     """
     # There can be no retries
     self.options.retryCount = 0
     workdir = self._createTempDir(purpose='nonLocalDir')
     nonLocalFile1 = os.path.join(workdir, str(uuid4()))
     nonLocalFile2 = os.path.join(workdir, str(uuid4()))
     open(nonLocalFile1, 'w').close()
     open(nonLocalFile2, 'w').close()
     assert os.path.exists(nonLocalFile1)
     assert os.path.exists(nonLocalFile2)
     files = [nonLocalFile1, nonLocalFile2]
     root = Job()
     A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, files=files)
     B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B, files=files)
     C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C,
                       files=files,
                       expectedResult=False)
     root.addChild(A)
     root.addChild(B)
     B.addChild(C)
     try:
         Job.Runner.startToil(root, self.options)
     except FailedJobsException:
         # Job A kills itself, so the workflow is expected to fail.
         pass
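The deferred-function machinery this test depends on is Toil's job.defer API. A minimal sketch (file path and function names are illustrative) of registering a cleanup that still runs when the registering job dies before finishing:

import os
from toil.common import Toil
from toil.job import Job

def cleanup(path):
    # Deferred function: remove the file even though the registering job failed.
    if os.path.exists(path):
        os.remove(path)

def fragile_job(job, path):
    # Register the cleanup before doing anything risky.
    job.defer(cleanup, path)
    raise RuntimeError('simulated job death')

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions('./deferredFunctionStore')
    options.retryCount = 0
    scratch_file = os.path.join('/tmp', 'needs-cleanup')
    open(scratch_file, 'w').close()
    try:
        with Toil(options) as toil:
            toil.start(Job.wrapJobFn(fragile_job, scratch_file))
    except Exception:
        pass  # the workflow fails, but cleanup(scratch_file) should have run
    assert not os.path.exists(scratch_file)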
Example #5
    def testConcurrencyWithDisk(self):
        """
        Tests that the batch system is allocating disk resources properly
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        from toil import physicalDisk
        availableDisk = physicalDisk(options.workDir)
        logger.info('Testing disk concurrency limits with %s disk space',
                    availableDisk)
        # More disk might become available by the time Toil starts, so we limit it here
        options.maxDisk = availableDisk
        options.batchSystem = self.batchSystemName

        counterPath = os.path.join(tempDir, 'counter')
        resetCounters(counterPath)
        value, maxValue = getCounters(counterPath)
        assert (value, maxValue) == (0, 0)

        half_disk = availableDisk // 2
        more_than_half_disk = half_disk + 500
        logger.info('Dividing into parts of %s and %s', half_disk,
                    more_than_half_disk)

        root = Job()
        # We ask for 50% of the disk in one job and 50% of the disk + 500 bytes in the other. The
        # batch system should not allow the 2 child jobs to run concurrently.
        root.addChild(
            Job.wrapFn(measureConcurrency,
                       counterPath,
                       self.sleepTime,
                       cores=1,
                       memory='1M',
                       disk=half_disk))
        root.addChild(
            Job.wrapFn(measureConcurrency,
                       counterPath,
                       self.sleepTime,
                       cores=1,
                       memory='1M',
                       disk=more_than_half_disk))
        Job.Runner.startToil(root, options)
        _, maxValue = getCounters(counterPath)

        logger.info('After run: %s disk space', physicalDisk(options.workDir))

        self.assertEqual(maxValue, 1)
Example #6
        def testPromisedRequirementStatic(self):
            """
            Asserts that promised core resources are allocated properly using a static DAG
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job()
                one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
                thirtyTwoMb = Job.wrapFn(getThirtyTwoMb, cores=0.1, memory='32M', disk='1M')
                root.addChild(one)
                root.addChild(thirtyTwoMb)
                for _ in range(self.cpuCount):
                    root.addFollowOn(Job.wrapFn(batchSystemTest.measureConcurrency, counterPath,
                                                cores=PromisedRequirement(lambda x: x * coresPerJob, one.rv()),
                                                memory=PromisedRequirement(thirtyTwoMb.rv()),
                                                disk='1M'))
                Job.Runner.startToil(root, self.getOptions(tempDir))
                _, maxValue = batchSystemTest.getCounters(counterPath)
                self.assertEqual(maxValue, self.cpuCount / coresPerJob)
Example #7
    def testNestedResourcesDoNotBlock(self):
        """
        Resources are requested in the order Memory > CPU > Disk.
        Test that unavailability of CPUs for one job that is scheduled does not block another job
        that can run.
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        options.maxCores = 4
        from toil import physicalMemory
        availableMemory = physicalMemory()
        options.batchSystem = self.batchSystemName

        outFile = os.path.join(tempDir, 'counter')
        open(outFile, 'w').close()

        root = Job()

        blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                             cores=2, memory='1M', disk='1M')
        firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                              cores=1, memory='1M', disk='1M')
        secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                               writeVal='sJ', cores=1, memory='1M', disk='1M')

        # Should block off 50% of memory while waiting for its 3 cores
        firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                                   writeVal='fJC', cores=3, memory=int(old_div(availableMemory,2)), disk='1M')

        # These two shouldn't be able to run before B because there should be only
        # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
        secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                    writeVal='sJC', cores=2, memory=int(old_div(availableMemory,1.5)),
                                    disk='1M')
        secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                         writeVal='sJGC', cores=2, memory=int(old_div(availableMemory,1.5)),
                                         disk='1M')

        root.addChild(blocker)
        root.addChild(firstJob)
        root.addChild(secondJob)

        firstJob.addChild(firstJobChild)
        secondJob.addChild(secondJobChild)

        secondJobChild.addChild(secondJobGrandChild)
        """
        The tree is:
                    root
                  /   |   \
                 b    fJ   sJ
                      |    |
                      fJC  sJC
                           |
                           sJGC
        But the order of execution should be
        root > b, fJ, sJ > sJC > sJGC > fJC
        since fJC cannot run till b finishes, but sJC and sJGC can (fJC is blocked on cores). If the
        resource acquisition is written properly, then fJC, which is scheduled before sJC and sJGC,
        should not block them, and should only run after they finish.
        """
        Job.Runner.startToil(root, options)
        with open(outFile) as oFH:
            outString = oFH.read()
        # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
        # time. We look for all possible permutations.
        possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
        assert outString.startswith(possibleStarts)
        assert outString.endswith('sJCsJGCfJC')
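_resourceBlockTestAuxFn is defined elsewhere in the test module; all the assertions above rely on is that it records its tag in start order. A hypothetical sketch of what it is assumed to do: append writeVal to the shared output file as soon as the job starts, then hold the job's resources by sleeping.

import time

def _resourceBlockTestAuxFn(outFile, sleepTime, writeVal):
    # Record that this job has started, then keep its cores/memory busy for sleepTime seconds.
    with open(outFile, 'a') as handle:
        handle.write(writeVal)
    time.sleep(sleepTime)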
Example #8
    def testNestedResourcesDoNotBlock(self):
        """
        Resources are requested in the order Memory > CPU > Disk.
        Test that unavailability of CPUs for one job that is scheduled does not block another job
        that can run.
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        options.maxCores = 4
        from toil import physicalMemory
        availableMemory = physicalMemory()
        options.batchSystem = self.batchSystemName

        outFile = os.path.join(tempDir, 'counter')
        open(outFile, 'w').close()

        root = Job()

        blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                             cores=2, memory='1M', disk='1M')
        firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                              cores=1, memory='1M', disk='1M')
        secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                               writeVal='sJ', cores=1, memory='1M', disk='1M')

        # Should block off 50% of memory while waiting for its 3 cores
        firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                                   writeVal='fJC', cores=3, memory=int(availableMemory/2), disk='1M')

        # These two shouldn't be able to run before B because there should be only
        # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
        secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                    writeVal='sJC', cores=2, memory=int(availableMemory/1.5),
                                    disk='1M')
        secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                         writeVal='sJGC', cores=2, memory=int(availableMemory/1.5),
                                         disk='1M')

        root.addChild(blocker)
        root.addChild(firstJob)
        root.addChild(secondJob)

        firstJob.addChild(firstJobChild)
        secondJob.addChild(secondJobChild)

        secondJobChild.addChild(secondJobGrandChild)
        """
        The tree is:
                    root
                  /   |   \
                 b    fJ   sJ
                      |    |
                      fJC  sJC
                           |
                           sJGC
        But the order of execution should be
        root > b, fJ, sJ > sJC > sJGC > fJC
        since fJC cannot run till b finishes, but sJC and sJGC can (fJC is blocked on cores). If the
        resource acquisition is written properly, then fJC, which is scheduled before sJC and sJGC,
        should not block them, and should only run after they finish.
        """
        Job.Runner.startToil(root, options)
        with open(outFile) as oFH:
            outString = oFH.read()
        # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
        # time. We look for all possible permutations.
        possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
        assert outString.startswith(possibleStarts)
        assert outString.endswith('sJCsJGCfJC')
Example #9
def run_calleval(job, context, xg_ids, gam_ids, gam_idx_ids, bam_ids, bam_idx_ids, gam_names, bam_names,
                 vcfeval_baseline_id, vcfeval_baseline_tbi_id, caller_fasta_id, vcfeval_fasta_id,
                 bed_id, clip_only, call, sample_name, chroms, vcf_offsets,
                 vcfeval_score_field, plot_sets, surject, interleaved,
                 freebayes, platypus, happy, sveval, recall, min_sv_len, max_sv_len, sv_overlap,
                 sv_region_overlap, normalize, ins_ref_len, del_min_rol, ins_seq_comp,
                 min_mapq=0, min_baseq=0, min_augment_coverage=0):
    """
    Top-level call-eval function. Runs the caller on every
    gam, and freebayes on every bam. The resulting VCFs are put through
    vcfeval and the accuracies are tabulated in the output.

    Returns the output of run_calleval_results, a list of condition names, a
    list of corresponding called VCF.gz and index ID pairs, and dicts of
    vcfeval and happy result dicts, by condition name and clipped/unclipped
    status.

    plot_sets is a data structure of collections of conditions to plot against
    each other, as produced by parse_plot_sets.
    
    """
    
    # We store the name of each condition we run
    names = []
    # And we build up these result lists in sync with the name list
    vcf_tbi_id_pairs = []
    timing_results = []
    
    # Here we accumulate vcf_eval comparison results in a dict by condition name, then clipping status ("clipped", "unclipped").
    # Each contained dict is the output dict from run_vcfeval
    eval_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_happy
    happy_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_sveval
    sveval_results = collections.defaultdict(dict)

    # Some prep work (surjection and truth extraction) will happen under this head job
    head_job = Job()
    job.addChild(head_job)

    # Most of our work will run under this child job
    child_job = Job()
    head_job.addFollowOn(child_job)
    
    
    # We always extract a single-sample VCF from the truth, to save time
    # picking through all its samples multiple times over later. This should
    # also save memory. TODO: should we define a separate disk/memory requirement set?
    sample_extract_job = head_job.addChildJobFn(run_make_control_vcfs, context, vcfeval_baseline_id, 'baseline.vcf.gz',
                                                vcfeval_baseline_tbi_id, sample_name, pos_only = True,
                                                no_filter_if_sample_not_found = True,
                                                cores=context.config.vcfeval_cores,
                                                memory=context.config.vcfeval_mem,
                                                disk=context.config.vcfeval_disk)

    truth_vcf_id = sample_extract_job.rv(0)
    truth_vcf_tbi_id = sample_extract_job.rv(1)

    if not gam_idx_ids:
        gam_idx_ids = [None] * len(gam_ids)
    assert len(gam_idx_ids) == len(gam_ids)
    
    if surject:
        # optionally surject all the gams into bams
        for xg_id, gam_name, gam_id in zip(xg_ids, gam_names, gam_ids):
            surject_job = head_job.addChildJobFn(run_surjecting, context, gam_id, gam_name + '-surject',
                                                 interleaved, xg_id, chroms, cores=context.config.misc_cores,
                                                 memory=context.config.misc_mem, disk=context.config.misc_disk)
            bam_ids.append(surject_job.rv())
            bam_idx_ids.append(None)
            bam_names.append(gam_name + '-surject')
    
    if bam_ids:
        for bam_id, bam_idx_id, bam_name in zip(bam_ids, bam_idx_ids, bam_names):
            if not bam_idx_id:
                bam_index_job = child_job.addChildJobFn(run_bam_index, context, bam_id, bam_name,
                                                        cores=context.config.calling_cores,
                                                        memory=context.config.calling_mem,
                                                        disk=context.config.calling_disk)
                sorted_bam_id = bam_index_job.rv(0)
                sorted_bam_idx_id = bam_index_job.rv(1)
            else:
                bam_index_job = Job()
                child_job.addChild(bam_index_job)
                sorted_bam_id = bam_id
                sorted_bam_idx_id = bam_idx_id                

            bam_caller_infos = []
            if freebayes:
                bam_caller_infos.append(('freebayes', ['--genotype-qualities'], '-fb'))
            if platypus:
                bam_caller_infos.append(('platypus', ['--mergeClusteredVariants=1'], '-plat'))
                
            for bam_caller, bam_caller_opts, bam_caller_tag in bam_caller_infos:

                bam_caller_out_name = '{}{}'.format(bam_name, bam_caller_tag)
                bam_caller_job = bam_index_job.addFollowOnJobFn(run_all_bam_caller, context, caller_fasta_id,
                                                                sorted_bam_id, sorted_bam_idx_id, sample_name,
                                                                chroms, vcf_offsets,
                                                                out_name = bam_caller_out_name,
                                                                bam_caller = bam_caller,
                                                                bam_caller_opts = bam_caller_opts,
                                                                cores=context.config.misc_cores,
                                                                memory=context.config.misc_mem,
                                                                disk=context.config.misc_disk)
                bam_caller_vcf_tbi_id_pair = (bam_caller_job.rv(0), bam_caller_job.rv(1))
                timing_result = bam_caller_job.rv(2)

                if bed_id:

                    eval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, bed_id, out_name=bam_caller_out_name,
                                                        score_field='GQ', cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, bed_id, out_name=bam_caller_out_name,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()

                    if sveval:
                        sveval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id,
                                                        min_sv_len=min_sv_len,
                                                        max_sv_len=max_sv_len,
                                                        sv_overlap=sv_overlap,
                                                        sv_region_overlap=sv_region_overlap,
                                                        bed_id=bed_id,
                                                        ins_ref_len=ins_ref_len,
                                                        del_min_rol=del_min_rol,
                                                        ins_seq_comp=ins_seq_comp, 
                                                        out_name=bam_caller_out_name,
                                                        fasta_path = 'ref.fasta',
                                                        fasta_id = vcfeval_fasta_id,
                                                        normalize = normalize,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()                    

                if not clip_only:
                    # Also do unclipped

                    eval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, None,
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        score_field='GQ', cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, None,
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()

                    if sveval:
                        sveval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id,
                                                        min_sv_len=min_sv_len,
                                                        max_sv_len=max_sv_len,
                                                        sv_overlap=sv_overlap,
                                                        sv_region_overlap=sv_region_overlap,
                                                        bed_id=None,
                                                        ins_ref_len=ins_ref_len,
                                                        del_min_rol=del_min_rol,
                                                        ins_seq_comp=ins_seq_comp, 
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        fasta_path = 'ref.fasta',
                                                        fasta_id = vcfeval_fasta_id,
                                                        normalize = normalize,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()                        

                vcf_tbi_id_pairs.append(bam_caller_vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(bam_caller_out_name)

    if gam_ids:
        for gam_id, gam_idx_id, gam_name, xg_id in zip(gam_ids, gam_idx_ids, gam_names, xg_ids):
            if call:
                out_name = '{}{}'.format(gam_name, '-call')
                
                if context.config.filter_opts:
                    filter_job = Job.wrapJobFn(run_filtering, context,
                                               graph_id=xg_id,
                                               graph_basename = 'graph.xg',
                                               gam_id=gam_id,
                                               gam_basename = 'aln.gam',
                                               filter_opts = context.config.filter_opts,
                                               cores=context.config.calling_cores,
                                               memory=context.config.calling_mem,
                                               disk=context.config.calling_disk)
                    gam_id = filter_job.rv()
                
                call_job = Job.wrapJobFn(run_chunked_calling, context,
                                         graph_id=xg_id,
                                         graph_basename='graph.xg',
                                         gam_id=gam_id,
                                         gam_basename='aln.gam',
                                         batch_input=None,
                                         snarls_id=None,
                                         genotype_vcf_id=None,
                                         genotype_tbi_id=None,
                                         sample=sample_name,
                                         augment=not recall,
                                         connected_component_chunking=False,
                                         output_format='pg',
                                         min_augment_coverage=min_augment_coverage,
                                         expected_coverage=None,
                                         min_mapq=min_mapq,
                                         min_baseq=min_baseq,
                                         ref_paths=chroms,
                                         ref_path_chunking=False,
                                         min_call_support=None,
                                         vcf_offsets=vcf_offsets,
                                         cores=context.config.misc_cores,
                                         memory=context.config.misc_mem,
                                         disk=context.config.misc_disk)

                if context.config.filter_opts:
                    child_job.addChild(filter_job)
                    filter_job.addFollowOn(call_job)
                else:
                    child_job.addChild(call_job)
                    
                vcf_tbi_id_pair = (call_job.rv(0), call_job.rv(1))
                #timing_result = call_job.rv(2)
                timing_result = TimeTracker()

                if not vcfeval_score_field:
                    score_field = 'QUAL'
                else:
                    score_field = vcfeval_score_field

                if bed_id:
                    eval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, bed_id, out_name=out_name,
                                                  score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_happy, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, bed_id, out_name=out_name).rv()

                    if sveval:
                        sveval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id,
                                                  min_sv_len=min_sv_len,
                                                  max_sv_len=max_sv_len,
                                                  sv_overlap=sv_overlap,
                                                  sv_region_overlap=sv_region_overlap,
                                                  ins_ref_len=ins_ref_len,
                                                  del_min_rol=del_min_rol,
                                                  ins_seq_comp=ins_seq_comp, 
                                                  bed_id = bed_id, out_name=out_name,
                                                  fasta_path = 'ref.fasta',
                                                  fasta_id = vcfeval_fasta_id,
                                                  normalize = normalize).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, None,
                                                  out_name=out_name if not bed_id else out_name + '-unclipped',
                                                  score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_happy, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, None,
                                                  out_name=out_name if not bed_id else out_name + '-unclipped').rv()

                    if sveval:
                        sveval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id,
                                                  min_sv_len=min_sv_len,
                                                  max_sv_len=max_sv_len,
                                                  sv_overlap=sv_overlap,
                                                  sv_region_overlap=sv_region_overlap,
                                                  bed_id = None,
                                                  ins_ref_len=ins_ref_len,
                                                  del_min_rol=del_min_rol,
                                                  ins_seq_comp=ins_seq_comp, 
                                                  out_name=out_name if not bed_id else out_name + '-unclipped',
                                                  fasta_path = 'ref.fasta',
                                                  fasta_id = vcfeval_fasta_id,
                                                  normalize = normalize).rv()
                            
                    
                vcf_tbi_id_pairs.append(vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(out_name)            


    calleval_results = child_job.addFollowOnJobFn(run_calleval_results, context, names,
                                                  vcf_tbi_id_pairs, eval_results, happy_results, sveval_results,
                                                  timing_results, plot_sets,
                                                  cores=context.config.misc_cores,
                                                  memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()

    return calleval_results, names, vcf_tbi_id_pairs, eval_results
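The empty head_job / child_job pair above is a common Toil scaffolding trick: prep work hangs under head_job, and child_job, added as its follow-on, acts as a barrier so everything attached to it runs only after the prep has finished. A minimal sketch of the pattern (prepare_inputs and do_work are hypothetical stand-ins):

from toil.job import Job

def prepare_inputs(job):
    return 'prepared-input'

def do_work(job, prepared):
    return 'worked on ' + prepared

def run_pipeline(job):
    # head_job collects the prep work; child_job only starts once all of it has finished.
    head_job = Job()
    job.addChild(head_job)
    child_job = Job()
    head_job.addFollowOn(child_job)

    prep = head_job.addChildJobFn(prepare_inputs)
    # Safe to consume prep.rv() here: the DAG guarantees prep runs before anything under child_job.
    return child_job.addChildJobFn(do_work, prep.rv()).rv()

# run_pipeline would itself be started with Job.wrapJobFn(run_pipeline).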
Example #10
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # A is a job with children and follow-ons, for example:
    A = Job()
    A.addChild(Job())
    A.addFollowOn(Job())

    # B is a job which needs to run after A and its successors
    B = Job()

    # The way to do this without encapsulation is to make a parent of A, Ap, and make B a follow-on of Ap.
    Ap = Job()
    Ap.addChild(A)
    Ap.addFollowOn(B)

    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as toil:
        print(toil.start(Ap))
Example #11
File: vg_call.py  Project: xchang1/toil-vg
def run_chunked_calling(job,
                        context,
                        graph_id,
                        graph_basename,
                        gam_id,
                        gam_basename,
                        batch_input=None,
                        snarls_id=None,
                        genotype_vcf_id=None,
                        genotype_tbi_id=None,
                        sample=None,
                        augment=False,
                        connected_component_chunking=False,
                        output_format=None,
                        min_augment_coverage=None,
                        expected_coverage=None,
                        min_mapq=None,
                        min_baseq=None,
                        ref_paths=[],
                        ref_path_chunking=True,
                        min_call_support=None,
                        vcf_offsets={},
                        gam_chunking=False):

    # simple way to keep follow-ons down the tree
    child_job = Job()
    job.addChild(child_job)

    out_vcf_name = remove_ext(graph_basename)
    if sample:
        out_vcf_name += '_' + sample

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if connected_component_chunking or ref_path_chunking:

            chunk_job = child_job.addChildJobFn(
                run_chunking,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                chunk_paths=ref_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id if gam_chunking else None,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)

            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_calling,
                context,
                graph_id=None,
                graph_basename=graph_basename,
                gam_id=gam_id,
                gam_basename=gam_basename,
                batch_input=batch_input,
                snarls_id=snarls_id,
                genotype_vcf_id=genotype_vcf_id,
                genotype_tbi_id=genotype_tbi_id,
                sample=sample,
                augment=augment,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                ref_paths=ref_paths,
                ref_path_chunking=ref_path_chunking,
                min_call_support=min_call_support,
                vcf_offsets=vcf_offsets,
                gam_chunking=gam_chunking)
            return recurse_job.rv()
        else:
            # convert if we're augmenting and not chunking
            if augment and os.path.splitext(
                    graph_basename)[1] != '.' + output_format:
                convert_job = child_job.addChildJobFn(
                    run_convert,
                    context,
                    graph_id=graph_id,
                    graph_basename=graph_basename,
                    output_format=output_format,
                    disk=context.config.calling_disk)
                graph_id = convert_job.rv()
                graph_basename = os.path.splitext(
                    graph_basename)[0] + '.' + output_format
                # todo: clean up
                next_job = Job()
                child_job.addFollowOn(next_job)
                child_job = next_job

            # phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the calling on each chunk
    assert batch_input

    call_results = []
    in_gam_id = gam_id
    in_gam_basename = gam_basename
    for chunk_name, chunk_results in list(batch_input.items()):
        calling_root_job = Job()
        child_job.addChild(calling_root_job)

        graph_id = chunk_results[0]
        graph_basename = chunk_results[1]
        if gam_chunking:
            gam_id = chunk_results[2]
            gam_basename = chunk_results[3]
        else:
            gam_id = in_gam_id
            gam_basename = in_gam_basename

        if augment:
            augment_job = calling_root_job.addChildJobFn(
                run_augmenting,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                gam_id=gam_id,
                gam_basename=gam_basename,
                augment_gam=True,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                to_outstore=True,
                cores=context.config.augment_cores,
                memory=context.config.augment_mem,
                disk=context.config.augment_disk)
            graph_id = augment_job.rv(0)
            graph_basename = os.path.splitext(graph_basename)[
                0] + '-aug' + os.path.splitext(graph_basename)[1]
            gam_id = augment_job.rv(1)
            gam_basename = os.path.splitext(
                gam_basename)[0] + '-aug' + os.path.splitext(gam_basename)[1]

        # When path chunking, we subset our reference paths down to the current path
        if ref_path_chunking:
            ref_path = [chunk_name]
        else:
            ref_path = ref_paths

        calling_job = calling_root_job.addFollowOnJobFn(
            run_calling,
            context,
            graph_id=graph_id,
            graph_basename=graph_basename,
            gam_id=gam_id,
            gam_basename=gam_basename,
            snarls_id=snarls_id,
            genotype_vcf_id=genotype_vcf_id,
            genotype_tbi_id=genotype_tbi_id,
            sample=sample,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            ref_paths=ref_path,
            min_call_support=min_call_support,
            vcf_offsets=vcf_offsets,
            to_outstore=False,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem,
            disk=context.config.calling_disk)

        call_results.append((chunk_name, calling_job.rv()))

    concat_job = child_job.addFollowOnJobFn(run_concat_vcfs,
                                            context,
                                            out_name=out_vcf_name,
                                            vcf_ids=None,
                                            tbi_ids=None,
                                            write_to_outstore=True,
                                            call_timers_lists=[],
                                            batch_data=call_results)

    return concat_job.rv()
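run_chunked_calling recurses by re-adding itself as a follow-on once the input has been chunked, and returns that follow-on's promise. A minimal sketch of the same split/recurse/combine pattern with a toy payload (chunked_sum and combine_sums are illustrative, not part of toil-vg):

from toil.job import Job

def chunked_sum(job, numbers, chunk_size=2):
    # Base case: small enough to handle directly.
    if len(numbers) <= chunk_size:
        return sum(numbers)
    # Recursive case: fan out over chunks, then combine the results in a follow-on.
    partial_promises = []
    for i in range(0, len(numbers), chunk_size):
        chunk_job = job.addChildJobFn(chunked_sum, numbers[i:i + chunk_size], chunk_size)
        partial_promises.append(chunk_job.rv())
    combine_job = job.addFollowOnJobFn(combine_sums, partial_promises)
    return combine_job.rv()

def combine_sums(job, partials):
    return sum(partials)

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions('./chunkedSumStore')
    options.clean = 'always'
    total = Job.Runner.startToil(Job.wrapJobFn(chunked_sum, list(range(10))), options)
    print(total)  # expected to print 45 once the whole recursion has run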
Example #12
from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # A
    A = Job()
    A.addChild(Job())
    A.addFollowOn(Job())

    # Encapsulate A
    A = A.encapsulate()

    # B is a job which needs to run after A and its successors
    B = Job()

    # With encapsulation A and its successor subgraph appear to be a single job, hence:
    A.addChild(B)

    options = Job.Runner.getDefaultOptions("./toilWorkflowRun")
    options.logLevel = "INFO"
    options.clean = "always"

    with Toil(options) as toil:
        print(toil.start(A))