Example #1
def graphmap_join_workflow(job, options, config, vg_ids, hal_ids):

    root_job = Job()
    job.addChild(root_job)

    # run clip-vg on each input
    clipped_vg_ids = []
    for vg_path, vg_id in zip(options.vg, vg_ids):
        clip_job = root_job.addChildJobFn(clip_vg,
                                          options,
                                          config,
                                          vg_path,
                                          vg_id,
                                          disk=vg_id.size * 2,
                                          memory=vg_id.size * 4)
        clipped_vg_ids.append(clip_job.rv())

    # join the ids
    join_job = root_job.addFollowOnJobFn(join_vg,
                                         options,
                                         config,
                                         clipped_vg_ids,
                                         disk=sum([f.size for f in vg_ids]))
    clipped_vg_ids = join_job.rv()

    # make a gfa for each
    gfa_root_job = Job()
    join_job.addFollowOn(gfa_root_job)
    clipped_gfa_ids = []
    for i in range(len(options.vg)):
        vg_path = options.vg[i]
        clipped_id = join_job.rv(i)
        vg_id = vg_ids[i]
        gfa_job = gfa_root_job.addChildJobFn(vg_to_gfa,
                                             options,
                                             config,
                                             vg_path,
                                             clipped_id,
                                             disk=vg_id.size * 5)
        clipped_gfa_ids.append(gfa_job.rv())

    # merge up the gfas and make the various vg indexes
    gfa_merge_job = gfa_root_job.addFollowOnJobFn(
        vg_indexes,
        options,
        config,
        clipped_gfa_ids,
        cores=options.indexCores,
        disk=sum(f.size for f in vg_ids) * 5)

    if hal_ids:
        merge_hal_id = job.addChildJobFn(merge_hal,
                                         options,
                                         hal_ids,
                                         disk=sum(f.size
                                                  for f in hal_ids) * 2).rv()
    else:
        merge_hal_id = None

    return clipped_vg_ids, gfa_merge_job.rv(), merge_hal_id
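The pattern above (a placeholder root Job, children added with addChildJobFn, and a follow-on consuming the children's rv() promises) is the backbone of all of these examples. A minimal, self-contained sketch of that pattern, using hypothetical job functions and a throwaway job store:

from toil.common import Toil
from toil.job import Job

def double(job, x):
    # child job: its return value is only known at run time
    return x * 2

def add_one(job, value):
    # follow-on job: receives the child's resolved promise
    return value + 1

def root(job, x):
    child = job.addChildJobFn(double, x)
    # child.rv() is a promise; it resolves to 2 * x before add_one runs
    return job.addFollowOnJobFn(add_one, child.rv()).rv()

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    options.logLevel = "INFO"
    options.clean = "always"
    with Toil(options) as workflow:
        print(workflow.start(Job.wrapJobFn(root, 10)))  # prints 21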
Example #2
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """ take the output table from gather_fas, pull out the ambiguous sequences, remap them to the reference, and 
    add them to the events where possible"""

    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None
    # todo: also skip if no ambiguous sequences
    
    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if ref_path.endswith('.gz'):
        mm_mem *= 4
    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id, disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)
    
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    if amb_name not in output_id_map:
        logger.info("Skipping minmap2 fallback as no ambigious sequences found")
        return None, None

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id, seqIDMap[event][0],
                                                disk=ref_id.size * 3, memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (seqIDMap[event][0], fa_id)

    return paf_ids, ambiguous_seq_id_map
Example #3
def run_all_bam_caller(job, context, fasta_file_id, bam_file_id, bam_idx_id,
                       sample_name, chroms, offsets, out_name, bam_caller,
                       bam_caller_opts = []):
    """
    run freebayes or platypus on a set of chromosomal regions.  this is done by sending each region to a
    child job and farming the entire input off to each (i.e. not splitting the input)
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    fb_vcf_ids = []
    fb_tbi_ids = []
    fb_timers = []
    assert chroms
    if not offsets:
        offsets = [None] * len(chroms)
    for chrom, offset in zip(chroms, offsets):
        fb_job = child_job.addChildJobFn(run_bam_caller, context, fasta_file_id, bam_file_id, bam_idx_id,
                                         sample_name, chrom, offset, out_name, bam_caller, bam_caller_opts,
                                         memory=context.config.calling_mem,
                                         disk=context.config.calling_disk)
        fb_vcf_ids.append(fb_job.rv(0))
        fb_tbi_ids.append(fb_job.rv(1))
        fb_timers.append([fb_job.rv(2)])

    merge_vcf_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, fb_vcf_ids, fb_tbi_ids,
                                               write_to_outstore = True, call_timers_lists = fb_timers)
    return merge_vcf_job.rv()
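run_bam_caller above returns a (vcf_id, tbi_id, timer) tuple, and the caller peels the members apart with indexed promises (rv(0), rv(1), rv(2)) before the follow-on runs. A stripped-down sketch of that indexing, with hypothetical names:

from toil.job import Job

def call_region(job, region):
    # hypothetical stand-in for run_bam_caller: returns a 3-tuple
    return "%s.vcf.gz" % region, "%s.vcf.gz.tbi" % region, 1.5

def merge(job, vcf_ids, tbi_ids, timers):
    return list(zip(vcf_ids, tbi_ids)), sum(timers)

def call_all(job, regions):
    vcf_ids, tbi_ids, timers = [], [], []
    for region in regions:
        worker = job.addChildJobFn(call_region, region)
        vcf_ids.append(worker.rv(0))   # indexed promise into the returned tuple
        tbi_ids.append(worker.rv(1))
        timers.append(worker.rv(2))
    # lists of promises are resolved when passed as arguments to another job
    return job.addFollowOnJobFn(merge, vcf_ids, tbi_ids, timers).rv()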
Example #4
        def testJobConcurrency(self):
            """
            Tests that the batch system is allocating core resources properly for concurrent tasks.
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                options = self.getOptions(tempDir)

                counterPath = os.path.join(tempDir, 'counter')
                resetCounters(counterPath)
                value, maxValue = getCounters(counterPath)
                assert (value, maxValue) == (0, 0)

                root = Job()
                for _ in range(self.cpuCount):
                    root.addFollowOn(
                        Job.wrapFn(measureConcurrency,
                                   counterPath,
                                   self.sleepTime,
                                   cores=coresPerJob,
                                   memory='1M',
                                   disk='1Mi'))
                Job.Runner.startToil(root, options)
                _, maxValue = getCounters(counterPath)
                self.assertEqual(maxValue, old_div(self.cpuCount, coresPerJob))
Example #5
        def testPromisedRequirementStatic(self):
            """
            Asserts that promised core resources are allocated properly using a static DAG
            """
            for coresPerJob in self.allocatedCores:
                tempDir = self._createTempDir('testFiles')
                counterPath = self.getCounterPath(tempDir)

                root = Job()
                one = Job.wrapFn(getOne, cores=0.1, memory='32M', disk='1M')
                thirtyTwoMb = Job.wrapFn(getThirtyTwoMb,
                                         cores=0.1,
                                         memory='32M',
                                         disk='1M')
                root.addChild(one)
                root.addChild(thirtyTwoMb)
                for _ in range(self.cpuCount):
                    root.addFollowOn(
                        Job.wrapFn(batchSystemTest.measureConcurrency,
                                   counterPath,
                                   cores=PromisedRequirement(
                                       lambda x: x * coresPerJob, one.rv()),
                                   memory=PromisedRequirement(
                                       thirtyTwoMb.rv()),
                                   disk='1M'))
                Job.Runner.startToil(root, self.getOptions(tempDir))
                _, maxValue = batchSystemTest.getCounters(counterPath)
                self.assertEqual(maxValue, self.cpuCount / coresPerJob)
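PromisedRequirement lets a resource requirement depend on a value that another job computes at run time: the first argument is either a promise or a function applied to the promised arguments that follow. A minimal sketch, with a hypothetical sizing job:

from toil.job import Job, PromisedRequirement

def estimate_bytes(job):
    # hypothetical: pretend we inspected the input and measured 2 GiB
    return 2 * 1024**3

def heavy_step(job):
    return "done"

def root(job):
    sizer = job.addChildJobFn(estimate_bytes)
    # disk for the follow-on is 3x whatever estimate_bytes returns
    return job.addFollowOnJobFn(
        heavy_step,
        disk=PromisedRequirement(lambda n: n * 3, sizer.rv()),
        memory=PromisedRequirement(sizer.rv())).rv()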
Example #6
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_ids = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(
            event, fa_id, gfa_id))
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one,
                                                  config,
                                                  event,
                                                  fa_id,
                                                  gfa_id,
                                                  cores=1,
                                                  disk=5 *
                                                  (fa_id.size + gfa_id.size))
        gaf_ids.append(minigraph_map_job.rv())

    # convert to paf
    paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_ids)

    return paf_job.rv()
Example #7
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved,
                   xg_file_id, paths):
    """ split the fastq, then surject each chunk.  returns outputgams, paired with total surject time
    (excluding toil-vg overhead such as transferring and splitting files )"""

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if not context.config.single_reads_chunk:
        reads_chunk_ids = child_job.addChildJobFn(
            run_split_reads,
            context,
            None,
            'aln.gam',
            None, [gam_input_reads_id],
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in [gam_input_reads_id]]

    return child_job.addFollowOnJobFn(run_whole_surject,
                                      context,
                                      reads_chunk_ids,
                                      output_name,
                                      interleaved,
                                      xg_file_id,
                                      paths,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
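The "to encapsulate everything under this job" idiom above appears throughout these examples: an empty Job() is added as a child, dynamic work hangs off it, and the final follow-on is returned so the caller sees a single promise for the whole sub-workflow. A bare-bones sketch of why this works (the follow-on only runs after every descendant of child_job has finished), using hypothetical job functions:

from toil.job import Job

def do_piece(job, name):
    return name.upper()

def collect(job, pieces):
    return pieces

def sub_workflow(job, names):
    # structural root: children and the follow-on are hung off this empty Job
    child_job = Job()
    job.addChild(child_job)
    piece_promises = [child_job.addChildJobFn(do_piece, n).rv() for n in names]
    # runs only after every child of child_job (and their successors) is done
    return child_job.addFollowOnJobFn(collect, piece_promises).rv()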
Example #8
 def testNewJobsCanHandleOtherJobDeaths(self):
     """
     Create 2 non-local files and then create 2 jobs. The first job registers a deferred function
     to delete the second non-local file, deletes the first non-local file, and then kills
     itself.  The second job waits for the first file to be deleted, then sleeps for a few
     seconds and spawns a child. The child of the second job does nothing. However, starting
     it should handle the untimely demise of the first job and run the registered deferred
     function that deletes the first file.  We assert the absence of the two files at the
     end of the run.
     """
     # There can be no retries
     self.options.retryCount = 0
     workdir = self._createTempDir(purpose='nonLocalDir')
     nonLocalFile1 = os.path.join(workdir, str(uuid4()))
     nonLocalFile2 = os.path.join(workdir, str(uuid4()))
     open(nonLocalFile1, 'w').close()
     open(nonLocalFile2, 'w').close()
     assert os.path.exists(nonLocalFile1)
     assert os.path.exists(nonLocalFile2)
     files = [nonLocalFile1, nonLocalFile2]
     root = Job()
     A = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_A, files=files)
     B = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_B, files=files)
     C = Job.wrapJobFn(_testNewJobsCanHandleOtherJobDeaths_C,
                       files=files,
                       expectedResult=False)
     root.addChild(A)
     root.addChild(B)
     B.addChild(C)
     try:
         Job.Runner.startToil(root, self.options)
     except FailedJobsException as e:
         pass
Example #9
    def testConcurrencyWithDisk(self):
        """
        Tests that the batch system is allocating disk resources properly
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        from toil import physicalDisk
        availableDisk = physicalDisk('', toilWorkflowDir=options.workDir)
        options.batchSystem = self.batchSystemName

        counterPath = os.path.join(tempDir, 'counter')
        resetCounters(counterPath)
        value, maxValue = getCounters(counterPath)
        assert (value, maxValue) == (0, 0)

        root = Job()
        # Physically, we're asking for 50% of disk and 50% of disk + 500 bytes in the two jobs. The
        # batch system should not allow the 2 child jobs to run concurrently.
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                    memory='1M', disk=old_div(availableDisk,2)))
        root.addChild(Job.wrapFn(measureConcurrency, counterPath, self.sleepTime, cores=1,
                                 memory='1M', disk=(old_div(availableDisk, 2)) + 500))
        Job.Runner.startToil(root, options)
        _, maxValue = getCounters(counterPath)
        self.assertEqual(maxValue, 1)
Example #10
File: jobServiceTest.py  Project: mr-c/toil
 def makeWorkflow():
     job = Job()
     r1 = job.addService(ToySerializableService("woot1"))
     r2 = job.addService(ToySerializableService("woot2"))
     r3 = job.addService(ToySerializableService("woot3"))
     job.addChildFn(fnTest, [r1, r2, r3], outFile)
     return job
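ToySerializableService comes from Toil's test suite; a service is any Job.Service subclass, and whatever its start() returns is what addService() promises back to the host job. A hedged sketch of what such a service might look like (hypothetical class, not the real ToySerializableService):

from toil.job import Job

class EchoService(Job.Service):
    # hypothetical stand-in for ToySerializableService
    def __init__(self, message):
        super().__init__()
        self.message = message

    def start(self, job):
        # whatever start() returns is what addService() promises to the host job
        return self.message

    def check(self):
        return True

    def stop(self, job):
        pass

def write_values(values, out_file):
    with open(out_file, 'w') as f:
        f.write(' '.join(values))

def make_workflow(out_file):
    job = Job()
    r1 = job.addService(EchoService("woot1"))
    r2 = job.addService(EchoService("woot2"))
    # addChildFn wraps a plain function; the promises resolve before it runs
    job.addChildFn(write_values, [r1, r2], out_file)
    return job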
Example #11
 def addJob(self, jobShape, preemptable=False):
     """
     Add a job to the job queue
     """
     self.totalJobs += 1
     jobID = uuid.uuid4()
     self.jobBatchSystemIDToIssuedJob[jobID] = Job(memory=jobShape.memory,
                                                       cores=jobShape.cores, disk=jobShape.disk, preemptable=preemptable)
     self.jobQueue.put(jobID)
Example #12
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "cpu",
                                 typeFn=int,
                                 default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter",
                                           float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, (fa_path, fa_id) in fa_id_map.items():
        minigraph_map_job = top_job.addChildJobFn(
            minigraph_map_one,
            config,
            event,
            fa_path,
            fa_id,
            gfa_id,
            keep_gaf or not paf_per_genome,
            paf_per_genome,
            # todo: estimate RAM
            cores=mg_cores,
            disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config,
                                           gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
Example #13
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path, ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size
    
    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()
        
    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, [paf_id], ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv(0))

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(0), split_fas_job.rv())

    # try splitting the ambiguous sequences using minimap2, which is more sensitive in some cases
    remap_job = gather_fas_job.addFollowOnJobFn(split_minimap_fallback, options, config, seqIDMap, gather_fas_job.rv())

    # partition these into fasta files
    split_fallback_gfa_job = remap_job.addFollowOnJobFn(split_gfa, config, None, remap_job.rv(0), ref_contigs,
                                                        other_contig, options.reference, None,
                                                        disk=(gfa_size + paf_size) * 5)

    # use the output of the above to split the ambiguous fastas
    split_fallback_fas_job = split_fallback_gfa_job.addFollowOnJobFn(split_fas, remap_job.rv(1), split_fallback_gfa_job.rv(0))

    # gather the fallback contigs into a table
    gather_fallback_fas_job = split_fallback_fas_job.addFollowOnJobFn(gather_fas, remap_job.rv(1), split_fallback_gfa_job.rv(0),
                                                                      split_fallback_fas_job.rv())

    # combine the split sequences with the split ambiguous sequences
    combine_split_job = gather_fallback_fas_job.addFollowOnJobFn(combine_splits, options, config, seqIDMap, gather_fas_job.rv(),
                                                                 gather_fallback_fas_job.rv())

    # return all the files, as well as the 2 split logs
    return (seqIDMap, combine_split_job.rv(), split_gfa_job.rv(1), split_fallback_gfa_job.rv(1))
Example #14
def run_analysis(job, context, ped_file_id, cohort_vcf_id, maternal_bam_id,
                 maternal_bai_id, paternal_bam_id, paternal_bai_id,
                 sibling_bam_ids, sibling_bai_ids, sample_name, maternal_name,
                 paternal_name, sibling_names, sibling_genders,
                 sibling_affected, bypass, cadd_lines, chrom_dir, edit_dir,
                 split_lines, genome_build, cadd_data_dir):
    """ run vcf to shebang varsifter file conversion, then do cadd scoring and annotation, finally run the blackmagiktoolbox workflow.
        returns final candidate varsifter file, paired with total surject time
    (excluding toil-vg overhead such as transferring and splitting files )"""

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    mosaicism_detecting_job = child_job.addChildJobFn(
        run_detect_mosaicism,
        context,
        ped_file_id,
        cohort_vcf_id,
        sample_name,
        cores=context.config.misc_cores,
        memory=context.config.alignment_mem,
        disk=context.config.misc_disk)

    vcf_to_shebang_job = child_job.addChildJobFn(
        run_vcftoshebang,
        context,
        cohort_vcf_id,
        maternal_bam_id,
        maternal_bai_id,
        paternal_bam_id,
        paternal_bai_id,
        sibling_bam_ids,
        sibling_bai_ids,
        sample_name,
        maternal_name,
        paternal_name,
        sibling_names,
        sibling_genders,
        sibling_affected,
        bypass,
        cadd_lines,
        chrom_dir,
        edit_dir,
        split_lines,
        genome_build,
        cadd_data_dir,
        cores=context.config.alignment_cores,
        memory=context.config.alignment_mem,
        disk=context.config.alignment_disk)

    return vcf_to_shebang_job.rv(), mosaicism_detecting_job.rv()
Example #15
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path,
                            paf_id, paf_path, ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz,
                                        gfa_path,
                                        gfa_id,
                                        disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz,
                                        paf_path,
                                        paf_id,
                                        disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap,
                                             options.maskFilter).rv()

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa,
                                              config,
                                              gfa_id,
                                              paf_id,
                                              ref_contigs,
                                              other_contig,
                                              options.reference,
                                              mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap,
                                                   split_gfa_job.rv())

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap,
                                                    split_gfa_job.rv(),
                                                    split_fas_job.rv())

    # return all the files
    return gather_fas_job.rv()
Example #16
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, checkpointInfo, doRenaming, pafInput, pafSecondaries, doVG, doGFA, delay=0, eventNameAsID=False):
    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs at the same time
    time.sleep(delay)
    
    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz, cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1) if pafSecondaries else None

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project, doRenaming, eventNameAsID,
                                        #todo disk=
    )
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()
    
    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(exportHal, prepare_hal_export_job.rv(0), event=prepare_hal_export_job.rv(1),
                                                             checkpointInfo=checkpointInfo,
                                                             memory=configWrapper.getDefaultMemory(),
                                                             disk=configWrapper.getExportHalDisk(),
                                                             preemptable=False)

    # optionally create the VG
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(export_vg, hal_export_job.rv(), configWrapper, doVG, doGFA,
                                                        checkpointInfo=checkpointInfo)
        vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1)
    else:
        vg_file_id, gfa_file_id = None, None
        
    return hal_export_job.rv(), vg_file_id, gfa_file_id
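run_cactus_align above also shows the chained follow-on pattern: cur_job is repeatedly reassigned, and each stage consumes the previous stage's rv() as its input, so the stages run strictly in sequence. A tiny sketch of that chaining, with hypothetical stages:

from toil.job import Job

def stage(job, data, label):
    # hypothetical pipeline stage: transform the (resolved) input and pass it on
    return data + [label]

def pipeline(job, data):
    cur_job = job.addChildJobFn(stage, data, "rename")
    # each follow-on takes the previous stage's promise as its argument
    cur_job = cur_job.addFollowOnJobFn(stage, cur_job.rv(), "ingroup_coverage")
    cur_job = cur_job.addFollowOnJobFn(stage, cur_job.rv(), "setup")
    return cur_job.rv()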
Example #17
 def testServiceSerialization(self):
     """
     Tests that a service can receive a promise without producing a serialization
     error.
     """
     job = Job()
     service = ToySerializableService("woot")
     startValue = job.addService(service) # Add a first service to job
     subService = ToySerializableService(startValue) # Now create a child of 
     # that service that takes the start value promise from the parent service
     job.addService(subService, parentService=service) # This should work if
     # serialization on services is working correctly.
     
     self.runToil(job)
Example #18
    def testConcurrencyWithDisk(self):
        """
        Tests that the batch system is allocating disk resources properly
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        from toil import physicalDisk
        availableDisk = physicalDisk(options.workDir)
        logger.info('Testing disk concurrency limits with %s disk space',
                    availableDisk)
        # More disk might become available by the time Toil starts, so we limit it here
        options.maxDisk = availableDisk
        options.batchSystem = self.batchSystemName

        counterPath = os.path.join(tempDir, 'counter')
        resetCounters(counterPath)
        value, maxValue = getCounters(counterPath)
        assert (value, maxValue) == (0, 0)

        half_disk = availableDisk // 2
        more_than_half_disk = half_disk + 500
        logger.info('Dividing into parts of %s and %s', half_disk,
                    more_than_half_disk)

        root = Job()
        # Physically, we're asking for 50% of disk and 50% of disk + 500 bytes in the two jobs. The
        # batch system should not allow the 2 child jobs to run concurrently.
        root.addChild(
            Job.wrapFn(measureConcurrency,
                       counterPath,
                       self.sleepTime,
                       cores=1,
                       memory='1M',
                       disk=half_disk))
        root.addChild(
            Job.wrapFn(measureConcurrency,
                       counterPath,
                       self.sleepTime,
                       cores=1,
                       memory='1M',
                       disk=more_than_half_disk))
        Job.Runner.startToil(root, options)
        _, maxValue = getCounters(counterPath)

        logger.info('After run: %s disk space', physicalDisk(options.workDir))

        self.assertEqual(maxValue, 1)
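measureConcurrency, resetCounters and getCounters are helpers from Toil's batchSystemTest module; conceptually they keep a "<current>,<max>" pair in the counter file, so the assertion on maxValue reflects how many jobs ever ran at once. A rough, hypothetical illustration of that bookkeeping (not the real helpers):

import fcntl
import time

def reset_counters(path):
    with open(path, 'w') as f:
        f.write('0,0')

def get_counters(path):
    with open(path) as f:
        current, peak = (int(x) for x in f.read().split(','))
    return current, peak

def measure_concurrency(path, sleep_time=1):
    def bump(delta):
        # lock the file so concurrent jobs update the pair atomically
        with open(path, 'r+') as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            current, peak = (int(x) for x in f.read().split(','))
            current += delta
            peak = max(peak, current)
            f.seek(0)
            f.truncate()
            f.write('%d,%d' % (current, peak))
    bump(+1)          # this job is now running
    time.sleep(sleep_time)
    bump(-1)          # done; the recorded peak is what getCounters reports
    return get_counters(path)[1]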
Example #19
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved,
                      xg_file_id, paths):
    """
    Surject all gam chunks in parallel.
    
    surject all the GAM file IDs in reads_chunk_ids, saving the merged BAM as output_name.
    
    If interleaved is true, expects paired-interleaved GAM input and writes paired BAM output.
    
    Surjects against the given collection of paths in the given XG file.
    
    """

    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # this will be a list of lists.
    # bam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(
            run_chunk_surject,
            context,
            interleaved,
            xg_file_id,
            paths,
            chunk_filename_ids,
            '{}_chunk{}'.format(output_name, chunk_id),
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    return child_job.addFollowOnJobFn(run_merge_bams,
                                      context,
                                      output_name,
                                      bam_chunk_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
Example #20
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project,
                     doRenaming, pafInput):
    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(
            paf_to_lastz.paf_to_lastz, cactusWorkflowArguments.alignmentsID,
            True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids,
                                        cactusWorkflowArguments, project,
                                        doRenaming
                                        #todo disk=
                                        )
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage,
                                           cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase,
                                         cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(
        run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(
        exportHal,
        prepare_hal_export_job.rv(0),
        event=prepare_hal_export_job.rv(1),
        memory=configWrapper.getDefaultMemory(),
        disk=configWrapper.getExportHalDisk(),
        preemptable=False)
    return hal_export_job.rv()
Example #21
        def createJobGraph(memory, cores, disk, preemptable, checkpoint):
            """Create a fake-ish Job and JobGraph pair, and return the
            jobGraph."""
            name = 'jobGraph%d' % self.jobGraphNumber
            self.jobGraphNumber += 1

            job = Job()
            job.checkpoint = checkpoint
            with self.jobStore.writeFileStream() as (f, fileStoreID):
                pickle.dump(job, f, pickle.HIGHEST_PROTOCOL)
            command = '_toil %s fooCommand toil True' % fileStoreID
            jobGraph = JobGraph(command=command, memory=memory, cores=cores,
                                disk=disk, unitName=name,
                                jobName=name, preemptable=preemptable,
                                jobStoreID=name, remainingRetryCount=1,
                                predecessorNumber=1)
            return self.jobStore.create(jobGraph)
Example #22
def split_fas(job, seq_id_map, split_id_map):
    """ Use samtools to split a bunch of fasta files into reference contigs, using the output of rgfa-split as a guide"""

    root_job = Job()
    job.addChild(root_job)

    # map event name to dict of contigs, e.g. fa_contigs["CHM13"]["chr13"] = file_id
    fa_contigs = {}
    # we do each fasta in parallel
    for event in seq_id_map.keys():
        fa_path, fa_id = seq_id_map[event]
        fa_contigs[event] = root_job.addChildJobFn(split_fa_into_contigs,
                                                   event,
                                                   fa_id,
                                                   fa_path,
                                                   split_id_map,
                                                   disk=fa_id.size * 3).rv()

    return fa_contigs
Example #23
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined by minigraph,
    the second from remapping the ambigious contigs with minimap2 """

    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map or len(remap_id_map) == 0:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference contig is ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(
                combine_ref_contig_splits,
                original_id_map[ref_contig],
                remap_id_map[ref_contig],
                disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map,
                                     original_id_map, remap_id_map, amb_name,
                                     graph_event).rv()
Example #24
def preprocess_input_sequences(job,
                               configWrapper,
                               project,
                               cactusWorkflowArguments,
                               pafMaskFilter=None,
                               referenceEvent=None):
    """ update the workflow arguments in place with unzipped version of any input fastas whose paths 
    end in .gz, 
    if there's a pafMaskFilter, softmasked regions are extracted from each sequence into a bed.
    Note that the beds will need unique ids prepended just like the fastas...
    """
    head_job = Job()
    job.addChild(head_job)
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g))
                              for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]
    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(
                preprocess_input_sequence, g, seqID,
                project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id, mask_bed_id = prepend_id_job.rv(
                0), prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(
        resolve_id_promises, events, updated_seq_ids,
        cactusWorkflowArguments).rv(), mask_bed_ids
Example #25
    def testNestedResourcesDoNotBlock(self):
        """
        Resources are requested in the order Memory > Cpu > Disk.
        Test that unavailability of cpus for one job that is scheduled does not block another job
        that can run.
        """
        tempDir = self._createTempDir('testFiles')

        options = Job.Runner.getDefaultOptions(self._getTestJobStorePath())
        options.workDir = tempDir
        options.maxCores = 4
        from toil import physicalMemory
        availableMemory = physicalMemory()
        options.batchSystem = self.batchSystemName

        outFile = os.path.join(tempDir, 'counter')
        open(outFile, 'w').close()

        root = Job()

        blocker = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=30, writeVal='b',
                             cores=2, memory='1M', disk='1M')
        firstJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5, writeVal='fJ',
                              cores=1, memory='1M', disk='1M')
        secondJob = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=10,
                               writeVal='sJ', cores=1, memory='1M', disk='1M')

        # Should block off 50% of memory while waiting for its 3 cores
        firstJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=0,
                                   writeVal='fJC', cores=3, memory=int(old_div(availableMemory,2)), disk='1M')

        # These two shouldn't be able to run before B because there should be only
        # (50% of memory - 1M) available (firstJobChild should be blocking 50%)
        secondJobChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                    writeVal='sJC', cores=2, memory=int(old_div(availableMemory,1.5)),
                                    disk='1M')
        secondJobGrandChild = Job.wrapFn(_resourceBlockTestAuxFn, outFile=outFile, sleepTime=5,
                                         writeVal='sJGC', cores=2, memory=int(old_div(availableMemory,1.5)),
                                         disk='1M')

        root.addChild(blocker)
        root.addChild(firstJob)
        root.addChild(secondJob)

        firstJob.addChild(firstJobChild)
        secondJob.addChild(secondJobChild)

        secondJobChild.addChild(secondJobGrandChild)
        """
        The tree is:
                    root
                  /   |   \
                 b    fJ   sJ
                      |    |
                      fJC  sJC
                           |
                           sJGC
        But the order of execution should be
        root > b, fJ, sJ > sJC > sJGC > fJC
        since fJC cannot run till b finishes but sJC and sJGC can (fJC is blocked on cores). If the
        resource acquisition is written properly, then fJC which is scheduled before sJC and sJGC
        should not block them, and should only run after they finish.
        """
        Job.Runner.startToil(root, options)
        with open(outFile) as oFH:
            outString = oFH.read()
        # The ordering of b, fJ and sJ is non-deterministic since they are scheduled at the same
        # time. We look for all possible permutations.
        possibleStarts = tuple([''.join(x) for x in itertools.permutations(['b', 'fJ', 'sJ'])])
        assert outString.startswith(possibleStarts)
        assert outString.endswith('sJCsJGCfJC')
Example #26
def run_chunked_augmenting(job,
                           context,
                           graph_id,
                           graph_basename,
                           gam_id,
                           gam_basename,
                           batch_input=None,
                           all_path_components=False,
                           chunk_paths=[],
                           connected_component_chunking=False,
                           output_format=None,
                           augment_gam=False,
                           min_augment_coverage=None,
                           expected_coverage=None,
                           min_mapq=None,
                           min_baseq=None,
                           to_outstore=False):
    """
    Run a chunking job (if desired), then augment the results
    """

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if all_path_components or connected_component_chunking or len(
                chunk_paths) > 1:
            child_job = Job()
            job.addChild(child_job)
            chunk_job = child_job.addChildJobFn(
                run_chunking,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_augmenting,
                context,
                graph_id=None,
                graph_basename=None,
                gam_id=None,
                gam_basename=None,
                batch_input=batch_input,
                all_path_components=all_path_components,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                augment_gam=augment_gam,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                to_outstore=to_outstore)
            return recurse_job.rv()
        else:
            # phony up chunk output for a single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the augmenting on each chunk
    assert batch_input

    augment_results = []
    for chunk_name, chunk_results in list(batch_input.items()):
        augment_job = job.addChildJobFn(
            run_augmenting,
            context,
            graph_id=chunk_results[0],
            graph_basename=chunk_results[1],
            gam_id=chunk_results[2],
            gam_basename=chunk_results[3],
            augment_gam=augment_gam,
            min_augment_coverage=min_augment_coverage,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            min_baseq=min_baseq,
            to_outstore=to_outstore,
            cores=context.config.augment_cores,
            memory=context.config.augment_mem,
            disk=context.config.augment_disk)

        augment_results.append((chunk_name, augment_job.rv()))

    return augment_results
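run_chunked_augmenting is also an example of a job function that recurses on itself: the first pass schedules the chunking, then adds a follow-on invocation of the same function with batch_input filled in. A minimal sketch of that self-recursive pattern, with hypothetical work:

from toil.job import Job

def total(values):
    return sum(values)

def process_in_batches(job, items, batch_input=None):
    if batch_input is None:
        # first pass: split the input, then recurse via a follow-on of the same function
        chunks = [items[i:i + 2] for i in range(0, len(items), 2)]
        return job.addFollowOnJobFn(process_in_batches, items,
                                    batch_input=chunks).rv()
    # second pass: fan out one child per chunk
    return [job.addChildFn(total, chunk).rv() for chunk in batch_input]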
Example #27
def run_all_calling2(job, context, xg_file_id, chr_gam_ids, chr_gam_idx_ids, chroms, path_sizes, vcf_offsets, sample_name,
                     genotype=False, out_name=None, recall=False, alt_gam_id=None, alt_gai_id=None,
                     genotype_vcf_id=None, genotype_tbi_id=None, id_ranges_id=None, snarls_id=None, pack_support=False,
                     old_call=False):
    """
    Call all the chromosomes and return a merged up vcf/tbi pair
    """
    # we make a child job so that all calling is encapsulated in a top-level job
    child_job = Job()
    job.addChild(child_job)
    vcf_ids = []
    tbi_ids = []
    call_timers_lists = []
    assert len(chr_gam_ids) > 0
    if not chr_gam_idx_ids:
        chr_gam_idx_ids = [None] * len(chr_gam_ids)
    if not chroms:
        chroms = [name for name in path_sizes.keys() if path_sizes[name] > 0]
    assert len(chr_gam_ids) == len(chr_gam_idx_ids)
    # id ranges deactivates path chunking
    if id_ranges_id:
        context.config.call_chunk_size = (2 << 30) - 1
        context.config.overlap = 0
    for i in range(len(chr_gam_ids)):
        alignment_file_id = chr_gam_ids[i]
        alignment_index_id = chr_gam_idx_ids[i]
        if len(chr_gam_ids) > 1:
            # 1 gam per chromosome
            chr_label = [chroms[i]]
            chr_offset = [vcf_offsets[i]] if vcf_offsets else [0]
        else:
            # single gam with one or more chromosomes
            chr_label = chroms
            chr_offset = vcf_offsets if vcf_offsets else [0] * len(chroms)
        chunk_job = child_job.addChildJobFn(run_chunking, context, xg_file_id,
                                            alignment_file_id, alignment_index_id, chr_label, chr_offset, path_sizes,
                                            sample_name, genotype=genotype, recall=recall,
                                            alt_gam_id=alt_gam_id, alt_gai_id=alt_gai_id,
                                            genotype_vcf_id=genotype_vcf_id,
                                            genotype_tbi_id=genotype_tbi_id,
                                            id_ranges_id=id_ranges_id,
                                            cores=context.config.call_chunk_cores,
                                            memory=context.config.call_chunk_mem,
                                            disk=context.config.call_chunk_disk)
        call_job = chunk_job.addFollowOnJobFn(run_chunked_calling, context, chunk_job.rv(0),
                                              genotype, recall, snarls_id, pack_support, old_call, chunk_job.rv(1),
                                              cores=context.config.misc_cores,
                                              memory=context.config.misc_mem,
                                              disk=context.config.misc_disk)
        vcf_ids.append(call_job.rv(0))
        tbi_ids.append(call_job.rv(1))
        call_timers_lists.append(call_job.rv(2))
        
    if not out_name:
        out_name = sample_name
    return child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, vcf_ids, tbi_ids,
                                      write_to_outstore = True,
                                      call_timers_lists = call_timers_lists,
                                      cores=context.config.call_chunk_cores,
                                      memory=context.config.call_chunk_mem,
                                      disk=context.config.call_chunk_disk).rv()
Example #28
def run_whole_alignment(job, context, fastq, gam_input_reads, bam_input_reads, sample_name, interleaved, mapper,
                        indexes, reads_chunk_ids,
                        bam_output=False, surject=False, gbwt_penalty=None, validate=False, fasta_dict_id=None):
    """
    align all fastq chunks in parallel
    
    Takes a dict from index type to index file ID. Some indexes are extra and
    specifying them will change mapping behavior.
    
    Returns a list of per-contig GAMs, the total alignment runtime, and a list
    of per-contig BAM file IDs (which is only nonempty when surject is true).
    
    """
    
    # this will be a list of lists.
    # gam_chunk_file_ids[i][j], will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    gam_chunk_file_ids = []
    gam_chunk_running_times = []
    # depending on bam_output and surject options, we can make bam_output too
    bam_chunk_file_ids = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph alignment on each fastq chunk
        chunk_alignment_job = child_job.addChildJobFn(run_chunk_alignment, context, gam_input_reads, bam_input_reads,
                                                      sample_name,
                                                      interleaved, mapper, chunk_filename_ids, chunk_id,
                                                      indexes,
                                                      bam_output=bam_output,
                                                      gbwt_penalty=gbwt_penalty,
                                                      validate=validate,
                                                      fasta_dict_id=fasta_dict_id,
                                                      cores=context.config.alignment_cores, memory=context.config.alignment_mem,
                                                      disk=context.config.alignment_disk)
        if not bam_output:
            gam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        else:
            bam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))


    if not bam_output:
        merge_gams_job = child_job.addFollowOnJobFn(run_merge_gams, context, sample_name, indexes.get('id_ranges'), gam_chunk_file_ids,
                                                    gam_chunk_running_times,
                                                    cores=context.config.misc_cores,
                                                    memory=context.config.misc_mem, disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []
    else:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(run_merge_bams, context, sample_name, bam_chunk_file_ids,
                                                        cores=context.config.misc_cores,
                                                        memory=context.config.misc_mem, disk=context.config.misc_disk)
        split_bams_job = merge_bams_job.addFollowOnJobFn(split_bam_into_chroms, context, indexes.get('id_ranges'), merge_bams_job.rv(),
                                                            cores=context.config.alignment_cores, memory=context.config.alignment_mem,
                                                            disk=context.config.alignment_disk)
        bam_chrom_ids = split_bams_job.rv()

    if surject:
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context, gam_chunk_file_ids)
        xg_id = indexes['xg-surject'] if 'xg-surject' in indexes else indexes['xg']
        bam_chrom_ids = [zip_job.addFollowOnJobFn(run_whole_surject, context, zip_job.rv(), sample_name + '-surject',
                                                  interleaved_surject, xg_id, []).rv()]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
Example #29
def get_plan(options, project, inSeqFile, outSeqFile, toil):

    plan = get_generation_info() + '\n'

    if options.wdl:
        plan += wdl_workflow_start(options, inSeqFile)
        options.pp_map = {}

    if options.toil:
        # kick things off with an empty job which we will hook subsequent jobs onto
        # (using RoundedJob because root job must be a subclass of Job,
        #  https://github.com/ComparativeGenomicsToolkit/cactus/pull/284#issuecomment-684125478)
        start_job = RoundedJob()
        parent_job = start_job
        job_idx = {}
    
    # preprocessing
    plan += '\n## Preprocessor\n'
    leaves = [outSeqFile.tree.getName(leaf) for leaf in outSeqFile.tree.getLeaves()]
    for i in range(0, len(leaves), options.preprocessBatchSize):
        pre_batch = leaves[i:i+options.preprocessBatchSize]
        if options.wdl:
            plan += wdl_call_preprocess(options, inSeqFile, outSeqFile, pre_batch)
        elif options.toil:
            job_idx[("preprocess", leaves[i])] = parent_job.addChildJobFn(toil_call_preprocess, options, inSeqFile, outSeqFile, leaves[i],
                                                                          cores=options.preprocessCores,
                                                                          memory=options.preprocessMemory,
                                                                          disk=options.preprocessDisk)
        else:
            plan += 'cactus-preprocess {} {} {} --inputNames {} {} {}\n'.format(
                get_jobstore(options), options.seqFile, options.outSeqFile, ' '.join(pre_batch),
                options.cactusOptions, get_toil_resource_opts(options, 'preprocess'))

    if options.preprocessOnly:
        plan += '\n## Cactus\n'
        plan += 'cactus {} {} {} {}\n'.format(get_jobstore(options), options.outSeqFile,
                                              options.outHal, options.cactusOptions)
        return plan

    # schedule the alignments
    schedule = Schedule()
    schedule.loadProject(project)
    schedule.compute()

    # set of all jobs, as genome names from the (fully resolved, output) seqfile
    events = set(outSeqFile.pathMap.keys()) - set(leaves)
    resolved = set(leaves)

    # convert follow-ons to dependencies
    follow_on_deps = {}
    for event in events:
        fo = schedule.followOn(event)
        if fo:
            follow_on_deps[fo] = event

    def get_deps(event):
        deps = set(schedule.deps(event))
        if event in follow_on_deps:
            deps = deps.union(set(follow_on_deps[event]))
        # I don't know why the schedule doesn't always give the children
        # todo: understand!
        try:
            has_name = outSeqFile.tree.getNodeId(event) is not None
        except:
            has_name = False
        if has_name:
            for node in outSeqFile.tree.getChildren(outSeqFile.tree.getNodeId(event)):
                if not outSeqFile.tree.isLeaf(node):
                    deps.add(outSeqFile.tree.getName(node))
        return deps

    events_and_virtuals = set(events)
    # add all events, potentially looping through virtual dependency chains
    # (hence the double loop)
    batch = set(events_and_virtuals)
    while len(batch) > 0:
        next_batch = set()
        for event in batch:
            for dep in get_deps(event):
                if dep not in events_and_virtuals:
                    next_batch.add(dep)
                    events_and_virtuals.add(dep)
        batch = next_batch

    # group jobs into rounds, where all jobs of round i can be run in parallel
    # (a standalone sketch of this grouping follows the function)
    groups = []
    while len(events_and_virtuals) > 0:
        group = []
        to_remove = []
        added = 0
        for event in events_and_virtuals:
            if all(dep in resolved for dep in get_deps(event)):
                if not schedule.isVirtual(event):
                    group.append(event)
                to_remove.append(event)
                added += 1
        if added == 0:
            sys.stderr.write("schedule deadlock:\n")
            for event in events_and_virtuals:
                sys.stderr.write("{} has deps {}\n".format(event, get_deps(event)))
            sys.exit(1)
        for tr in to_remove:
            resolved.add(tr)
            events_and_virtuals.remove(tr)
        groups.append(group)

    def halPath(event):
        if event == project.mcTree.getRootName():
            return options.outHal
        else:
            return os.path.join(options.outDir, event + '.hal')
    def cigarPath(event):
        return os.path.join(options.outDir, event + '.cigar')

    # alignment groups
    plan += '\n## Alignment\n'
    for i, group in enumerate(groups):
        plan += '\n### Round {}'.format(i)
        if options.toil:
            # advance toil phase
            # todo: recapitulate exact dependencies
            parent_job = parent_job.addFollowOn(Job())
        for event in sorted(group):
            plan += '\n'
            if options.wdl:
                plan += wdl_call_blast(options, project, event, cigarPath(event))
                plan += wdl_call_align(options, project, event, cigarPath(event), halPath(event), outSeqFile.pathMap[event])
            elif options.toil:
                # promises only get fulfilled if they are passed directly as arguments to the toil job, so we pull out the ones we need here
                leaf_deps, anc_deps = get_dep_names(options, project, event)
                fa_promises = [job_idx[("preprocess", dep)].rv() for dep in leaf_deps] + [job_idx[("align", dep)].rv(0) for dep in anc_deps]
                job_idx[("blast", event)] = parent_job.addChildJobFn(toil_call_blast,
                                                                     options,
                                                                     outSeqFile,
                                                                     project,
                                                                     event,
                                                                     cigarPath(event),
                                                                     leaf_deps + anc_deps,
                                                                     *fa_promises,
                                                                     cores=options.blastCores,
                                                                     memory=options.blastMemory,
                                                                     disk=options.preprocessDisk)
                job_idx[("align", event)] = job_idx[("blast", event)].addFollowOnJobFn(toil_call_align,
                                                                                       options, outSeqFile,
                                                                                       project,
                                                                                       event,
                                                                                       cigarPath(event),
                                                                                       halPath(event),
                                                                                       outSeqFile.pathMap[event],
                                                                                       job_idx[("blast", event)].rv(),
                                                                                       leaf_deps + anc_deps, *fa_promises,
                                                                                       cores=options.alignCores,
                                                                                       memory=options.alignMemory,
                                                                                       disk=options.alignDisk)
            else:
                # todo: support cactus interface (it's easy enough here, but cactus_progressive.py needs changes to handle)
                plan += 'cactus-blast {} {} {} --root {} {} {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'blast'))
                plan += 'cactus-align {} {} {} {} --root {} {} {} --database {}\n'.format(
                    get_jobstore(options), options.outSeqFile, cigarPath(event), halPath(event), event,
                    options.cactusOptions, get_toil_resource_opts(options, 'align'), options.database)
                # todo: just output the fasta in cactus-align.
                plan += 'hal2fasta {} {} {} > {}\n'.format(halPath(event), event, options.halOptions, outSeqFile.pathMap[event])

    # advance toil phase
    if options.toil:
        parent_job = parent_job.addFollowOn(Job())
                
    # stitch together the final tree
    plan += '\n## HAL merging\n'
    root = project.mcTree.getRootName()
    prev_event = None
    append_count = 0
    event_list = []
    for group in reversed(groups):
        for event in group:
            if event != root:
                if options.wdl:
                    plan += wdl_call_hal_append(options, project, event, prev_event)
                elif not options.toil:
                    plan += 'halAppendSubtree {} {} {} {} --merge {}\n'.format(
                        halPath(root), halPath(event), event, event, options.halOptions)
                append_count += 1
                event_list.append(event)
            prev_event = event

    if options.toil:
        job_idx['hal_append'] = parent_job.addChildJobFn(toil_call_hal_append_subtrees,
                                                         options,
                                                         project,
                                                         root,
                                                         job_idx[('align', root)].rv(1),
                                                         event_list,
                                                         *[job_idx[('align', e)].rv(1) for e in event_list],
                                                         cores=1,
                                                         memory=options.alignMemory,
                                                         disk=options.halAppendDisk)

    if options.wdl:
        plan += wdl_workflow_end(options, prev_event, append_count > 1)

    if options.toil:
        start_time = timeit.default_timer()
        toil.start(start_job)
        end_time = timeit.default_timer()
        run_time = end_time - start_time
        logger.info("cactus-prepare-toil has finished after {} seconds".format(run_time))
        
    return plan
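
The alignment scheduling above is a plain topological levelization: an event is placed in round i once every one of its dependencies has been resolved in an earlier round (or is a leaf), and a deadlock is reported if nothing can be placed. Below is a minimal, self-contained sketch of the same grouping on a toy dependency dict; group_into_rounds and toy_deps are illustrative names standing in for the cactus Schedule/SeqFile machinery, not cactus API.

# Standalone sketch of the round-grouping (topological levelization) used in get_plan.
# group_into_rounds and toy_deps are illustrative names, not cactus API.
import sys

def group_into_rounds(deps, already_resolved):
    """Return a list of rounds such that every event in round i depends only
    on already-resolved leaves or on events placed in rounds 0..i-1."""
    resolved = set(already_resolved)
    pending = set(deps.keys())
    rounds = []
    while pending:
        ready = sorted(e for e in pending if set(deps[e]) <= resolved)
        if not ready:
            sys.exit('schedule deadlock: {}'.format(pending))
        rounds.append(ready)
        resolved.update(ready)
        pending.difference_update(ready)
    return rounds

# two ancestors that can be aligned in parallel in round 0, then their parent in round 1
toy_deps = {'Anc1': ['human', 'chimp'], 'Anc2': ['mouse', 'rat'], 'AncRoot': ['Anc1', 'Anc2']}
print(group_into_rounds(toy_deps, ['human', 'chimp', 'mouse', 'rat']))
# expected output: [['Anc1', 'Anc2'], ['AncRoot']]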
Example #30
def run_chunked_calling(job, context, chunk_infos, genotype, recall, snarls_id, pack_support, old_call, call_timers):
    """
    Spawn a calling job for each chunk, then merge the results together
    (a standalone sketch of this fan-out/fan-in pattern follows the function).
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    path_names = set()

    # If chunking is disabled (there may be many paths), augment once up front rather than before each call
    # so we don't waste resources augmenting the same graph again and again
    # Note: should only do this when len(chunk_infos) > 1, but leaving as is so the tests hit it!
    if context.config.call_chunk_size == 0:
        chunk_info = chunk_infos[0]
        augment_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            chunk_info['sample'],
            chunk_info['vg_id'],
            chunk_info['gam_id'],
            xg_id = chunk_info['xg_id'],
            path_names = [chunk_info['chrom']],
            seq_names = [chunk_info['chrom']],
            seq_offsets = [chunk_info['chunk_start'] + chunk_info['offset']],
            seq_lengths = [chunk_info['path_size']],
            chunk_name = 'chunk_{}_{}'.format(chunk_info['chrom'], chunk_info['chunk_start']),
            genotype = genotype,
            recall = recall,
            clip_info = chunk_info,
            augment_only = True,
            pack_support = pack_support,
            alt_gam_id = chunk_info['alt_gam_id'],
            old_call = old_call,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem, disk=context.config.calling_disk)
        augment_results = augment_job.rv()
        next_job = Job()
        augment_job.addFollowOn(next_job)
        child_job = next_job
    else:
        augment_results = None
    
    clip_file_ids = []
    for chunk_info in chunk_infos:
        path_names.add(chunk_info['chrom'])

        # Run vg call
        call_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            chunk_info['sample'],
            chunk_info['vg_id'],
            chunk_info['gam_id'],
            xg_id = chunk_info['xg_id'],
            path_names = [chunk_info['chrom']],
            seq_names = [chunk_info['chrom']],
            seq_offsets = [chunk_info['chunk_start'] + chunk_info['offset']],
            seq_lengths = [chunk_info['path_size']],
            chunk_name = 'chunk_{}_{}'.format(chunk_info['chrom'], chunk_info['chunk_start']),
            genotype = genotype,
            recall = recall,
            clip_info = chunk_info,
            alt_gam_id = chunk_info['alt_gam_id'],
            genotype_vcf_id = chunk_info['genotype_vcf_id'],
            genotype_tbi_id = chunk_info['genotype_tbi_id'],
            snarls_id = snarls_id,            
            pack_support = pack_support,
            old_call = old_call,            
            augment_results = augment_results,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem, disk=context.config.calling_disk)
        vcf_id, call_timer = call_job.rv(0), call_job.rv(1)
        
        clip_file_ids.append(vcf_id)
        call_timers.append(call_timer)

    tag = list(path_names)[0] if len(path_names) == 1 else 'chroms'
        
    merge_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, tag,
                                           clip_file_ids,
                                           cores=context.config.call_chunk_cores,
                                           memory=context.config.call_chunk_mem,
                                           disk=context.config.call_chunk_disk)
        
    vcf_out_file_id = merge_job.rv(0)
    tbi_out_file_id = merge_job.rv(1)
    
    return vcf_out_file_id, tbi_out_file_id, call_timers
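
run_chunked_calling follows the standard Toil fan-out/fan-in pattern: child jobs are hung off an encapsulating Job(), their .rv() promises are collected in a list, and a follow-on job receives the resolved values and merges them. The sketch below reproduces that pattern with dummy payloads; call_chunk, merge_chunks and run_all are hypothetical stand-ins for run_vg_call and run_concat_vcfs, not toil-vg API.

# Minimal fan-out/fan-in sketch of the pattern used in run_chunked_calling.
# call_chunk, merge_chunks and run_all are toy stand-ins, not toil-vg functions.
from toil.common import Toil
from toil.job import Job

def call_chunk(job, chunk_name):
    # stand-in for run_vg_call on one chunk
    return 'vcf_for_{}'.format(chunk_name)

def merge_chunks(job, chunk_results):
    # the .rv() promises in chunk_results are resolved to strings by the time this runs
    merged = ','.join(chunk_results)
    job.log('merged result: {}'.format(merged))
    return merged

def run_all(job, chunk_names):
    # encapsulate everything under a child job, as run_chunked_calling does
    child_job = Job()
    job.addChild(child_job)
    results = [child_job.addChildJobFn(call_chunk, name).rv() for name in chunk_names]
    merge_job = child_job.addFollowOnJobFn(merge_chunks, results)
    return merge_job.rv()

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./toy_jobstore')
    options.logLevel = 'INFO'
    options.clean = 'always'
    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(run_all, ['chr1:0-1000', 'chr1:1000-2000']))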