Example #1
def graphmap_join_workflow(job, options, config, vg_ids, hal_ids):

    root_job = Job()
    job.addChild(root_job)

    # run clip-vg on each input
    clipped_vg_ids = []
    for vg_path, vg_id in zip(options.vg, vg_ids):
        clip_job = root_job.addChildJobFn(clip_vg,
                                          options,
                                          config,
                                          vg_path,
                                          vg_id,
                                          disk=vg_id.size * 2,
                                          memory=vg_id.size * 4)
        clipped_vg_ids.append(clip_job.rv())

    # join the ids
    join_job = root_job.addFollowOnJobFn(join_vg,
                                         options,
                                         config,
                                         clipped_vg_ids,
                                         disk=sum([f.size for f in vg_ids]))
    clipped_vg_ids = join_job.rv()

    # make a gfa for each
    gfa_root_job = Job()
    join_job.addFollowOn(gfa_root_job)
    clipped_gfa_ids = []
    for i in range(len(options.vg)):
        vg_path = options.vg[i]
        clipped_id = join_job.rv(i)
        vg_id = vg_ids[i]
        gfa_job = gfa_root_job.addChildJobFn(vg_to_gfa,
                                             options,
                                             config,
                                             vg_path,
                                             clipped_id,
                                             disk=vg_id.size * 5)
        clipped_gfa_ids.append(gfa_job.rv())

    # merge up the gfas and make the various vg indexes
    gfa_merge_job = gfa_root_job.addFollowOnJobFn(
        vg_indexes,
        options,
        config,
        clipped_gfa_ids,
        cores=options.indexCores,
        disk=sum(f.size for f in vg_ids) * 5)

    if hal_ids:
        merge_hal_id = job.addChildJobFn(merge_hal,
                                         options,
                                         hal_ids,
                                         disk=sum(f.size
                                                  for f in hal_ids) * 2).rv()
    else:
        merge_hal_id = None

    return clipped_vg_ids, gfa_merge_job.rv(), merge_hal_id
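
# A minimal, self-contained sketch (toy jobs, nothing from the repositories
# above) of the Toil pattern all of these examples assume: a job function
# receives `job` as its first argument, schedules children and follow-ons,
# and returns promises that only resolve once the workflow has run.
from toil.common import Toil
from toil.job import Job


def double(job, x):
    return x * 2


def total(job, values):
    # `values` arrives as plain ints; the promises were resolved by Toil
    return sum(values)


def plan(job, inputs):
    promises = [job.addChildJobFn(double, x).rv() for x in inputs]
    # the follow-on only starts after every child above has finished
    return job.addFollowOnJobFn(total, promises).rv()


if __name__ == '__main__':
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()  # first positional argument is the job store
    with Toil(options) as toil:
        print(toil.start(Job.wrapJobFn(plan, [1, 2, 3])))  # prints 12
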
Example #2
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                 "cpu",
                                 typeFn=int,
                                 default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome.  we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter",
                                           float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, (fa_path, fa_id) in fa_id_map.items():
        minigraph_map_job = top_job.addChildJobFn(
            minigraph_map_one,
            config,
            event,
            fa_path,
            fa_id,
            gfa_id,
            keep_gaf or not paf_per_genome,
            paf_per_genome,
            # todo: estimate RAM
            cores=mg_cores,
            disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config,
                                           gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
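
# A small sketch (hypothetical toy jobs) of the indexed-promise pattern used
# above: when a job function returns a tuple, rv(0), rv(1), ... are lazy
# references to the individual elements, so the gaf and paf outputs can be
# wired to different downstream jobs before anything has actually run.
from toil.job import Job


def split_pair(job, text):
    left, right = text.split(',', 1)
    return left, right                 # 2-tuple return value


def describe(job, left, right):
    return 'left={} right={}'.format(left, right)


def wire(job, text):
    pair_job = job.addChildJobFn(split_pair, text)
    return job.addFollowOnJobFn(describe,
                                pair_job.rv(0),   # first element of the tuple
                                pair_job.rv(1)).rv()
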
Example #3
def run_all_bam_caller(job, context, fasta_file_id, bam_file_id, bam_idx_id,
                       sample_name, chroms, offsets, out_name, bam_caller,
                       bam_caller_opts = []):
    """
    run freebayes or platypus on a set of chromosomal regions.  this is done by sending each region to a
    child job and farming off the entire input to each (i.e. not splitting the input)
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    fb_vcf_ids = []
    fb_tbi_ids = []
    fb_timers = []
    assert chroms
    if not offsets:
        offsets = [None] * len(chroms)
    for chrom, offset in zip(chroms, offsets):
        fb_job = child_job.addChildJobFn(run_bam_caller, context, fasta_file_id, bam_file_id, bam_idx_id,
                                         sample_name, chrom, offset, out_name, bam_caller, bam_caller_opts,
                                         memory=context.config.calling_mem,
                                         disk=context.config.calling_disk)
        fb_vcf_ids.append(fb_job.rv(0))
        fb_tbi_ids.append(fb_job.rv(1))
        fb_timers.append([fb_job.rv(2)])

    merge_vcf_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, fb_vcf_ids, fb_tbi_ids,
                                               write_to_outstore = True, call_timers_lists = fb_timers)
    return merge_vcf_job.rv()
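
# Sketch (toy jobs, illustrative only) of the "encapsulate everything under
# this job" idiom used above: children are hung on an empty wrapper Job so
# that a follow-on attached to the wrapper waits for all of them, while the
# caller's own follow-ons still run after this whole subtree completes.
from toil.job import Job


def call_region(job, region):
    return 'calls-for-{}'.format(region)


def merge_calls(job, per_region_calls):
    return per_region_calls


def run_all_regions(job, regions):
    child_job = Job()                 # empty encapsulating job
    job.addChild(child_job)
    call_ids = [child_job.addChildJobFn(call_region, r).rv() for r in regions]
    # runs only after every call_region child of child_job has finished
    return child_job.addFollowOnJobFn(merge_calls, call_ids).rv()
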
Example #4
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved,
                   xg_file_id, paths):
    """ split the fastq, then surject each chunk.  returns outputgams, paired with total surject time
    (excluding toil-vg overhead such as transferring and splitting files )"""

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if not context.config.single_reads_chunk:
        reads_chunk_ids = child_job.addChildJobFn(
            run_split_reads,
            context,
            None,
            'aln.gam',
            None, [gam_input_reads_id],
            cores=context.config.misc_cores,
            memory=context.config.misc_mem,
            disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info(
            "Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in [gam_input_reads_id]]

    return child_job.addFollowOnJobFn(run_whole_surject,
                                      context,
                                      reads_chunk_ids,
                                      output_name,
                                      interleaved,
                                      xg_file_id,
                                      paths,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
Example #5
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_ids = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(
            event, fa_id, gfa_id))
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one,
                                                  config,
                                                  event,
                                                  fa_id,
                                                  gfa_id,
                                                  cores=1,
                                                  disk=5 *
                                                  (fa_id.size + gfa_id.size))
        gaf_ids.append(minigraph_map_job.rv())

    # convert to paf
    paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_ids)

    return paf_job.rv()
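Example #6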
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path, ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size
    
    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()
        
    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, [paf_id], ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv(0))

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(0), split_fas_job.rv())

    # try splitting the ambiguous sequences using minimap2, which is more sensitive in some cases
    remap_job = gather_fas_job.addFollowOnJobFn(split_minimap_fallback, options, config, seqIDMap, gather_fas_job.rv())

    # partition these into fasta files
    split_fallback_gfa_job = remap_job.addFollowOnJobFn(split_gfa, config, None, remap_job.rv(0), ref_contigs,
                                                        other_contig, options.reference, None,
                                                        disk=(gfa_size + paf_size) * 5)

    # use the output of the above to split the ambiguous fastas
    split_fallback_fas_job = split_fallback_gfa_job.addFollowOnJobFn(split_fas, remap_job.rv(1), split_fallback_gfa_job.rv(0))

    # gather the fallback contigs into a table
    gather_fallback_fas_job = split_fallback_fas_job.addFollowOnJobFn(gather_fas, remap_job.rv(1), split_fallback_gfa_job.rv(0),
                                                                      split_fallback_fas_job.rv())

    # combine the split sequences with the split ambiguous sequences
    combine_split_job = gather_fallback_fas_job.addFollowOnJobFn(combine_splits, options, config, seqIDMap, gather_fas_job.rv(),
                                                                 gather_fallback_fas_job.rv())

    # return all the files, as well as the 2 split logs
    return (seqIDMap, combine_split_job.rv(), split_gfa_job.rv(1), split_fallback_gfa_job.rv(1))
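Example #7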
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path,
                            paf_id, paf_path, ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz,
                                        gfa_path,
                                        gfa_id,
                                        disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz,
                                        paf_path,
                                        paf_id,
                                        disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap,
                                             options.maskFilter).rv()

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa,
                                              config,
                                              gfa_id,
                                              paf_id,
                                              ref_contigs,
                                              other_contig,
                                              options.reference,
                                              mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap,
                                                   split_gfa_job.rv())

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap,
                                                    split_gfa_job.rv(),
                                                    split_fas_job.rv())

    # return all the files
    return gather_fas_job.rv()
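
# Sketch (toy job, multipliers are illustrative guesses) of the resource-hint
# pattern used throughout these examples: cores/memory/disk keyword arguments
# to addChildJobFn become that job's scheduling requirements, and a stored
# file's FileID.size lets them scale with the input.
from toil.job import Job


def process_file(job, file_id):
    local_path = job.fileStore.readGlobalFile(file_id)
    # ... do the real work on local_path ...
    return job.fileStore.writeGlobalFile(local_path)


def schedule_processing(job, file_id):
    return job.addChildJobFn(process_file,
                             file_id,
                             cores=1,
                             memory=file_id.size * 4,   # assumed multiplier
                             disk=file_id.size * 10).rv()
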
Example #8
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, checkpointInfo, doRenaming, pafInput, pafSecondaries, doVG, doGFA, delay=0, eventNameAsID=False):
    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs at the same time
    time.sleep(delay)
    
    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz, cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1) if pafSecondaries else None

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project, doRenaming, eventNameAsID,
                                        #todo disk=
    )
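    # read this flag from the real arguments object before it is replaced by a promise below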
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()
    
    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(exportHal, prepare_hal_export_job.rv(0), event=prepare_hal_export_job.rv(1),
                                                             checkpointInfo=checkpointInfo,
                                                             memory=configWrapper.getDefaultMemory(),
                                                             disk=configWrapper.getExportHalDisk(),
                                                             preemptable=False)

    # optionally create the VG
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(export_vg, hal_export_job.rv(), configWrapper, doVG, doGFA,
                                                        checkpointInfo=checkpointInfo)
        vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1)
    else:
        vg_file_id, gfa_file_id = None, None
        
    return hal_export_job.rv(), vg_file_id, gfa_file_id
Example #9
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved,
                      xg_file_id, paths):
    """
    Surject all gam chunks in parallel.
    
    surject all the GAM file IDs in read_chunk_ids, saving the merged BAM as output_name.
    
    If interleaved is true, expects paired-interleaved GAM input and writes paired BAM output.
    
    Surjects against the given collection of paths in the given XG file.
    
    """

    RealtimeLogger.info(
        "Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # this will be a list of lists.
    # bam_chunk_file_ids[i][j] will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(
            run_chunk_surject,
            context,
            interleaved,
            xg_file_id,
            paths,
            chunk_filename_ids,
            '{}_chunk{}'.format(output_name, chunk_id),
            cores=context.config.alignment_cores,
            memory=context.config.alignment_mem,
            disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    return child_job.addFollowOnJobFn(run_merge_bams,
                                      context,
                                      output_name,
                                      bam_chunk_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
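
# Plain-Python sketch of the zip(*...) transposition used above:
# reads_chunk_ids holds, per input file, that file's list of chunk IDs, and
# zip(*reads_chunk_ids) regroups them so that each iteration sees the i-th
# chunk of every file together (e.g. both mates of a paired-end shard).
reads_chunk_ids = [['fq1_chunk0', 'fq1_chunk1'],   # chunks of the first file
                   ['fq2_chunk0', 'fq2_chunk1']]   # chunks of the second file
for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
    print(chunk_id, chunk_filename_ids)
# 0 ('fq1_chunk0', 'fq2_chunk0')
# 1 ('fq1_chunk1', 'fq2_chunk1')
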
Example #10
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project,
                     doRenaming, pafInput):
    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(
            paf_to_lastz.paf_to_lastz, cactusWorkflowArguments.alignmentsID,
            True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids,
                                        cactusWorkflowArguments, project,
                                        doRenaming
                                        #todo disk=
                                        )
    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage,
                                           cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase,
                                         cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(
        run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(
        exportHal,
        prepare_hal_export_job.rv(0),
        event=prepare_hal_export_job.rv(1),
        memory=configWrapper.getDefaultMemory(),
        disk=configWrapper.getExportHalDisk(),
        preemptable=False)
    return hal_export_job.rv()
Example #11
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined by minigraph,
    the second from remapping the ambigious contigs with minimap2 """

    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map or len(remap_id_map) == 0:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                  "graphmap_split"),
                                 "ambiguousName",
                                 default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference contig is ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(
                combine_ref_contig_splits,
                original_id_map[ref_contig],
                remap_id_map[ref_contig],
                disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map,
                                     original_id_map, remap_id_map, amb_name,
                                     graph_event).rv()
Example #12
def preprocess_input_sequences(job,
                               configWrapper,
                               project,
                               cactusWorkflowArguments,
                               pafMaskFilter=None,
                               referenceEvent=None):
    """ update the workflow arguments in place with unzipped version of any input fastas whose paths 
    end in .gz, 
    if there's a pafMaskFilter, softmasked regions are extracted from each sequence into a bed.
    Note that the beds will need unique ids prepended just like the fastas...
    """
    head_job = Job()
    job.addChild(head_job)
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot,
                                                     "graphmap"),
                                    "assemblyName",
                                    default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g))
                              for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]
    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(
                preprocess_input_sequence, g, seqID,
                project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id = prepend_id_job.rv(0)
            mask_bed_id = prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(
        resolve_id_promises, events, updated_seq_ids,
        cactusWorkflowArguments).rv(), mask_bed_ids
Example #13
def run_calleval(job, context, xg_ids, gam_ids, gam_idx_ids, bam_ids, bam_idx_ids, gam_names, bam_names,
                 vcfeval_baseline_id, vcfeval_baseline_tbi_id, caller_fasta_id, vcfeval_fasta_id,
                 bed_id, clip_only, call, sample_name, chroms, vcf_offsets,
                 vcfeval_score_field, plot_sets, surject, interleaved,
                 freebayes, platypus, happy, sveval, recall, min_sv_len, max_sv_len, sv_overlap,
                 sv_region_overlap, normalize, ins_ref_len, del_min_rol, ins_seq_comp,
                 min_mapq=0, min_baseq=0, min_augment_coverage=0):
    """
    top-level call-eval function. Runs the caller on every gam, and
    freebayes and/or platypus on every bam. The resulting vcfs are put
    through vcfeval and the accuracies are tabulated in the output.

    Returns the output of run_calleval_results, a list of condition names, a
    list of corresponding called VCF.gz and index ID pairs, and dicts of
    vcfeval and happy result dicts, by condition name and clipped/unclipped
    status.

    plot_sets is a data structure of collections of conditions to plot against
    each other, as produced by parse_plot_sets.
    
    """
    
    # We store the name of each condition we run
    names = []
    # And we build up these result lists in sync with the name list
    vcf_tbi_id_pairs = []
    timing_results = []
    
    # Here we accumulate vcf_eval comparison results in a dict by condition name, then clipping status ("clipped", "unclipped").
    # Each contained dict is the output dict from run_vcfeval
    eval_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_happy
    happy_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_sveval
    sveval_results = collections.defaultdict(dict)

    # Some prep work (surjection and truth extraction) will happen under this head job
    head_job = Job()
    job.addChild(head_job)

    # Most of our work will run under this child job
    child_job = Job()
    head_job.addFollowOn(child_job)
    
    
    # We always extract a single-sample VCF from the truth, to save time
    # picking through all its samples multiple times over later. This should
    # also save memory. TODO: should we define a separate disk/memory requirement set?
    sample_extract_job = head_job.addChildJobFn(run_make_control_vcfs, context, vcfeval_baseline_id, 'baseline.vcf.gz',
                                                vcfeval_baseline_tbi_id, sample_name, pos_only = True,
                                                no_filter_if_sample_not_found = True,
                                                cores=context.config.vcfeval_cores,
                                                memory=context.config.vcfeval_mem,
                                                disk=context.config.vcfeval_disk)

    truth_vcf_id = sample_extract_job.rv(0)
    truth_vcf_tbi_id = sample_extract_job.rv(1)

    if not gam_idx_ids:
        gam_idx_ids = [None] * len(gam_ids)
    assert len(gam_idx_ids) == len(gam_ids)
    
    if surject:
        # optionally surject all the gams into bams
        for xg_id, gam_name, gam_id in zip(xg_ids, gam_names, gam_ids):
            surject_job = head_job.addChildJobFn(run_surjecting, context, gam_id, gam_name + '-surject',
                                                 interleaved, xg_id, chroms, cores=context.config.misc_cores,
                                                 memory=context.config.misc_mem, disk=context.config.misc_disk)
            bam_ids.append(surject_job.rv())
            bam_idx_ids.append(None)
            bam_names.append(gam_name + '-surject')
    
    if bam_ids:
        for bam_id, bam_idx_id, bam_name in zip(bam_ids, bam_idx_ids, bam_names):
            if not bam_idx_id:
                bam_index_job = child_job.addChildJobFn(run_bam_index, context, bam_id, bam_name,
                                                        cores=context.config.calling_cores,
                                                        memory=context.config.calling_mem,
                                                        disk=context.config.calling_disk)
                sorted_bam_id = bam_index_job.rv(0)
                sorted_bam_idx_id = bam_index_job.rv(1)
            else:
                bam_index_job = Job()
                child_job.addChild(bam_index_job)
                sorted_bam_id = bam_id
                sorted_bam_idx_id = bam_idx_id                

            bam_caller_infos = []
            if freebayes:
                bam_caller_infos.append(('freebayes', ['--genotype-qualities'], '-fb'))
            if platypus:
                bam_caller_infos.append(('platypus', ['--mergeClusteredVariants=1'], '-plat'))
                
            for bam_caller, bam_caller_opts, bam_caller_tag in bam_caller_infos:

                bam_caller_out_name = '{}{}'.format(bam_name, bam_caller_tag)
                bam_caller_job = bam_index_job.addFollowOnJobFn(run_all_bam_caller, context, caller_fasta_id,
                                                                sorted_bam_id, sorted_bam_idx_id, sample_name,
                                                                chroms, vcf_offsets,
                                                                out_name = bam_caller_out_name,
                                                                bam_caller = bam_caller,
                                                                bam_caller_opts = bam_caller_opts,
                                                                cores=context.config.misc_cores,
                                                                memory=context.config.misc_mem,
                                                                disk=context.config.misc_disk)
                bam_caller_vcf_tbi_id_pair = (bam_caller_job.rv(0), bam_caller_job.rv(1))
                timing_result = bam_caller_job.rv(2)

                if bed_id:

                    eval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, bed_id, out_name=bam_caller_out_name,
                                                        score_field='GQ', cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, bed_id, out_name=bam_caller_out_name,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()

                    if sveval:
                        sveval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id,
                                                        min_sv_len=min_sv_len,
                                                        max_sv_len=max_sv_len,
                                                        sv_overlap=sv_overlap,
                                                        sv_region_overlap=sv_region_overlap,
                                                        bed_id=bed_id,
                                                        ins_ref_len=ins_ref_len,
                                                        del_min_rol=del_min_rol,
                                                        ins_seq_comp=ins_seq_comp, 
                                                        out_name=bam_caller_out_name,
                                                        fasta_path = 'ref.fasta',
                                                        fasta_id = vcfeval_fasta_id,
                                                        normalize = normalize,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()                    

                if not clip_only:
                    # Also do unclipped

                    eval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, None,
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        score_field='GQ', cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                        vcfeval_fasta_id, None,
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()

                    if sveval:
                        sveval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                                        truth_vcf_id, truth_vcf_tbi_id,
                                                        min_sv_len=min_sv_len,
                                                        max_sv_len=max_sv_len,
                                                        sv_overlap=sv_overlap,
                                                        sv_region_overlap=sv_region_overlap,
                                                        bed_id=None,
                                                        ins_ref_len=ins_ref_len,
                                                        del_min_rol=del_min_rol,
                                                        ins_seq_comp=ins_seq_comp, 
                                                        out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                                        fasta_path = 'ref.fasta',
                                                        fasta_id = vcfeval_fasta_id,
                                                        normalize = normalize,
                                                        cores=context.config.vcfeval_cores,
                                                        memory=context.config.vcfeval_mem,
                                                        disk=context.config.vcfeval_disk).rv()                        

                vcf_tbi_id_pairs.append(bam_caller_vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(bam_caller_out_name)

    if gam_ids:
        for gam_id, gam_idx_id, gam_name, xg_id in zip(gam_ids, gam_idx_ids, gam_names, xg_ids):
            if call:
                out_name = '{}{}'.format(gam_name, '-call')
                
                if context.config.filter_opts:
                    filter_job = Job.wrapJobFn(run_filtering, context,
                                               graph_id=xg_id,
                                               graph_basename = 'graph.xg',
                                               gam_id=gam_id,
                                               gam_basename = 'aln.gam',
                                               filter_opts = context.config.filter_opts,
                                               cores=context.config.calling_cores,
                                               memory=context.config.calling_mem,
                                               disk=context.config.calling_disk)
                    gam_id = filter_job.rv()
                
                call_job = Job.wrapJobFn(run_chunked_calling, context,
                                         graph_id=xg_id,
                                         graph_basename='graph.xg',
                                         gam_id=gam_id,
                                         gam_basename='aln.gam',
                                         batch_input=None,
                                         snarls_id=None,
                                         genotype_vcf_id=None,
                                         genotype_tbi_id=None,
                                         sample=sample_name,
                                         augment=not recall,
                                         connected_component_chunking=False,
                                         output_format='pg',
                                         min_augment_coverage=min_augment_coverage,
                                         expected_coverage=None,
                                         min_mapq=min_mapq,
                                         min_baseq=min_baseq,
                                         ref_paths=chroms,
                                         ref_path_chunking=False,
                                         min_call_support=None,
                                         vcf_offsets=vcf_offsets,
                                         cores=context.config.misc_cores,
                                         memory=context.config.misc_mem,
                                         disk=context.config.misc_disk)

                if context.config.filter_opts:
                    child_job.addChild(filter_job)
                    filter_job.addFollowOn(call_job)
                else:
                    child_job.addChild(call_job)
                    
                vcf_tbi_id_pair = (call_job.rv(0), call_job.rv(1))
                #timing_result = call_job.rv(2)
                timing_result = TimeTracker()

                if not vcfeval_score_field:
                    score_field = 'QUAL'
                else:
                    score_field = vcfeval_score_field

                if bed_id:
                    eval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, bed_id, out_name=out_name,
                                                  score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_happy, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, bed_id, out_name=out_name).rv()

                    if sveval:
                        sveval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id,
                                                  min_sv_len=min_sv_len,
                                                  max_sv_len=max_sv_len,
                                                  sv_overlap=sv_overlap,
                                                  sv_region_overlap=sv_region_overlap,
                                                  ins_ref_len=ins_ref_len,
                                                  del_min_rol=del_min_rol,
                                                  ins_seq_comp=ins_seq_comp, 
                                                  bed_id = bed_id, out_name=out_name,
                                                  fasta_path = 'ref.fasta',
                                                  fasta_id = vcfeval_fasta_id,
                                                  normalize = normalize).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, None,
                                                  out_name=out_name if not bed_id else out_name + '-unclipped',
                                                  score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_happy, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta',
                                                  vcfeval_fasta_id, None,
                                                  out_name=out_name if not bed_id else out_name + '-unclipped').rv()

                    if sveval:
                        sveval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                                  truth_vcf_id, truth_vcf_tbi_id,
                                                  min_sv_len=min_sv_len,
                                                  max_sv_len=max_sv_len,
                                                  sv_overlap=sv_overlap,
                                                  sv_region_overlap=sv_region_overlap,
                                                  bed_id = None,
                                                  ins_ref_len=ins_ref_len,
                                                  del_min_rol=del_min_rol,
                                                  ins_seq_comp=ins_seq_comp, 
                                                  out_name=out_name if not bed_id else out_name + '-unclipped',
                                                  fasta_path = 'ref.fasta',
                                                  fasta_id = vcfeval_fasta_id,
                                                  normalize = normalize).rv()
                            
                    
                vcf_tbi_id_pairs.append(vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(out_name)            


    calleval_results = child_job.addFollowOnJobFn(run_calleval_results, context, names,
                                                  vcf_tbi_id_pairs, eval_results, happy_results, sveval_results,
                                                  timing_results, plot_sets,
                                                  cores=context.config.misc_cores,
                                                  memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()

    return calleval_results, names, vcf_tbi_id_pairs, eval_results
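
# Sketch (toy jobs) of the conditional wiring used for the optional filtering
# step above: jobs are built first with Job.wrapJobFn, then attached with
# addChild/addFollowOn depending on whether the extra step is enabled.
from toil.job import Job


def filter_reads(job, reads):
    return [r for r in reads if r]


def call_variants(job, reads):
    return len(reads)


def wire_calling(job, reads, do_filter):
    if do_filter:
        filter_job = Job.wrapJobFn(filter_reads, reads)
        call_job = Job.wrapJobFn(call_variants, filter_job.rv())
        job.addChild(filter_job)
        filter_job.addFollowOn(call_job)
    else:
        call_job = Job.wrapJobFn(call_variants, reads)
        job.addChild(call_job)
    return call_job.rv()
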
Example #14
def run_chunked_calling(job,
                        context,
                        graph_id,
                        graph_basename,
                        gam_id,
                        gam_basename,
                        batch_input=None,
                        snarls_id=None,
                        genotype_vcf_id=None,
                        genotype_tbi_id=None,
                        sample=None,
                        augment=False,
                        connected_component_chunking=False,
                        output_format=None,
                        min_augment_coverage=None,
                        expected_coverage=None,
                        min_mapq=None,
                        min_baseq=None,
                        ref_paths=[],
                        ref_path_chunking=True,
                        min_call_support=None,
                        vcf_offsets={},
                        gam_chunking=False):

    # simple way to keep follow-ons down the tree
    child_job = Job()
    job.addChild(child_job)

    out_vcf_name = remove_ext(graph_basename)
    if sample:
        out_vcf_name += '_' + sample

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if connected_component_chunking or ref_path_chunking:

            chunk_job = child_job.addChildJobFn(
                run_chunking,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                chunk_paths=ref_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id if gam_chunking else None,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)

            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_calling,
                context,
                graph_id=None,
                graph_basename=graph_basename,
                gam_id=gam_id,
                gam_basename=gam_basename,
                batch_input=batch_input,
                snarls_id=snarls_id,
                genotype_vcf_id=genotype_vcf_id,
                genotype_tbi_id=genotype_tbi_id,
                sample=sample,
                augment=augment,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                ref_paths=ref_paths,
                ref_path_chunking=ref_path_chunking,
                min_call_support=min_call_support,
                vcf_offsets=vcf_offsets,
                gam_chunking=gam_chunking)
            return recurse_job.rv()
        else:
            # convert if we're augmenting and not chunking
            if augment and os.path.splitext(
                    graph_basename)[1] != '.' + output_format:
                convert_job = child_job.addChildJobFn(
                    run_convert,
                    context,
                    graph_id=graph_id,
                    graph_basename=graph_basename,
                    output_format=output_format,
                    disk=context.config.calling_disk)
                graph_id = convert_job.rv()
                graph_basename = os.path.splitext(
                    graph_basename)[0] + '.' + output_format
                # todo: clean up
                next_job = Job()
                child_job.addFollowOn(next_job)
                child_job = next_job

            #phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the calling on each chunk
    assert batch_input

    call_results = []
    in_gam_id = gam_id
    in_gam_basename = gam_basename
    for chunk_name, chunk_results in list(batch_input.items()):
        calling_root_job = Job()
        child_job.addChild(calling_root_job)

        graph_id = chunk_results[0]
        graph_basename = chunk_results[1]
        if gam_chunking:
            gam_id = chunk_results[2]
            gam_basename = chunk_results[3]
        else:
            gam_id = in_gam_id
            gam_basename = in_gam_basename

        if augment:
            augment_job = calling_root_job.addChildJobFn(
                run_augmenting,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                gam_id=gam_id,
                gam_basename=gam_basename,
                augment_gam=True,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                to_outstore=True,
                cores=context.config.augment_cores,
                memory=context.config.augment_mem,
                disk=context.config.augment_disk)
            graph_id = augment_job.rv(0)
            graph_basename = os.path.splitext(graph_basename)[
                0] + '-aug' + os.path.splitext(graph_basename)[1]
            gam_id = augment_job.rv(1)
            gam_basename = os.path.splitext(
                gam_basename)[0] + '-aug' + os.path.splitext(gam_basename)[1]

        # When path chunking, we subset our reference paths down to the current path
        if ref_path_chunking:
            ref_path = [chunk_name]
        else:
            ref_path = ref_paths

        calling_job = calling_root_job.addFollowOnJobFn(
            run_calling,
            context,
            graph_id=graph_id,
            graph_basename=graph_basename,
            gam_id=gam_id,
            gam_basename=gam_basename,
            snarls_id=snarls_id,
            genotype_vcf_id=genotype_vcf_id,
            genotype_tbi_id=genotype_tbi_id,
            sample=sample,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            ref_paths=ref_path,
            min_call_support=min_call_support,
            vcf_offsets=vcf_offsets,
            to_outstore=False,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem,
            disk=context.config.calling_disk)

        call_results.append((chunk_name, calling_job.rv()))

    concat_job = child_job.addFollowOnJobFn(run_concat_vcfs,
                                            context,
                                            out_name=out_vcf_name,
                                            vcf_ids=None,
                                            tbi_ids=None,
                                            write_to_outstore=True,
                                            call_timers_lists=[],
                                            batch_data=call_results)

    return concat_job.rv()
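
# Sketch (toy jobs) of the two-pass recursion used above: the first pass only
# schedules the chunking job plus a follow-on call to this same function, so
# that by the time the follow-on runs, the chunking promise has resolved into
# concrete data the second pass can iterate over.
from toil.job import Job


def chunk(job, items):
    return {name: payload for name, payload in items}


def process_chunk(job, name, payload):
    return (name, payload.upper())


def collect(job, pairs):
    return dict(pairs)


def recurse(job, items, batch_input=None):
    if batch_input is None:
        chunk_job = job.addChildJobFn(chunk, items)
        # re-invoke this function as a follow-on; the promise below is a
        # plain dict by the time the follow-on executes
        return job.addFollowOnJobFn(recurse, items,
                                    batch_input=chunk_job.rv()).rv()
    results = [job.addChildJobFn(process_chunk, n, p).rv()
               for n, p in batch_input.items()]
    return job.addFollowOnJobFn(collect, results).rv()
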
Example #15
def run_whole_alignment(job, context, fastq, gam_input_reads, bam_input_reads, sample_name, interleaved, mapper,
                        indexes, reads_chunk_ids,
                        bam_output=False, surject=False, gbwt_penalty=None, validate=False, fasta_dict_id=None):
    """
    align all fastq chunks in parallel
    
    Takes a dict from index type to index file ID. Some indexes are extra and
    specifying them will change mapping behavior.
    
    Returns a list of per-contig GAMs, the total alignment runtime, and a list
    of per-contig BAM file IDs (which is only nonempty when surject is true).
    
    """
    
    # this will be a list of lists.
    # gam_chunk_file_ids[i][j] will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    gam_chunk_file_ids = []
    gam_chunk_running_times = []
    # depending on bam_output and surject options, we can make bam_output too
    bam_chunk_file_ids = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        #Run graph alignment on each fastq chunk
        chunk_alignment_job = child_job.addChildJobFn(run_chunk_alignment, context, gam_input_reads, bam_input_reads,
                                                      sample_name,
                                                      interleaved, mapper, chunk_filename_ids, chunk_id,
                                                      indexes,
                                                      bam_output=bam_output,
                                                      gbwt_penalty=gbwt_penalty,
                                                      validate=validate,
                                                      fasta_dict_id=fasta_dict_id,
                                                      cores=context.config.alignment_cores, memory=context.config.alignment_mem,
                                                      disk=context.config.alignment_disk)
        if not bam_output:
            gam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        else:
            bam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))


    if not bam_output:
        merge_gams_job = child_job.addFollowOnJobFn(run_merge_gams, context, sample_name, indexes.get('id_ranges'), gam_chunk_file_ids,
                                                    gam_chunk_running_times,
                                                    cores=context.config.misc_cores,
                                                    memory=context.config.misc_mem, disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []
    else:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(run_merge_bams, context, sample_name, bam_chunk_file_ids,
                                                        cores=context.config.misc_cores,
                                                        memory=context.config.misc_mem, disk=context.config.misc_disk)
        split_bams_job = merge_bams_job.addFollowOnJobFn(split_bam_into_chroms, context, indexes.get('id_ranges'), merge_bams_job.rv(),
                                                            cores=context.config.alignment_cores, memory=context.config.alignment_mem,
                                                            disk=context.config.alignment_disk)
        bam_chrom_ids = split_bams_job.rv()

    if surject:
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context, gam_chunk_file_ids)
        xg_id = indexes['xg-surject'] if 'xg-surject' in indexes else indexes['xg']
        bam_chrom_ids = [zip_job.addFollowOnJobFn(run_whole_surject, context, zip_job.rv(), sample_name + '-surject',
                                                  interleaved_surject, xg_id, []).rv()]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
Example #16
def run_chunked_calling(job, context, chunk_infos, genotype, recall, snarls_id, pack_support, old_call, call_timers):
    """
    spawn a calling job for each chunk then merge them together
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    path_names = set()

    # If chunking is disabled and there are many paths, we augment once up front
    # rather than before each call, so we don't waste resources augmenting the
    # same graph over and over.
    # Note: this should only be done when len(chunk_infos) > 1, but it is left
    # as-is so the tests exercise it!
    if context.config.call_chunk_size == 0:
        chunk_info = chunk_infos[0]
        augment_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            chunk_info['sample'],
            chunk_info['vg_id'],
            chunk_info['gam_id'],
            xg_id = chunk_info['xg_id'],
            path_names = [chunk_info['chrom']],
            seq_names = [chunk_info['chrom']],
            seq_offsets = [chunk_info['chunk_start'] + chunk_info['offset']],
            seq_lengths = [chunk_info['path_size']],
            chunk_name = 'chunk_{}_{}'.format(chunk_info['chrom'], chunk_info['chunk_start']),
            genotype = genotype,
            recall = recall,
            clip_info = chunk_info,
            augment_only = True,
            pack_support = pack_support,
            alt_gam_id = chunk_info['alt_gam_id'],
            old_call = old_call,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem, disk=context.config.calling_disk)
        augment_results = augment_job.rv()
        next_job = Job()
        augment_job.addFollowOn(next_job)
        child_job = next_job
    else:
        augment_results = None
    
    clip_file_ids = []
    for chunk_info in chunk_infos:
        path_names.add(chunk_info['chrom'])

        # Run vg call
        call_job = child_job.addChildJobFn(
            run_vg_call,
            context,
            chunk_info['sample'],
            chunk_info['vg_id'],
            chunk_info['gam_id'],
            xg_id = chunk_info['xg_id'],
            path_names = [chunk_info['chrom']],
            seq_names = [chunk_info['chrom']],
            seq_offsets = [chunk_info['chunk_start'] + chunk_info['offset']],
            seq_lengths = [chunk_info['path_size']],
            chunk_name = 'chunk_{}_{}'.format(chunk_info['chrom'], chunk_info['chunk_start']),
            genotype = genotype,
            recall = recall,
            clip_info = chunk_info,
            alt_gam_id = chunk_info['alt_gam_id'],
            genotype_vcf_id = chunk_info['genotype_vcf_id'],
            genotype_tbi_id = chunk_info['genotype_tbi_id'],
            snarls_id = snarls_id,            
            pack_support = pack_support,
            old_call = old_call,            
            augment_results = augment_results,
            cores=context.config.calling_cores,
            memory=context.config.calling_mem, disk=context.config.calling_disk)
        vcf_id, call_timer = call_job.rv(0), call_job.rv(1)
        
        clip_file_ids.append(vcf_id)
        call_timers.append(call_timer)

    tag = list(path_names)[0] if len(path_names) == 1 else 'chroms'
        
    merge_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, tag,
                                           clip_file_ids,
                                           cores=context.config.call_chunk_cores,
                                           memory=context.config.call_chunk_mem,
                                           disk=context.config.call_chunk_disk)
        
    vcf_out_file_id = merge_job.rv(0)
    tbi_out_file_id = merge_job.rv(1)
    
    return vcf_out_file_id, tbi_out_file_id, call_timers
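Note: run_chunked_calling uses the same Toil idiom as the mapping function above: all dynamically created children hang under an empty Job(), and a follow-on of that empty job consumes their promises, so the merge only starts once every chunk has finished. A minimal sketch of that idiom; call_one_chunk and concat_chunks are illustrative stand-ins, not toil-vg functions:

from toil.common import Toil
from toil.job import Job


def call_one_chunk(job, chunk_name):
    # stand-in for run_vg_call: pretend to produce a per-chunk VCF id
    return 'vcf_for_{}'.format(chunk_name)


def concat_chunks(job, vcf_ids):
    # stand-in for run_concat_vcfs: the promises passed in have been
    # resolved to the children's return values by the time this runs
    return sorted(vcf_ids)


def chunked_calling(job, chunk_names):
    # encapsulate all per-chunk work under an empty child job
    child_job = Job()
    job.addChild(child_job)
    vcf_promises = [child_job.addChildJobFn(call_one_chunk, name).rv()
                    for name in chunk_names]
    # a follow-on of child_job runs only after all of child_job's children
    merge_job = child_job.addFollowOnJobFn(concat_chunks, vcf_promises)
    return merge_job.rv()


if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    options.clean = 'always'
    with Toil(options) as toil:
        print(toil.start(Job.wrapJobFn(chunked_calling, ['chr1', 'chr2'])))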
Example #17
def run_all_calling2(job, context, xg_file_id, chr_gam_ids, chr_gam_idx_ids, chroms, path_sizes, vcf_offsets, sample_name,
                     genotype=False, out_name=None, recall=False, alt_gam_id=None, alt_gai_id=None,
                     genotype_vcf_id=None, genotype_tbi_id=None, id_ranges_id=None, snarls_id=None, pack_support=False,
                     old_call=False):
    """
    Call all the chromosomes and return a merged up vcf/tbi pair
    """
    # we make a child job so that all calling is encapsulated in a top-level job
    child_job = Job()
    job.addChild(child_job)
    vcf_ids = []
    tbi_ids = []
    call_timers_lists = []
    assert len(chr_gam_ids) > 0
    if not chr_gam_idx_ids:
        chr_gam_idx_ids = [None] * len(chr_gam_ids)
    if not chroms:
        chroms = [name for name in path_sizes.keys() if path_sizes[name] > 0]
    assert len(chr_gam_ids) == len(chr_gam_idx_ids)
    # an id_ranges file deactivates path chunking
    if id_ranges_id:
        context.config.call_chunk_size = (2 << 30) - 1
        context.config.overlap = 0
    for i in range(len(chr_gam_ids)):
        alignment_file_id = chr_gam_ids[i]
        alignment_index_id = chr_gam_idx_ids[i]
        if len(chr_gam_ids) > 1:
            # 1 gam per chromosome
            chr_label = [chroms[i]]
            chr_offset = [vcf_offsets[i]] if vcf_offsets else [0]
        else:
            # single gam with one or more chromosomes
            chr_label = chroms
            chr_offset = vcf_offsets if vcf_offsets else [0] * len(chroms)
        chunk_job = child_job.addChildJobFn(run_chunking, context, xg_file_id,
                                            alignment_file_id, alignment_index_id, chr_label, chr_offset, path_sizes,
                                            sample_name, genotype=genotype, recall=recall,
                                            alt_gam_id=alt_gam_id, alt_gai_id=alt_gai_id,
                                            genotype_vcf_id=genotype_vcf_id,
                                            genotype_tbi_id=genotype_tbi_id,
                                            id_ranges_id=id_ranges_id,
                                            cores=context.config.call_chunk_cores,
                                            memory=context.config.call_chunk_mem,
                                            disk=context.config.call_chunk_disk)
        call_job = chunk_job.addFollowOnJobFn(run_chunked_calling, context, chunk_job.rv(0),
                                              genotype, recall, snarls_id, pack_support, old_call, chunk_job.rv(1),
                                              cores=context.config.misc_cores,
                                              memory=context.config.misc_mem,
                                              disk=context.config.misc_disk)
        vcf_ids.append(call_job.rv(0))
        tbi_ids.append(call_job.rv(1))
        call_timers_lists.append(call_job.rv(2))
        
    if not out_name:
        out_name = sample_name
    return child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, vcf_ids, tbi_ids,
                                      write_to_outstore = True,
                                      call_timers_lists = call_timers_lists,
                                      cores=context.config.call_chunk_cores,
                                      memory=context.config.call_chunk_mem,
                                      disk=context.config.call_chunk_disk).rv()
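Note: run_all_calling2 pulls three values out of each chunked-calling job with call_job.rv(0), rv(1) and rv(2); an indexed promise resolves to the corresponding element of the tuple the job function eventually returns. A small sketch of that pattern with made-up job functions (not part of toil-vg):

from toil.job import Job


def call_one_chrom(job, chrom):
    # returns a (vcf_id, tbi_id, timers) tuple, like run_chunked_calling
    return 'vcf_{}'.format(chrom), 'tbi_{}'.format(chrom), [0.5]


def merge(job, vcf_ids, tbi_ids):
    # receives the resolved per-chromosome ids as plain lists
    return vcf_ids, tbi_ids


def all_calling(job, chroms):
    child_job = Job()
    job.addChild(child_job)
    vcf_ids, tbi_ids = [], []
    for chrom in chroms:
        call_job = child_job.addChildJobFn(call_one_chrom, chrom)
        # rv(0) / rv(1) are promises for elements 0 and 1 of the returned tuple
        vcf_ids.append(call_job.rv(0))
        tbi_ids.append(call_job.rv(1))
    return child_job.addFollowOnJobFn(merge, vcf_ids, tbi_ids).rv()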
Example #18
File: vg_sim.py  Project: xchang1/toil-vg
def run_sim(job,
            context,
            num_reads,
            gam,
            fastq_out,
            seed,
            sim_chunks,
            xg_file_ids,
            xg_annot_file_id,
            tag_bed_ids=[],
            paths=[],
            drop_contigs_matching=[],
            fastq_id=None,
            out_name=None,
            validate=False):
    """  
    run a bunch of simulation child jobs, merge up their output as a follow on
    """
    sim_out_id_infos = []

    # no seed specified, we choose one at random
    if seed is None:
        seed = random.randint(0, 2147483647)
        RealtimeLogger.info(
            'No seed specified, choosing random value = {}'.format(seed))

    # encapsulate follow-on
    child_job = Job()
    job.addChild(child_job)

    # we can have more than one xg file if we've split our input graphs up
    # into haplotypes
    for xg_i, xg_file_id in enumerate(xg_file_ids):
        file_reads = num_reads // len(xg_file_ids)
        if xg_file_id == xg_file_ids[-1]:
            file_reads += num_reads % len(xg_file_ids)

        # Define a seed base for this set of chunks, leaving space for each chunk before the next seed base
        seed_base = seed + xg_i * sim_chunks

        # each element is a reads_chunk_id if --gam was not specified,
        # or a (gam_chunk_id, true_pos_chunk_id) pair otherwise
        for chunk_i in range(sim_chunks):
            chunk_reads = file_reads // sim_chunks
            if chunk_i == sim_chunks - 1:
                chunk_reads += file_reads % sim_chunks
            sim_out_id_info = child_job.addChildJobFn(
                run_sim_chunk,
                context,
                gam,
                seed_base,
                xg_file_id,
                xg_annot_file_id,
                chunk_reads,
                chunk_i,
                xg_i,
                tag_bed_ids=tag_bed_ids,
                paths=paths,
                drop_contigs_matching=drop_contigs_matching,
                fastq_id=fastq_id,
                validate=validate,
                cores=context.config.sim_cores,
                memory=context.config.sim_mem,
                disk=context.config.sim_disk).rv()
            sim_out_id_infos.append(sim_out_id_info)

    merge_job = child_job.addFollowOnJobFn(run_merge_sim_chunks,
                                           context,
                                           gam,
                                           sim_out_id_infos,
                                           out_name,
                                           cores=context.config.sim_cores,
                                           memory=context.config.sim_mem,
                                           disk=context.config.sim_disk)

    merged_gam_id, true_id = merge_job.rv(0), merge_job.rv(1)

    if fastq_out:
        fastq_job = merge_job.addFollowOnJobFn(
            run_gam_to_fastq,
            context,
            merged_gam_id,
            False,
            out_name=out_name if out_name else 'sim',
            out_store=True,
            cores=context.config.sim_cores,
            memory=context.config.sim_mem,
            disk=context.config.sim_disk)
        merged_fq_id = fastq_job.rv(0)

    return merged_gam_id, true_id
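Note: the division logic above first splits num_reads across the xg files and then across sim_chunks, handing any remainder to the last chunk so no reads are lost. The same arithmetic in isolation (a minimal sketch; the numbers are just an example):

def split_reads(num_reads, n_parts):
    # integer split of num_reads into n_parts; the last part absorbs the remainder
    counts = [num_reads // n_parts] * n_parts
    counts[-1] += num_reads % n_parts
    return counts

# e.g. 1000 reads over 2 xg files, each simulated in 3 chunks
per_file = split_reads(1000, 2)                      # [500, 500]
per_chunk = [split_reads(n, 3) for n in per_file]    # [[166, 166, 168], [166, 166, 168]]
assert sum(sum(c) for c in per_chunk) == 1000        # totals are preserved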
Example #19
def gatk_germline_pipeline(job, samples, config):
    """
    Runs the GATK best practices pipeline for germline SNP and INDEL discovery.

    Steps in Pipeline
    0: Generate and preprocess BAM
        - Uploads processed BAM to output directory
    1: Call Variants using HaplotypeCaller
        - Uploads GVCF
    2: Genotype VCF
        - Uploads VCF
    3: Filter Variants using either "hard filters" or VQSR
        - Uploads filtered VCF

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list[GermlineSample] samples: List of GermlineSample namedtuples
    :param Namespace config: Input parameters and reference FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta         FileStoreID for reference genome fasta file
        config.genome_fai           FileStoreID for reference genome fasta index file
        config.genome_dict          FileStoreID for reference genome sequence dictionary file
        config.cores                Number of cores for each job
        config.xmx                  Java heap size in bytes
        config.suffix               Suffix added to output filename
        config.output_dir           URL or local path to output directory
        config.ssec                 Path to key file for SSE-C encryption
        config.joint_genotype       If True, then joint genotype and filter cohort
        config.hc_output            URL or local path to HaplotypeCaller output for testing
    :return: Dictionary of filtered VCF FileStoreIDs
    :rtype: dict
    """
    require(len(samples) > 0, 'No samples were provided!')

    # Get total size of genome reference files. This is used for configuring disk size.
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # 0: Generate processed BAM and BAI files for each sample
    # group preprocessing and variant calling steps in empty Job instance
    group_bam_jobs = Job()
    gvcfs = {}
    for sample in samples:
        # 0: Generate processed BAM and BAI files for each sample
        get_bam = group_bam_jobs.addChildJobFn(prepare_bam,
                                               sample.uuid,
                                               sample.url,
                                               config,
                                               paired_url=sample.paired_url,
                                               rg_line=sample.rg_line)

        # 1: Generate per sample gvcfs {uuid: gvcf_id}
        # The HaplotypeCaller disk requirement depends on the input bam, bai, the genome reference
        # files, and the output GVCF file. The output GVCF is smaller than the input BAM file.
        hc_disk = PromisedRequirement(lambda bam, bai, ref_size:
                                      2 * bam.size + bai.size + ref_size,
                                      get_bam.rv(0),
                                      get_bam.rv(1),
                                      genome_ref_size)

        get_gvcf = get_bam.addFollowOnJobFn(gatk_haplotype_caller,
                                            get_bam.rv(0),
                                            get_bam.rv(1),
                                            config.genome_fasta, config.genome_fai, config.genome_dict,
                                            annotations=config.annotations,
                                            cores=config.cores,
                                            disk=hc_disk,
                                            memory=config.xmx,
                                            hc_output=config.hc_output)
        # Store cohort GVCFs in dictionary
        gvcfs[sample.uuid] = get_gvcf.rv()

        # Upload individual sample GVCF before genotyping to a sample specific output directory
        vqsr_name = '{}{}.g.vcf'.format(sample.uuid, config.suffix)
        get_gvcf.addChildJobFn(output_file_job,
                               vqsr_name,
                               get_gvcf.rv(),
                               os.path.join(config.output_dir, sample.uuid),
                               s3_key_path=config.ssec,
                               disk=PromisedRequirement(lambda x: x.size, get_gvcf.rv()))

    # VQSR requires many variants in order to train a decent model. GATK recommends a minimum of
    # 30 exomes or one large WGS sample:
    # https://software.broadinstitute.org/gatk/documentation/article?id=3225

    filtered_vcfs = {}
    if config.joint_genotype:
        # Need to configure joint genotype in a separate function to resolve promises
        filtered_vcfs = group_bam_jobs.addFollowOnJobFn(joint_genotype_and_filter,
                                                        gvcfs,
                                                        config).rv()

    # If not joint genotyping, then iterate over cohort and genotype and filter individually.
    else:
        for uuid, gvcf_id in gvcfs.iteritems():
            filtered_vcfs[uuid] = group_bam_jobs.addFollowOnJobFn(genotype_and_filter,
                                                                  {uuid: gvcf_id},
                                                                  config).rv()

    job.addChild(group_bam_jobs)
    return filtered_vcfs
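Note: hc_disk above is a PromisedRequirement: the disk needed by HaplotypeCaller depends on the BAM and BAI that prepare_bam has not produced yet, so a callable over those promises is evaluated only once the values are known. A minimal sketch of the same mechanism (make_file and use_file are illustrative stand-ins, not toil-scripts functions):

from toil.job import Job, PromisedRequirement


def make_file(job):
    # stand-in for prepare_bam: write a file and return its FileStoreID
    path = job.fileStore.getLocalTempFile()
    with open(path, 'w') as f:
        f.write('x' * 1024)
    return job.fileStore.writeGlobalFile(path)


def use_file(job, file_id):
    # stand-in for gatk_haplotype_caller: runs with a disk requirement
    # that was computed from the promised file's actual size
    return file_id


def root(job):
    make_job = job.addChildJobFn(make_file)
    use_job = make_job.addFollowOnJobFn(
        use_file, make_job.rv(),
        # the lambda runs once make_file's return value (a FileID carrying a
        # .size attribute) is known, just like the hc_disk lambda above
        disk=PromisedRequirement(lambda f: 2 * f.size, make_job.rv()))
    return use_job.rv()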
Example #20
def run_chunked_augmenting(job,
                           context,
                           graph_id,
                           graph_basename,
                           gam_id,
                           gam_basename,
                           batch_input=None,
                           all_path_components=False,
                           chunk_paths=[],
                           connected_component_chunking=False,
                           output_format=None,
                           augment_gam=False,
                           min_augment_coverage=None,
                           expected_coverage=None,
                           min_mapq=None,
                           min_baseq=None,
                           to_outstore=False):
    """
    Run a chunking job (if desired), then augment the results
    """

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if all_path_components or connected_component_chunking or len(
                chunk_paths) > 1:
            child_job = Job()
            job.addChild(child_job)
            chunk_job = child_job.addChildJobFn(
                run_chunking,
                context,
                graph_id=graph_id,
                graph_basename=graph_basename,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                gam_id=gam_id,
                to_outstore=False,
                cores=context.config.chunk_cores,
                memory=context.config.chunk_mem,
                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(
                run_chunked_augmenting,
                context,
                graph_id=None,
                graph_basename=None,
                gam_id=None,
                gam_basename=None,
                batch_input=batch_input,
                all_path_components=all_path_components,
                chunk_paths=chunk_paths,
                connected_component_chunking=connected_component_chunking,
                output_format=output_format,
                augment_gam=augment_gam,
                min_augment_coverage=min_augment_coverage,
                expected_coverage=expected_coverage,
                min_mapq=min_mapq,
                min_baseq=min_baseq,
                to_outstore=to_outstore)
            return recurse_job.rv()
        else:
            # phony up chunk output for a single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the augmenting on each chunk
    assert batch_input

    augment_results = []
    for chunk_name, chunk_results in list(batch_input.items()):
        augment_job = job.addChildJobFn(
            run_augmenting,
            context,
            graph_id=chunk_results[0],
            graph_basename=chunk_results[1],
            gam_id=chunk_results[2],
            gam_basename=chunk_results[3],
            augment_gam=augment_gam,
            min_augment_coverage=min_augment_coverage,
            expected_coverage=expected_coverage,
            min_mapq=min_mapq,
            min_baseq=min_baseq,
            to_outstore=to_outstore,
            cores=context.config.augment_cores,
            memory=context.config.augment_mem,
            disk=context.config.augment_disk)

        augment_results.append((chunk_name, augment_job.rv()))

    return augment_results
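Note: run_chunked_augmenting passes its chunks around as a dict mapping a chunk name to a [graph_id, graph_basename, gam_id, gam_basename] list, which is also the shape it fakes up for the single-input case. An illustrative example of that structure (placeholder ids, not real FileStoreIDs):

# what the augment loop expects: one entry per chunk name
batch_input = {
    'chr1': ['graph_chr1_id', 'chunk_chr1.vg', 'gam_chr1_id', 'chunk_chr1.gam'],
    'chr2': ['graph_chr2_id', 'chunk_chr2.vg', 'gam_chr2_id', 'chunk_chr2.gam'],
}

# run_augmenting is then launched with positions 0-3 of each entry
for chunk_name, chunk_results in batch_input.items():
    graph_id, graph_basename, gam_id, gam_basename = chunk_results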