Example #1
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for
    chunks of 5Gb or at most 100 splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.
    """
    target_size = 5  # Gb
    target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = dd.get_align_split_size(data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                if os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set if we have files and their total size exceeds the target size
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                  int(1e6 * _pick_align_split_size(total_size, target_size,
                                                   target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data
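The helper `_pick_align_split_size` used above is not shown in this example. Below is a minimal sketch of the kind of calculation it might perform, assuming the goal is to keep the reads-per-Gb estimate fixed while capping the number of pieces at max_splits; it is illustrative only, not the pipeline's actual implementation.

def _pick_align_split_size(total_size, target_size, target_size_reads, max_splits):
    """Illustrative sketch: pick reads per chunk (in millions) for the given criteria.

    Assumes target_size_reads million reads correspond to roughly target_size Gb.
    """
    # If target_size Gb chunks would exceed max_splits pieces, enlarge the
    # chunk so the input still splits into at most max_splits pieces.
    if total_size // target_size > max_splits:
        piece_size = total_size / float(max_splits)
        return int(piece_size * (float(target_size_reads) / target_size))
    else:
        return target_size_reads

Under this sketch a 60Gb input keeps the 20 million read default, while a 600Gb input grows to roughly 24 million reads per chunk so the total stays near 100 pieces.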
Example #2
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files.
    """
    all_gzipped = all([not x or x.endswith(".gz") for x in in_files])
    if require_bgzip and all_gzipped:
        all_gzipped = all([not x or not _check_gzipped_input(x, data)[0] for x in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False
    return (all_gzipped and not needs_convert and not do_splitting and not objectstore.is_remote(in_files[0])
            and not needs_trim and not get_downsample_params(data))
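`_check_gzipped_input` is not shown here; when require_bgzip is set it presumably distinguishes BGZF (bgzip) output from plain gzip, since both pass the .gz suffix test on the first line of the function. A self-contained sketch of one way to make that distinction, using the 'BC' extra subfield that the BGZF format adds to each gzip block header (the function name is illustrative):

def _is_bgzipped(fname):
    """Illustrative sketch: detect a BGZF (bgzip) file via its gzip extra subfield.

    BGZF blocks are gzip members with the FEXTRA flag set and a 'BC' subfield
    in the extra field; plain gzip output lacks this marker.
    """
    with open(fname, "rb") as in_handle:
        header = in_handle.read(18)
    return (len(header) == 18 and header[:4] == b"\x1f\x8b\x08\x04"
            and header[12:14] == b"BC")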
Example #3
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files.
    """
    all_gzipped = all([not x or x.endswith(".gz") for x in in_files])
    if require_bgzip and all_gzipped:
        all_gzipped = all([not x or not _check_gzipped_input(x, data)[0] for x in in_files])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    needs_trim = dd.get_trim_ends(data)
    do_splitting = dd.get_align_split_size(data) is not False
    return (all_gzipped and not needs_convert and not do_splitting and
            not objectstore.is_remote(in_files[0]) and not needs_trim and not get_downsample_params(data))
Example #4
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    return checkpoints
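A hypothetical sketch of how such a checkpoints dictionary could gate later pipeline stages; the step names and the run_step callable are illustrative, not the pipeline's actual API:

def _run_enabled_steps(samples, run_step):
    """Illustrative only: dispatch stages based on the computed checkpoints."""
    checkpoints = _variant_checkpoints(samples)
    if checkpoints["align"]:
        samples = run_step("alignment", samples)
    if checkpoints["vc"]:
        samples = run_step("variant_calling", samples)
    if checkpoints["jointvc"]:
        samples = run_step("joint_calling", samples)
    if checkpoints["sv"]:
        samples = run_step("structural_variation", samples)
    if checkpoints["hla"]:
        samples = run_step("hla_typing", samples)
    return samples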
Example #5
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.
    """
    # If we have gzipped but not bgzipped inputs, add a fake index for CWL support.
    # Also skip bgzip indexing if we don't need alignment splitting.
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True)
                                              or dd.get_align_split_size(data) is False):
        for in_file in in_files:
            with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                with open(tx_gbi_file, "w") as out_handle:
                    out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
Example #6
def _prep_grabix_indexes(in_files, data):
    """Parallel preparation of grabix indexes for files.
    """
    # If we have gzipped but not bgzipped inputs, add a fake index for CWL support.
    # Also skip bgzip indexing if we don't need alignment splitting.
    if _ready_gzip_fastq(in_files, data) and (not _ready_gzip_fastq(in_files, data, require_bgzip=True) or
                                              dd.get_align_split_size(data) is False):
        for in_file in in_files:
            if not utils.file_exists(in_file + ".gbi"):
                with file_transaction(data, in_file + ".gbi") as tx_gbi_file:
                    with open(tx_gbi_file, "w") as out_handle:
                        out_handle.write("Not grabix indexed; index added for compatibility.\n")
    else:
        items = [[{"bgzip_file": x, "config": copy.deepcopy(data["config"])}] for x in in_files if x]
        run_multicore(_grabix_index, items, data["config"])
    return data
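`_grabix_index` is the worker dispatched through run_multicore above and is not shown here. A stripped-down sketch of what such a worker might do, assuming grabix writes its .gbi index alongside the bgzipped input (the real worker adds transactional and logging wrappers):

import os
import subprocess

def _grabix_index_sketch(args):
    """Illustrative sketch of a grabix indexing worker."""
    in_file = args["bgzip_file"]
    gbi_file = in_file + ".gbi"
    if not os.path.exists(gbi_file):
        # grabix index writes <input>.gbi next to the bgzipped input
        subprocess.check_call(["grabix", "index", in_file])
    return [gbi_file]

The deepcopy of the configuration for each worker item above is presumably there so parallel workers do not mutate a shared configuration dictionary.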
Example #7
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #8
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #9
def _set_align_split_size(data):
    """Set useful align_split_size, generating an estimate if it doesn't exist.

    We try to split on larger inputs and avoid too many pieces, aiming for
    chunks of 5Gb or at most 100 splits.

    The size estimate used in calculations is 20 million reads for ~5Gb.

    For UMI calculations we skip splitting since we're going to align and
    re-align after consensus.

    For CWL runs, we pick larger split sizes to avoid the overhead of staging each chunk.
    """
    if cwlutils.is_cwl_run(data):
        target_size = 20  # Gb
        target_size_reads = 80  # million reads
    else:
        target_size = 5  # Gb
        target_size_reads = 20  # million reads
    max_splits = 100  # Avoid too many pieces, causing merge memory problems
    val = dd.get_align_split_size(data)
    umi_consensus = dd.get_umi_consensus(data)
    if val is None:
        if not umi_consensus:
            total_size = 0  # Gb
            # Use original files if we might have reduced the size of our prepped files
            input_files = data.get("files_orig", []) if dd.get_save_diskspace(data) else data.get("files", [])
            for fname in input_files:
                if os.path.exists(fname):
                    total_size += os.path.getsize(fname) / (1024.0 * 1024.0 * 1024.0)
            # Only set if we have files and their total size exceeds the target size
            if total_size > target_size:
                data["config"]["algorithm"]["align_split_size"] = \
                  int(1e6 * _pick_align_split_size(total_size, target_size,
                                                   target_size_reads, max_splits))
    elif val:
        assert not umi_consensus, "Cannot set align_split_size to %s with UMI consensus specified" % val
    return data
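Note that the CWL targets keep the same reads-per-Gb estimate as the local defaults: 80 million reads over 20Gb is 4 million reads per Gb, matching 20 million reads over 5Gb, so only the chunk size changes, not the underlying size-to-reads conversion.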