Example #1
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = tz.get_in(["config", "algorithm", "variant_regions"], data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                out.append([data])
        assert len(out) == len(samples)
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions)
    return out
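
As the docstring describes, the batch-level callable set comes from intersecting every sample's non-callable (nblock) regions and taking the genome complement. A minimal pybedtools sketch of that idea, assuming hypothetical per-sample nblock BED files and a chromosome-sizes file:

import pybedtools

# Hypothetical inputs: per-sample non-callable (nblock) BED files.
nblock_files = ["sample1-nblocks.bed", "sample2-nblocks.bed"]

# A region is excluded batch-wide only if every sample marks it non-callable.
combined_nblocks = pybedtools.BedTool(nblock_files[0])
for fname in nblock_files[1:]:
    combined_nblocks = combined_nblocks.intersect(pybedtools.BedTool(fname))

# The callable set is the complement relative to the genome;
# "genome.chrom.sizes" is an assumed chromosome-sizes file.
callable_regions = combined_nblocks.complement(g="genome.chrom.sizes")
callable_regions.saveas("analysis_blocks.bed")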
Example #2
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    # Unpack nested lists of samples grouped together
    if isinstance(samples[0], (list, tuple)) and len(samples[0]) == 1:
        samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"],
                                        "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(
            global_analysis_file, samples):
        global_no_analysis_file = os.path.join(
            os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples,
                                                  require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(
                    batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"][
                        "callable_regions"] = analysis_file
                    data["config"]["algorithm"][
                        "non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"][
                        "callable_count"] = pybedtools.BedTool(
                            analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"][
                        "callable_count"] = pybedtools.BedTool(
                            vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"][
                        "highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        assert len(out) == len(samples)
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
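
This variant reads nested configuration with toolz's get_in, which walks a key path and returns None (or a supplied default) when any key along the path is missing, avoiding chained .get() calls. A quick self-contained illustration:

import toolz as tz

data = {"regions": {"highdepth": "sample-highdepth.bed"}}
# A present path returns the value; a broken path returns the default (None).
assert tz.get_in(["regions", "highdepth"], data) == "sample-highdepth.bed"
assert tz.get_in(["regions", "missing"], data) is None
assert tz.get_in(["regions", "missing"], data, default="none.bed") == "none.bed"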
Example #3
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    from bcbio.structural import cnvkit
    if all(not cnvkit.use_general_sv_bins(utils.to_single_data(x)) for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    out = []
    for cnv_group in _group_by_cnv_method(multi.group_by_batch(items, False)):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            target_bed, anti_bed = cnvkit.targets_w_bins(cnv_group.region_file, cnv_group.access_file, size_calc_fn,
                                                         cnv_group.work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed}
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
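
MemoizedSizes exists so the expensive depth-based bin-size estimate runs once per CNV group rather than once per sample. A sketch of that caching pattern with the actual estimate stubbed out; the internals shown are an assumption, not bcbio's implementation:

class MemoizedSizes:
    def __init__(self, region_file, items):
        self.region_file = region_file
        self.items = items
        self._sizes = None

    def get_target_antitarget_bin_sizes(self):
        # Compute once, then reuse for every sample in the group.
        if self._sizes is None:
            self._sizes = self._estimate_from_depth()
        return self._sizes

    def _estimate_from_depth(self):
        # Hypothetical stand-in for the real depth-based estimate.
        return (267, 50000)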
Example #4
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    calcfns = {"cnvkit": _calculate_sv_bins_cnvkit, "gatk-cnv": _calculate_sv_bins_gatk}
    from bcbio.structural import cnvkit
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out = []
    for i, cnv_group in enumerate(_group_by_cnv_method(multi.group_by_batch(items, False))):
        size_calc_fn = MemoizedSizes(cnv_group.region_file, cnv_group.items).get_target_antitarget_bin_sizes
        for data in cnv_group.items:
            if cnvkit.use_general_sv_bins(data):
                target_bed, anti_bed, gcannotated_tsv = calcfns[cnvkit.bin_approach(data)](data, cnv_group,
                                                                                           size_calc_fn)
                if not data.get("regions"):
                    data["regions"] = {}
                data["regions"]["bins"] = {"target": target_bed, "antitarget": anti_bed, "group": str(i),
                                           "gcannotated": gcannotated_tsv}
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
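
The calcfns dictionary is a plain dispatch table keyed on the configured bin approach, so supporting another caller means adding one entry rather than another branch. The shape of that pattern in isolation, with hypothetical stand-in functions and config key:

def _bins_cnvkit(data):
    return ("cnvkit-target.bed", "cnvkit-antitarget.bed")

def _bins_gatk(data):
    return ("gatk-target.bed", "gatk-antitarget.bed")

calcfns = {"cnvkit": _bins_cnvkit, "gatk-cnv": _bins_gatk}
data = {"sv_bin_approach": "gatk-cnv"}  # hypothetical config key
target_bed, anti_bed = calcfns[data["sv_bin_approach"]](data)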
Example #5
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples).items():
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                out.append([data])
        assert len(out) == len(samples)
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions)
    return out
Example #6
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    samples = utils.unpack_worlds(samples)
    samples = cwlutils.unpack_tarballs(samples, samples[0])
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        # Ensure output order matches input order, consistency for CWL-based runs
        assert len(out) == len(samples)
        sample_indexes = {dd.get_sample_name(d): i for i, d in enumerate(samples)}
        def by_input_index(xs):
            return sample_indexes[dd.get_sample_name(xs[0])]
        out.sort(key=by_input_index)
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
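
The sort at the end restores the input ordering after batch grouping has shuffled the samples, which CWL runners depend on. The same index-map pattern on toy data, with plain dicts standing in for bcbio sample objects and dd.get_sample_name:

samples = [{"name": "s1"}, {"name": "s2"}, {"name": "s3"}]
out = [[{"name": "s3"}], [{"name": "s1"}], [{"name": "s2"}]]

# Map each input sample name to its position, then sort outputs by it.
sample_indexes = {d["name"]: i for i, d in enumerate(samples)}
out.sort(key=lambda xs: sample_indexes[xs[0]["name"]])
assert [xs[0]["name"] for xs in out] == ["s1", "s2", "s3"]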
Example #7
def combine_sample_regions(*samples):
    """Create batch-level sets of callable regions for multi-sample calling.

    Intersects all non-callable (nblock) regions from all samples in a batch,
    producing a global set of callable regions.
    """
    import pybedtools
    samples = [x[0] for x in samples]
    # back compatibility -- global file for entire sample set
    global_analysis_file = os.path.join(samples[0]["dirs"]["work"], "analysis_blocks.bed")
    if utils.file_exists(global_analysis_file) and not _needs_region_update(global_analysis_file, samples):
        global_no_analysis_file = os.path.join(os.path.dirname(global_analysis_file), "noanalysis_blocks.bed")
    else:
        global_analysis_file = None
    out = []
    analysis_files = []
    batches = []
    with shared.bedtools_tmpdir(samples[0]):
        for batch, items in vmulti.group_by_batch(samples, require_bam=False).items():
            batches.append(items)
            if global_analysis_file:
                analysis_file, no_analysis_file = global_analysis_file, global_no_analysis_file
            else:
                analysis_file, no_analysis_file = _combine_sample_regions_batch(batch, items)
            for data in items:
                vr_file = dd.get_variant_regions(data)
                if analysis_file:
                    analysis_files.append(analysis_file)
                    data["config"]["algorithm"]["callable_regions"] = analysis_file
                    data["config"]["algorithm"]["non_callable_regions"] = no_analysis_file
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(analysis_file).count()
                elif vr_file:
                    data["config"]["algorithm"]["callable_count"] = pybedtools.BedTool(vr_file).count()
                highdepth_bed = tz.get_in(["regions", "highdepth"], data)
                if highdepth_bed:
                    data["config"]["algorithm"]["highdepth_regions"] = highdepth_bed
                # attach a representative sample for calculating callable region
                if not data.get("work_bam"):
                    for x in items:
                        if x.get("work_bam"):
                            data["work_bam_callable"] = x["work_bam"]
                out.append([data])
        assert len(out) == len(samples)
        if len(analysis_files) > 0:
            final_regions = pybedtools.BedTool(analysis_files[0])
            _analysis_block_stats(final_regions, batches[0])
    return out
Example #8
def calculate_sv_bins(*items):
    """Determine bin sizes and regions to use for samples.

    Unified approach to prepare regional bins for coverage calculations across
    multiple CNV callers. Splits into target and antitarget regions allowing
    callers to take advantage of both. Provides consistent target/anti-target
    bin sizes across batches.

    Uses callable_regions as the access BED file and mosdepth regions in
    variant_regions to estimate depth for bin sizes.
    """
    from bcbio.structural import cnvkit
    if all(not cnvkit.use_general_sv_bins(utils.to_single_data(x))
           for x in items):
        return items
    items = [utils.to_single_data(x) for x in items]
    out = []
    for batch, batch_items in multi.group_by_batch(items, False).items():
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(items[0]), "structural", "bins",
                         batch))
        access_file = tz.get_in(["config", "algorithm", "callable_regions"],
                                batch_items[0])
        cnv_file = get_base_cnv_regions(batch_items[0],
                                        work_dir,
                                        "transcripts100",
                                        include_gene_names=False)
        target_bin, anti_bin = _get_target_antitarget_bin_sizes(
            cnv_file, items)
        for data in batch_items:
            target_bed, anti_bed = cnvkit.targets_w_bins(
                cnv_file, access_file, target_bin, anti_bin, work_dir, data)
            if not data.get("regions"):
                data["regions"] = {}
            data["regions"]["bins"] = {
                "target": target_bed,
                "antitarget": anti_bed
            }
            out.append([data])
    if not len(out) == len(items):
        raise AssertionError("Inconsistent samples in and out of SV bin calculation:\nout: %s\nin : %s" %
                             (sorted([dd.get_sample_name(utils.to_single_data(x)) for x in out]),
                              sorted([dd.get_sample_name(x) for x in items])))
    return out
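
Throughout these examples the entry points normalize their inputs with utils.to_single_data, since upstream parallelization wraps each sample dictionary in a single-item list. A hedged sketch of what such an unwrap helper does; the body is an assumption about its behavior, not bcbio's code:

def to_single_data(x):
    # Unwrap nested single-item containers: [[data]] -> data, [data] -> data.
    while isinstance(x, (list, tuple)) and len(x) == 1:
        x = x[0]
    return x

assert to_single_data([[{"a": 1}]]) == {"a": 1}
assert to_single_data({"a": 1}) == {"a": 1}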