Example #1
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to merge multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"][
            "variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(
                data)
        clean_cov_bed = clean_file(dd.get_coverage(data),
                                   data,
                                   prefix="cov-",
                                   simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning(
                "Can't run Seq2C without a svregions or variant_regions BED file"
            )
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
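clean_file and merge_overlaps above are bcbio helpers around BED sorting and interval merging. A minimal stand-alone sketch of the same sort-then-merge idea, assuming pybedtools is installed and using made-up file names:

import pybedtools

def sort_and_merge(in_bed, out_bed):
    # Sort, then collapse overlapping intervals into single regions,
    # which is the effect merge_overlaps relies on downstream.
    pybedtools.BedTool(in_bed).sort().merge().saveas(out_bed)
    return out_bed

sort_and_merge("input.bed", "input-merged.bed")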
Example #2
def _prep_bed(data, work_dir):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = clean_file(dd.get_variant_regions(data), data)

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError("BED file for Seq2C must be annotated with gene names, "
                             "however the input BED is 3-columns and we have no transcript "
                             "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)

    return ready_file
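The pybedtools calls used above (field_count, cut, filter) can be exercised in isolation; a toy sketch with made-up interval content:

import pybedtools as bt

# Two 6-column records; the second has no usable gene name.
bed = bt.BedTool("chr1\t100\t200\tGENE1\t0\t+\nchr1\t300\t400\t.\t0\t+\n",
                 from_string=True)
print(bed.field_count())   # 6 columns in this toy example
bed4 = bed.cut(range(4))   # keep chrom, start, end, name
named = bed4.filter(lambda x: x.name not in ["", ".", "-"]).saveas()
print(len(named))          # 1: the interval named "." is dropped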
Example #3
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to merge multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
Example #4
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None),
                        ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
                for attr in ("dist", "regions", "thresholds"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
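run_mosdepth is a bcbio wrapper; the vr_quantize tuple above maps onto mosdepth's --quantize option plus the MOSDEPTH_Q* labeling environment variables. A hedged sketch of the equivalent direct call, with placeholder paths and a placeholder minimum-depth cutoff of 4:

import os
import subprocess

# Breakpoints "0:1:4:" bin depth into [0,1), [1,4) and [4,inf);
# MOSDEPTH_Q0..Q2 name those bins in the quantized output.
env = dict(os.environ, MOSDEPTH_Q0="NO_COVERAGE",
           MOSDEPTH_Q1="LOW_COVERAGE", MOSDEPTH_Q2="CALLABLE")
subprocess.check_call(
    ["mosdepth", "--no-per-base", "--quantize", "0:1:4:",
     "--by", "variant_regions.bed", "sample", "sample.bam"],
    env=env)
# Writes sample.quantized.bed.gz with one label per region.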
Example #5
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}
Example #6
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter calls, retaining those with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = list(filter(lambda xs: xs[1] is not None and utils.file_exists(xs[1]),
                                 [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls]))
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)})
    return calls
Example #7
def get_base_cnv_regions(data,
                         work_dir,
                         genome_default="transcripts1e4",
                         include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data,
                                      genome_default,
                                      work_dir,
                                      include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions,
                                                      base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)
Example #8
def priority_total_coverage(data, out_dir):
    """
    Calculate coverage at ten depth thresholds in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(
            out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(
        data,
        "depth region",
        in_bam,
        cleaned_bed,
        depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
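sambamba.make_command is a bcbio helper; the command it assembles for "depth region" with those thresholds corresponds roughly to the following CLI call (a sketch with placeholder file names; the exact filter flags bcbio adds may differ):

import subprocess

cmd = ["sambamba", "depth", "region",
       "-L", "svprioritize-cleaned.bed", "-F", "not unmapped"]
for t in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    cmd += ["-T", str(t)]  # one -T per reported depth threshold
cmd += ["input.bam", "-o", "sample_priority_total_coverage.bed"]
subprocess.check_call(cmd)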
Example #9
def summary(items):
    cutoff = 4  # coverage for completeness

    out_dir = utils.safe_makedir(
        os.path.join(items[0]["dirs"]["work"], "coverage"))
    clean_bed = bedutils.clean_file(
        tz.get_in(["config", "algorithm", "coverage"], items[0]), items[0])
    bed_file = _uniquify_bed_names(clean_bed, out_dir, items[0])
    batch = _get_group_batch(items)

    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = (
                    "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} {bam_file} {bed_file} | "
                    "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    out = []
    for data in items:
        data["coverage"] = {"summary": out_file}
        out.append([data])
    return out
Example #10
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(
            os.path.join(toval_data["dirs"]["work"], "validate", sample,
                         caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError(
                "Multiple input files for validation: %s" %
                toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(
            toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(
                toval_data["config"]["algorithm"].get("validate_regions"),
                toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(
            rm_interval_file,
            toval_data,
            bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data),
                                         data.get("genome_build"), base_dir,
                                         data)
        rm_interval_file = (naming.handle_synonyms(
            rm_interval_file, dd.get_ref_file(data), data.get("genome_build"),
            base_dir, data) if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data,
                            "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir,
                                         toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file,
                                               rm_interval_file, base_dir,
                                               toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file,
                                                    rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
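vcfutils.vcf_has_variants gates the comparison above on non-empty inputs; a minimal stand-in that checks for any non-header record in a plain or bgzipped VCF could look like:

import gzip

def vcf_has_variants(vcf_file):
    # Any line outside the header counts as a variant record.
    opener = gzip.open if vcf_file.endswith(".gz") else open
    with opener(vcf_file, "rt") as in_handle:
        return any(not line.startswith("#") for line in in_handle)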
Example #11
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file
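The awk/sed pipeline above reshapes sambamba "depth base" output into a BED-like table and drops the header line. A pure-Python equivalent, assuming the usual column order (chrom, pos, coverage, ..., sample name in column 10):

def parse_depth_base(in_file, out_file):
    # Mirrors awk '{print $1"\t"$2"\t"$2"\t"$3"\t"$10}' | sed '1d'
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        next(in_handle)  # sed '1d': skip the header
        for line in in_handle:
            f = line.rstrip("\n").split("\t")
            out_handle.write("\t".join([f[0], f[1], f[1], f[2], f[9]]) + "\n")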
Example #12
def priority_total_coverage(data, out_dir):
    """
    Calculate coverage at ten depth thresholds in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example #13
def summary(items):
    cutoff = DEFAULT_COVERAGE_CUTOFF
    data = items[0]
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    combined_bed = bed.concat([coverage_bed, priority_bed])
    clean_bed = bedutils.clean_file(combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
    bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))

    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file) and utils.file_exists(bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                       "{bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    incomplete = incomplete_regions(out_file, batch, out_dir)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file,
                                "incomplete": incomplete}
        out.append([data])
    return out
Example #14
def summary(items):
    cutoff = 4  # coverage for completeness

    out_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "coverage"))
    clean_bed = bedutils.clean_file(tz.get_in(["config", "algorithm", "coverage"], items[0]),
                                    items[0])
    bed_file = _uniquify_bed_names(clean_bed, out_dir, items[0])
    batch = _get_group_batch(items)

    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} {bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    out = []
    for data in items:
        data["coverage"] = {"summary": out_file}
        out.append([data])
    return out
Example #15
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    import pybedtools
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    with shared.bedtools_tmpdir(data):
        input_beds = list(filter(lambda x: x is not None,
                                 [_create_bed(c, out_file, data) for c in calls]))
    if len(input_beds) > 0:
        size_beds = []
        for e_start, e_end in validate.EVENT_SIZES:
            base, ext = os.path.splitext(out_file)
            size_out_file = "%s-%s_%s%s" % (base, e_start, e_end, ext)
            if not utils.file_exists(size_out_file):
                with file_transaction(data, size_out_file) as tx_out_file:
                    with shared.bedtools_tmpdir(data):
                        all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                        with open(all_file, "w") as out_handle:
                            for line in fileinput.input(input_beds):
                                chrom, start, end = line.split()[:3]
                                size = int(end) - int(start)
                                if size >= e_start and size < e_end:
                                    out_handle.write(line)
                        pybedtools.BedTool(all_file).sort(stream=True)\
                          .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
            size_beds.append(size_out_file)
        out_file = bedutils.combine(size_beds, out_file, data["config"])
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
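The merge(c=4, o="distinct") call above collapses overlapping events while keeping the distinct caller names in column 4; demonstrated in isolation with made-up intervals:

import pybedtools

bed = pybedtools.BedTool(
    "chr1\t100\t500\tlumpy\nchr1\t400\t800\tmanta\n", from_string=True)
merged = bed.sort().merge(c=4, o="distinct", delim=",")
print(merged)  # chr1  100  800  lumpy,manta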
Example #16
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for the regions listed in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample, data=data)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data)
    return os.path.abspath(parse_file)
Example #17
def coverage(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for the regions listed in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed,  sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
Example #18
def priority_total_coverage(data, out_dir):
    """
    Calculate coverage at ten depth thresholds in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example #19
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for the regions listed in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(cutoffs | (extra_cutoffs or set()))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
Example #20
def coverage(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for the regions listed in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = (
                        "{sambamba} depth region -F \"not unmapped\" -t {cores} "
                        "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                        "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                        "chrom/chrom/' > {out_tx}")
                    do.run(
                        cmd.format(**locals()) % "-C 1000",
                        "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file),
                                            sample)
    return os.path.abspath(parse_file)
Example #21
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for the regions listed in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
Example #22
def variants(data, out_dir):
    """Variants QC metrics"""
    if not "variants" in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
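broad_runner.run_gatk resolves java and the GATK jar inside bcbio; the params list above corresponds roughly to this GATK3-style command line (all paths here are placeholders):

import subprocess

subprocess.check_call([
    "java", "-jar", "GenomeAnalysisTK.jar",
    "-T", "VariantAnnotator",
    "-R", "ref.fa", "-L", "cleaned.bed", "-I", "sample.bam",
    "-A", "GCContent", "-A", "Coverage",
    "--variant", "in.vcf", "--out", "sample_with-gc.vcf",
])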
Example #23
def variants(data, out_dir):
    """Variants QC metrics"""
    if not "variants" in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                            "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                            "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
Example #24
def _merge_by_batch(batch, fnames):
    """Merge all calls in a family into a single callset.
    """
    merge_dir = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    clean_dir = utils.safe_makedir(os.path.join(merge_dir, "clean"))
    merge_file = os.path.join(merge_dir, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(merge_file, fnames[0]):
        for fname in glob.glob(os.path.join(merge_dir, "%s-ensemble*" % batch)):
            os.remove(fname)
    ensemble.combine_bed_by_size(fnames, batch, merge_dir, {}, delim="&&")
    return bedutils.clean_file(merge_file, {}, bedprep_dir=clean_dir)
Example #25
def _merge_by_batch(batch, fnames):
    """Merge all calls in a family into a single callset.
    """
    merge_dir = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    clean_dir = utils.safe_makedir(os.path.join(merge_dir, "clean"))
    merge_file = os.path.join(merge_dir, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(merge_file, fnames[0]):
        for fname in glob.glob(os.path.join(merge_dir,
                                            "%s-ensemble*" % batch)):
            os.remove(fname)
    ensemble.combine_bed_by_size(fnames, batch, merge_dir, {}, delim="&&")
    return bedutils.clean_file(merge_file, {}, bedprep_dir=clean_dir)
Example #26
def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, "Did not find batch for samples: %s" % ",".join([dd.get_sample_name(x) for x in items])
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:  # no coverage or priority file has been set
            return items
        clean_bed = bedutils.clean_file(combined_bed, data) if len(combined_bed) > 0 else combined_bed
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)

        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import"
                    )
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        if bed_file:
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example #27
def calculate(bam_file, data, sv_bed):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"min": dd.get_coverage_depth_min(data)}
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        to_calculate = [("variant_regions", variant_regions,
                         vr_quantize, None, "coverage_perbase" in dd.get_tools_on(data)),
                        ("sv_regions", bedutils.clean_file(sv_bed, data, prefix="svregions-"),
                         None, None, False),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-"),
                         None, DEPTH_THRESHOLDS, False)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds, per_base in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds,
                                          per_base=per_base)
                for attr in ("dist", "regions", "thresholds", "per_base"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Example #28
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)
    config["config"] = {}
    config["dirs"] = {"work": os.getcwd()}
    groups = organize_vcf_reps(tz.get_in(["inputs", "vcfs"], config),
                               tz.get_in(["inputs", "namere"], config),
                               config["remap"])
    groups = add_bams(tz.get_in(["inputs", "bams"], config),
                      tz.get_in(["inputs", "namere"], config), groups,
                      config["remap"])
    bed_file = bedutils.clean_file(tz.get_in(["inputs", "regions"], config),
                                   config) + ".gz"

    groups = preprocess_vcfs(groups, bed_file, config["resources"],
                             config["annotations"], config.get("filters", []))
    #pprint.pprint(groups)
    incon = {}
    for name, fnames in groups.items():
        incon[name] = find_inconsistent(name, fnames["vcf"], bed_file,
                                        config["resources"])
    incon_check, totals, counts = [], [], []
    for name, info in sorted(incon.items(),
                             key=lambda x: np.mean(x[1]["counts"]),
                             reverse=True):
        totals.extend(info["totals"])
        counts.extend(info["counts"])
        print(name, info["counts"])
        if np.mean(info["counts"]) > 100:
            incon_check.extend(
                investigate_high_counts(info["summary"], info["vcf_files"]))
    totalm = np.median(totals)
    countm = np.median(counts)
    print "Overall discordants: %s-%s; %s-%s; %s / %s => %.1f%%" % (
        min(counts), max(counts), min(totals), max(totals), countm, totalm,
        countm * 100.0 / totalm)
    for to_check in incon_check:
        deconvolute_inconsistent(to_check, groups, bed_file)
    disc_bed, incon = identify_shared_discordants(incon)
    filtered_bed = merge_filtered(incon)
    # only use filtered since annotations supplied upstream now
    #ann_bed = annotate_disc_bed(disc_bed, filtered_bed, config["annotations"])
    #remain_disc = check_annotated_disc(ann_bed, incon, config["annotations"])
    ann_bed = annotate_disc_bed(disc_bed, filtered_bed, {})
    remain_disc = check_annotated_disc(ann_bed, incon, {})
    summarize_remaining_disc(incon)
    if len(remain_disc) < 10:
        identify_discordant_reasons(remain_disc, incon)

    calculate_annotation_overlap(bed_file, filtered_bed, config["annotations"])
Example #29
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"

        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #    annotated = cov.decorate_problem_regions(priority, problem_regions)

    return out
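
The metrics above are plain ratios over read counts. A minimal sketch with hypothetical numbers showing how the percentages relate to each other:

total_reads = 1000000       # hypothetical sequencer output
mapped = 950000             # aligned reads
mapped_unique = 900000      # aligned reads with duplicates removed
ontarget = 720000           # unique reads overlapping the target BED

metrics = {
    "Mapped_reads_pct": 100.0 * mapped / total_reads,                     # 95.0
    "Duplicates_pct": 100.0 * (mapped - mapped_unique) / mapped,          # ~5.3
    "Ontarget_pct": 100.0 * ontarget / mapped_unique,                     # 80.0
    "Offtarget_pct": 100.0 * (mapped_unique - ontarget) / mapped_unique,  # 20.0
    "Usable_pct": 100.0 * ontarget / total_reads,                         # 72.0
}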
Example #32
def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller,
                         detail_dir, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(
        d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    detail_handles = {}
    for stat in ["tp", "tp-baseline", "fn", "fp"]:
        detail_handles[stat] = open(os.path.join(detail_dir, "%s.vcf" % stat),
                                    "w")
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        calls_by_region[tuple(call[-3:])] = call

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = parts
        if truth is None:
            truth = cur_truth
        if _get_key(cur_truth) == _get_key(truth):
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions,
                                                   calls_by_region,
                                                   match_calls, truth_stats,
                                                   detail_handles)
            truth = cur_truth
            regions = [cur_region]
    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle
                     if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = _get_key(call)
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        detail_handles["fp"].write("\t".join(call))
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
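
The truth-matching loop above streams intersected rows and flushes a batch of callable regions whenever the truth key changes. A standalone sketch of that grouping idea, assuming (as the code implies) that `_callable_intersect` appends the overlapping region's chrom/start/end as the final three fields of each row:

def group_by_key(rows, get_key):
    """Yield (record, regions) batches from rows sorted by key.

    Mirrors the streaming loop above; unlike it, the final batch is
    flushed explicitly after the loop ends.
    """
    current, regions = None, []
    for parts in rows:
        if current is not None and get_key(parts) != get_key(current):
            yield current, regions
            current, regions = parts, []
        elif current is None:
            current = parts
        regions.append(tuple(parts[-3:]))
    if current is not None:
        yield current, regions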
Example #33
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = [x for x in [_create_bed(c, sample, work_dir, data) for c in calls]
                      if x is not None]
    if len(input_beds) > 0:
        out_file = _combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
            calls.append({"variantcaller": "ensemble",
                          "vrn_file": bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)})
    return calls
Example #34
def _subset_to_sample(bed_file, vcf_file, data):
    """Convert the global BED file into sample specific calls.
    """
    name = dd.get_sample_name(data)
    base, ext = os.path.splitext(bed_file)
    out_file = "%s-%s%s" % (base, name, ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            calls = _get_sample_calls(vcf_file, name)
            with open(bed_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        sample_line = _check_bed_call(line, calls)
                        if sample_line:
                            out_handle.write(sample_line)
    bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "bedprep"))
    return bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)
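
`_get_sample_calls` and `_check_bed_call` are not shown here; a hypothetical sketch of the filtering step, assuming the BED name column carries a call identifier that `_get_sample_calls` collected for this sample:

def _check_bed_call(line, calls):
    # hypothetical: keep the line only when its name field (column 4)
    # matches a call present for this sample
    parts = line.rstrip("\n").split("\t")
    if len(parts) > 3 and parts[3] in calls:
        return line
    return None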
Example #35
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)) and cwlutils.is_cwl_run(utils.to_single_data(data[0])):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"),
                                                        toval_data),
                                   toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data, prefix="validateregions-",
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data), data.get("genome_build"),
                                         base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(toval_data),
                                                   data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir, toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir, toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
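
The validation inputs come from the algorithm section of the sample configuration. A minimal sketch of the keys read above, with hypothetical paths:

data = {"config": {"algorithm": {
    "validate": "/path/to/truth_calls.vcf.gz",            # hypothetical reference material
    "validate_regions": "/path/to/truth_regions.bed.gz",  # hypothetical confident regions
    "validate_method": "rtg",  # or "rtg-squash-ploidy", "hap.py", "bcbio.variation"
}}}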
Example #36
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    if dd.get_coverage(data):
        bed_file = bedutils.merge_overlaps(dd.get_coverage(data), data)
        target_name = "coverage"
    elif dd.get_variant_regions_merged(data):
        bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed_file = None
        target_name = "wgs"

    bed_file = clean_file(bed_file, data, prefix="cov-", simple=True)
    offtarget_stats_file = calculate_offtarget_stats(bam_file, data, bed_file,
                                                     target_name)
    if offtarget_stats_file and utils.file_exists(offtarget_stats_file):
        with open(offtarget_stats_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        offtarget = stats.get('offtarget')
        mapped_unique = stats['mapped_unique']
        if offtarget and mapped_unique:
            out['offtarget_rate'] = 1.0 * offtarget / mapped_unique
        mapped = stats['mapped']
        if mapped:
            out['Duplicates'] = mapped - mapped_unique
            out['Duplicates_pct'] = 1.0 * (mapped - mapped_unique) / mapped
        total_reads = stats['total_reads']
        if total_reads:
            out['usable_rate'] = 1.0 * (mapped_unique - (offtarget or 0)) / total_reads

    avg_coverage = get_average_coverage(data, bam_file, bed_file, target_name)
    out['avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #    annotated = cov.decorate_problem_regions(priority, problem_regions)

    return out
Example #37
def get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4", include_gene_names=True):
    """Retrieve set of target regions for CNV analysis.

    Subsets to extended transcript regions for WGS experiments to avoid
    long runtimes.
    """
    cov_interval = dd.get_coverage_interval(data)
    base_regions = get_sv_bed(data, include_gene_names=include_gene_names)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions near genes as targets
        if cov_interval == "genome":
            base_regions = get_sv_bed(data, genome_default, work_dir, include_gene_names=include_gene_names)
            if base_regions:
                base_regions = remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    return bedutils.clean_file(base_regions, data)
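
A usage sketch summarizing the fallback order implemented above, assuming a populated bcbio `data` dictionary and work directory:

# 1. configured SV regions (get_sv_bed), if any
# 2. for whole-genome runs: binned transcript regions (genome_default),
#    with exclude regions removed
# 3. otherwise: the configured variant_regions
cnv_bed = get_base_cnv_regions(data, work_dir, genome_default="transcripts1e4")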
Example #38
def _calculate_sv_coverage_gatk(data, work_dir):
    """Calculate coverage in defined regions using GATK tools

    TODO: This does double calculations to get GATK4 compatible HDF read counts
    and then depth and gene annotations. Both are needed for creating heterogeneity inputs.
    Ideally replace with a single mosdepth coverage calculation, and create the GATK4 TSV format:

    CONTIG  START   END     COUNT
    chrM    1       1000    13268
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    # GATK compatible
    target_file = gatkcnv.collect_read_counts(data, work_dir)
    # heterogeneity compatible
    target_in = bedutils.clean_file(tz.get_in(["regions", "bins", "target"], data), data, bedprep_dir=work_dir)
    target_cov = coverage.run_mosdepth(data, "target-gatk", target_in)
    target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
    return target_file, target_cov_genes
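
The TODO above suggests replacing the double calculation with one mosdepth pass plus a format conversion. A minimal sketch of writing the GATK4-style TSV named in the docstring, assuming per-bin read counts are already in hand (GATK intervals are 1-based inclusive, so BED starts would need a +1 shift first):

def write_gatk4_counts_tsv(rows, out_file):
    """Write (contig, start, end, count) rows using the header from the docstring."""
    with open(out_file, "w") as out_handle:
        out_handle.write("CONTIG\tSTART\tEND\tCOUNT\n")
        for contig, start, end, count in rows:
            out_handle.write("%s\t%s\t%s\t%s\n" % (contig, start, end, count))

write_gatk4_counts_tsv([("chrM", 1, 1000, 13268)], "sample-gatk-counts.tsv")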
Example #40
def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller,
                         data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(
        d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        key = tuple(call[:5] + call[7:8])
        calls_by_region[tuple(call[-3:])] = key

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = tuple(parts[:5] + parts[7:8])
        if truth is None:
            truth = cur_truth
        if cur_truth == truth:
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions,
                                                   calls_by_region,
                                                   match_calls, truth_stats)
            truth = cur_truth
            regions = [cur_region]
    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle
                     if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = tuple(call[:5] + call[7:8])
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
Example #41
def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller, detail_dir, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    detail_handles = {}
    for stat in ["tp", "tp-baseline", "fn", "fp"]:
        detail_handles[stat] = open(os.path.join(detail_dir, "%s.vcf" % stat), "w")
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        calls_by_region[tuple(call[-3:])] = call

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = parts
        if truth is None:
            truth = cur_truth
        if _get_key(cur_truth) == _get_key(truth):
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions, calls_by_region, match_calls, truth_stats,
                                                   detail_handles)
            truth = cur_truth
            regions = [cur_region]
    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = _get_key(call)
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        detail_handles["fp"].write("\t".join(call))
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
Example #42
def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        clean_bed = bedutils.clean_file(
            combined_bed.fn,
            data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable),
                                      "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example #43
def _subset_to_sample(bed_file, data):
    """Convert the global BED file into sample specific calls.
    """
    name = dd.get_sample_name(data)
    base, ext = os.path.splitext(bed_file)
    out_file = "%s-%s%s" % (base, name, ext)
    if not utils.file_uptodate(out_file, bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(bed_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        sample_line = _check_bed_call(line, name)
                        if sample_line:
                            out_handle.write(sample_line)
    if utils.file_exists(out_file):
        bedprep_dir = utils.safe_makedir(
            os.path.join(os.path.dirname(out_file), "bedprep"))
        return bedutils.clean_file(out_file, data, bedprep_dir=bedprep_dir)
    else:
        return out_file
Example #44
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file
Example #45
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        cmdl = sambamba.make_command(data, "depth base", in_bam, cleaned_bed)
        parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
        cmdl += " | {parse_cmd} > {tx_out_file}"
        message = "Calculating base coverage of {bed_file} in {in_bam}"
        do.run(cmdl.format(**locals()), message.format(**locals()))
    return out_file
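
The awk/sed pipeline in both versions reshapes `sambamba depth base` output into a five-column, per-position BED-like file. A pure-Python equivalent of that parse, assuming the sambamba columns the awk program references (1: chromosome, 2: position, 3: coverage, 10: sample name):

def parse_depth_base(in_handle, out_handle):
    next(in_handle)  # sed '1d': drop the sambamba header row
    for line in in_handle:
        cols = line.rstrip("\n").split("\t")
        # awk '{print $1"\t"$2"\t"$2"\t"$3"\t"$10}'
        out_handle.write("\t".join([cols[0], cols[1], cols[1], cols[2], cols[9]]) + "\n")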
Example #46
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
Example #47
def _validate_caller_vcf(call_vcf, truth_vcf, callable_regions, svcaller, data):
    """Validate a caller VCF against truth within callable regions, returning stratified stats
    """
    stats = _calculate_comparison_stats(truth_vcf)
    callable_regions = bedutils.clean_file(callable_regions, data)
    callable_bed = pybedtools.BedTool(callable_regions).merge(d=stats["merge_size"]).saveas().fn

    match_calls = set([])
    truth_stats = {"tp": [], "fn": [], "fp": []}
    calls_by_region = {}
    call_vcf = slim_vcf(call_vcf, data)
    for call in _callable_intersect(call_vcf, callable_bed, data):
        key = tuple(call[:5] + call[7:8])
        calls_by_region[tuple(call[-3:])] = key

    truth = None
    regions = []
    for parts in _callable_intersect(truth_vcf, callable_bed, data):
        cur_region = tuple(parts[-3:])
        cur_truth = tuple(parts[:5] + parts[7:8])
        if truth is None:
            truth = cur_truth
        if cur_truth == truth:
            regions.append(cur_region)
        else:
            match_calls, truth_stats = _check_call(truth, regions, calls_by_region, match_calls, truth_stats)
            truth = cur_truth
            regions = [cur_region]
    with utils.open_gzipsafe(call_vcf) as in_handle:
        for call in (l.split("\t") for l in in_handle if not l.startswith("#")):
            start, end = _get_start_end(call)
            if end:
                key = tuple(call[:5] + call[7:8])
                if key not in match_calls:
                    call_info = _summarize_call(key)
                    if _event_passes(call_info, stats):
                        truth_stats["fp"].append(call_info)
    return _to_csv(truth_stats, stats, dd.get_sample_name(data), svcaller)
Example #48
def priority_coverage(data, out_dir):
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file
Example #49
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"], "validate", sample, caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" % toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(toval_data["config"]["algorithm"].get("validate_regions"),
                                                        toval_data),
                                   toval_data)
        rm_interval_file = bedutils.clean_file(rm_interval_file, toval_data,
                                               bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"], base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data["genome_build"], base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if not vcfutils.vcf_has_variants(vrn_file):
            # RTG can fail on totally empty files. Skip these since we have nothing.
            pass
        elif vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
Example #50
def summarize(calls, data, highdepth_beds):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = [x for x in [_create_bed(c, sample, work_dir, data) for c in calls]
                      if x is not None and utils.file_exists(x)]
    if len(input_beds) > 0:
        out_file = combine_bed_by_size(input_beds, sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            if len(highdepth_beds) > 0:
                limit_file = _limit_calls(filter_file, highdepth_beds, data)
            else:
                limit_file = filter_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(limit_file), "bedprep"))
            calls.append({"variantcaller": "sv-ensemble",
                          "vrn_file": bedutils.clean_file(limit_file, data, bedprep_dir=bedprep_dir)})
    return calls
Example #51
def run(bam_file, data, out_dir):
    """Run coverage QC analysis
    """
    out = dict()

    out_dir = utils.safe_makedir(out_dir)
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        merged_bed_file = bedutils.clean_file(dd.get_coverage_merged(data),
                                              data,
                                              prefix="cov-",
                                              simple=True)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    if target_name == "coverage":
        out_files = cov.coverage_region_detailed_stats(target_name,
                                                       merged_bed_file, data,
                                                       out_dir)
    else:
        out_files = []

    out['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data,
                                  samtools_stats_dir)["metrics"]

    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    out['Duplicates'] = dups = int(samtools_stats["Duplicates"])

    if total_reads:
        out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        out['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(data,
                                                         bam_file,
                                                         keep_dups=False)
    out['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(data,
                                                    bam_file,
                                                    keep_dups=False,
                                                    bed_file=merged_bed_file,
                                                    target_name=target_name)
        out["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data,
                    bam_file,
                    keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [x for x in indexcov_files if x and utils.file_exists(x)]
    out = {"metrics": out}
    if len(out_files) > 0:
        out["base"] = out_files[0]
        out["secondary"] = out_files[1:]
    return out
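
Padding the target BED by 200bp catches near-target reads before computing Ontarget_padded_pct. The helper's signature differs between the versions shown here; a minimal pybedtools sketch of the underlying operation, assuming a chromosome-sizes genome file for bounds clipping:

import pybedtools

def pad_bed(bed_file, padding, genome_file, out_file):
    # bedtools slop: extend both flanks by `padding`, clipped at contig ends,
    # then re-merge any intervals the padding caused to overlap
    pybedtools.BedTool(bed_file).slop(b=padding, g=genome_file).sort().merge().saveas(out_file)
    return out_file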
Example #52
def _merge_target_information(samples, metrics_dir):
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(
        dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name":
            dd.get_genome_build(data),
            "size":
            sum([
                c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                 data["config"])
            ]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(
            original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed":
            vcr_orig,
            "size":
            sum(
                len(x) for x in pybedtools.BedTool(
                    dd.get_variant_regions_merged(data))),
            "regions":
            pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed,
                                                data,
                                                prefix="cov-",
                                                simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"]
                             for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
Example #53
def _merge_target_information(samples, metrics_dir):
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
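
A sketch of the resulting target_info.yaml, with hypothetical values, matching the keys assembled above:

genome_info:
  name: GRCh37                          # hypothetical build
  size: 3100000000                      # hypothetical summed contig sizes
variants_regions_info:
  bed: /path/to/variant_regions.bed     # hypothetical
  size: 64000000                        # total bases in the merged regions
  regions: 210000                       # interval count in the cleaned BED
  genes: 19000                          # from annotate.count_genes
coverage_bed_info:                      # mirrors variants_regions_info when no separate coverage BED
  bed: /path/to/variant_regions.bed
coverage_interval: regional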
Example #54
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return

    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out

    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(
            samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data):
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data),
                                           data,
                                           prefix="cov-",
                                           simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(data,
                                                   bam_file,
                                                   keep_dups=False,
                                                   bed_file=merged_bed_file,
                                                   target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data,
                    bam_file,
                    keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file,
                                         target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))

    return out
Example #55
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return

    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out

    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file, target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir,
                                                              extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))

    return out
Example #56
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()

    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data,
                                                        bam_file,
                                                        keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data),
                                      data,
                                      prefix="cov-",
                                      simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"

        ontarget = sambamba.number_mapped_reads_on_target(
            data,
            merged_bed_file,
            bam_file,
            keep_dups=False,
            target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            padded_bed_file = bedutils.get_padded_bed_file(
                merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data,
                padded_bed_file,
                bam_file,
                keep_dups=False,
                target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file,
                                            target_name)
        out['Avg_coverage'] = avg_coverage

    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #    annotated = cov.decorate_problem_regions(priority, problem_regions)

    return out