Exemplo n.º 1
0
def run(bam_file, data, out_dir):
    """Run Qualimap bamqc for a sample and return its result files.

    Output is written to a directory named after the sample inside ``out_dir``.
    The analysis is skipped when ``genome_results.txt`` or the PDF report is
    already present there. Afterwards ``genome_results.txt`` is copied into
    ``out_dir`` and returned as the "base" file, together with the "secondary"
    files located by ``_find_qualimap_secondary_files``.
    """
    sample_name = dd.get_sample_name(data)
    # MultiQC (for parsing additional data) picks the sample name from the directory:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_name = "qualimapReport.pdf"
    have_results = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_name)))
    if not have_results:
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Use the downsampled BAM when downsampling produced one.
            downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if downsampled:
                bam_file = downsampled
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_name)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            java_setup = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            template = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                        "--skip-duplicated --skip-dup-mode 0 "
                        "-nt {num_cores} {options}")
            fields = {"export": java_setup, "qualimap": qualimap, "bam_file": bam_file,
                      "tx_results_dir": tx_results_dir, "num_cores": num_cores, "options": options}
            # The genome build is only consulted when no explicit human alias is configured.
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                template += " -gd {species}"
                fields["species"] = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                template += " -gd {species}"
                fields["species"] = "MOUSE"
            # Prefer the coverage BED; fall back to the merged variant regions.
            coverage_bed = dd.get_coverage(data)
            if coverage_bed in [None, False, "None"]:
                regions = dd.get_variant_regions_merged(data)
            else:
                regions = coverage_bed
            if regions:
                merged_regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                fields["bed6_regions"] = _bed_to_bed6(merged_regions, out_dir)
                template += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(template.format(**fields), "Qualimap: %s" % sample_name, env=bcbio_env)
            # Rewrite the "bam file" entry of the results so it carries the sample name.
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            fix_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(fix_cmd, "Fix Name Qualimap for {}".format(sample_name))
    # The output folder (results_dir) must be named after the sample (see above). To keep the
    # name after upload, the base QC file (results_file) is copied into the root directory (out_dir).
    # NOTE(review): if only the PDF report exists, results_file may be missing here -- confirm.
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file,
            "secondary": secondary}
Exemplo n.º 2
0
def assign_interval(data):
    """Classify a sample's coverage interval when none is configured yet.

    The class is chosen from the fraction of the genome covered and, for
    targeted inputs, from the off-target fraction:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads

    The result is stored under ``data["config"]["algorithm"]["coverage_interval"]``
    and ``data`` is returned. An existing setting is left untouched.
    """
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions_merged(data)
    callable_file = dd.get_sample_callable(data)
    # Measure the variant regions when present, otherwise the callable regions.
    callable_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    total_size = sum(contig.size for contig in contigs)
    genome_cov_pct = callable_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > GENOME_COV_THRESH:
        cov_interval = "genome"
    elif not vrs:
        cov_interval = "regional"
    else:
        # vrs is known to be set here, so it is the region file for the off-target count.
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        offtarget_pct = _count_offtarget(data, in_bam, vrs, "variant_regions")
        cov_interval = "regional" if offtarget_pct > OFFTARGET_THRESH else "amplicon"
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Exemplo n.º 3
0
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Estimates average coverage from the read count and read sizes of the first
    input file and multiplies it by the configured downsample multiplier.

    Returns None if we're not doing downsampling: no reads or multiplier, the
    covered fraction is at or below the genome threshold, the coverage interval
    is not genome-like, or the estimated coverage is below the minimum of 10.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    min_coverage_for_downsampling = 10
    multiplier = dd.get_maxcov_downsample(data)
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if not (num_reads and multiplier and multiplier > 0):
        return None
    vrs = dd.get_variant_regions_merged(data)
    total_size = sum(contig.size for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    if vrs:
        callable_size = pybedtools.BedTool(vrs).total_coverage()
        genome_cov_pct = callable_size / float(total_size)
    else:
        # Without variant regions the whole genome counts as callable.
        callable_size = total_size
        genome_cov_pct = 1.0
    if (not genome_cov_pct > coverage.GENOME_COV_THRESH
          or dd.get_coverage_interval(data) not in ["genome", None, False]):
        return None
    # Read size is the count-weighted mean over the (count, size) pairs reported for the input.
    size_pairs = [(int(count), int(size)) for count, size in bwa.fastq_size_output(fastq_file, 5000)]
    total_counts = sum(count for count, _ in size_pairs)
    total_sizes = sum(size * count for count, size in size_pairs)
    read_size = float(total_sizes) / float(total_counts)
    avg_cov = float(num_reads * read_size) / callable_size
    if avg_cov >= min_coverage_for_downsampling:
        return int(avg_cov * multiplier)
    return None
Exemplo n.º 4
0
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.

    Returns a tuple of (BED file path, flag). The flag is False for small
    regions written out directly as CALLABLE/NO_COVERAGE.
    NOTE(review): the flag presumably tells the caller whether coverage still
    needs to be calculated -- confirm against the caller.
    """
    variant_regions = dd.get_variant_regions_merged(data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    # Coordinates must be handled before the path check: os.path.isfile raises
    # TypeError for a list/tuple, which previously made this branch unreachable.
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
0
def calculate_offtarget(bam_file, ref_file, data):
    """Write a YAML file of mapped and off-target read counts for a BAM file.

    Off-target regions are the reference regions minus the merged variant
    regions. Counts come from samtools/bedtools shell pipelines.

    Returns the path of the stats file, or None when the sample has no merged
    variant regions. An existing stats file is reused.
    """
    vrs_file = dd.get_variant_regions_merged(data)
    if not vrs_file:
        return None
    out_file = "%s-offtarget-stats.yaml" % os.path.splitext(bam_file)[0]
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        offtarget_regions = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        reference_bed = get_ref_bedtool(ref_file, data["config"])
        offtarget_bed = reference_bed.subtract(pybedtools.BedTool(vrs_file), nonamecheck=True)
        offtarget_bed.saveas(offtarget_regions)
        # Count reads that fall entirely (-f 1.0) inside the off-target regions.
        count_cmd = (
            "samtools view -u {bam_file} -L {offtarget_regions} | "
            "bedtools intersect -abam - -b {offtarget_regions} -f 1.0 -bed | wc -l"
        ).format(bam_file=bam_file, offtarget_regions=offtarget_regions)
        offtarget_count = int(subprocess.check_output(count_cmd, shell=True))
        # Sum the third idxstats column over all reference sequences.
        mapped_cmd = "samtools idxstats {bam_file} | awk '{{s+=$3}} END {{print s}}'".format(
            bam_file=bam_file)
        mapped_count = int(subprocess.check_output(mapped_cmd, shell=True))
        counts = {"mapped": mapped_count, "offtarget": offtarget_count}
        with open(tx_out_file, "w") as out_handle:
            yaml.safe_dump(counts, out_handle, allow_unicode=False, default_flow_style=False)
    return out_file
Exemplo n.º 6
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of the callable BED file subset to the variant regions and
    a dictionary mapping each calculated target name to its depth output files.
    The dictionary is empty when the callable file is already up to date.
    """
    min_depth = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % sample_name)
    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % min_depth, ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        # (target name, region BED, quantize settings, thresholds)
        targets = [("variant_regions", variant_regions, vr_quantize, None),
                   ("sv_regions", regions.get_sv_bed(data), None, None),
                   ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)]
        for target_name, region_bed, quantize, thresholds in targets:
            if not region_bed:
                continue
            depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
            reported = [(attr, getattr(depth_info, attr, None)) for attr in ("dist", "regions", "thresholds")]
            # Keep only the outputs mosdepth actually produced for this target.
            depth_files[target_name] = {attr: val for attr, val in reported if val}
            if target_name == "variant_regions":
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Exemplo n.º 7
0
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.

    Returns a tuple of (BED file path, flag). The flag is False for small
    regions written out directly as CALLABLE/NO_COVERAGE.
    NOTE(review): the flag presumably tells the caller whether coverage still
    needs to be calculated -- confirm against the caller.
    """
    variant_regions = dd.get_variant_regions_merged(data)
    ready_region = shared.subset_variant_regions(variant_regions, region,
                                                 out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    is_small_region = (variant_regions is None and region_size is not None
                       and region_size < dd.get_callable_min_size(data))
    if is_small_region:
        coverage_str = "CALLABLE" if realign.has_aligned_reads(
            dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file,
                                            ref_file, region, data)
        return custom_file, False
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    # Coordinates must be handled before the path check: os.path.isfile raises
    # TypeError for a list/tuple, which previously made this branch unreachable.
    if isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e),
                           from_string=True).saveas(custom_file)
        return custom_file, True
    if os.path.isfile(ready_region):
        return ready_region, True
    custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file,
                                        ref_file, region, data)
    return custom_file, variant_regions is None
Exemplo n.º 8
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    Output is written to a directory named after the sample inside ``out_dir``.
    The analysis is skipped when the HTML or PDF report already exists there.
    The BAM file is symlinked as ``<sample name>.bam`` in ``out_dir`` and the
    link is what Qualimap reads.

    Returns an empty dictionary; parsing of the Qualimap report is disabled.
    """
    sample_name = dd.get_sample_name(data)
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, sample_name + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        cmd = (
            "unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
            "--skip-duplicated --skip-dup-mode 0 "
            "-nt {num_cores} --java-mem-size={max_mem} {options}")
        fields = {"export": utils.local_path_export(), "qualimap": qualimap,
                  "fixed_bam_fname": fixed_bam_fname, "results_dir": results_dir,
                  "num_cores": num_cores, "max_mem": max_mem, "options": options}
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        else:
            # get_in needs a sequence of keys. A bare string is walked character by
            # character and always yields the default, so mouse was never detected.
            genome_build = tz.get_in(("genome_build",), data, "") or ""
            if genome_build.startswith(("mm", "GRCm")):
                species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
            fields["species"] = species
        # Prefer the merged coverage BED; fall back to the merged variant regions.
        regions = bedutils.merge_overlaps(
            dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            fields["bed6_regions"] = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**fields), "Qualimap: %s" % sample_name)

    # Metrics parsing (_parse_qualimap_metrics on report_file) is currently disabled.
    return dict()
Exemplo n.º 9
0
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Estimates average coverage from the read count and read sizes of the first
    input file and multiplies it by the configured downsample multiplier.

    Returns None if we're not doing downsampling: no downsample parameters or
    reads, the covered fraction is at or below the genome threshold, the
    coverage interval is not genome-like, or the estimated coverage is below
    the configured minimum.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    fastq_file = data["files"][0]
    params = alignprep.get_downsample_params(data)
    if not params:
        return None
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if not num_reads:
        return None
    vrs = dd.get_variant_regions_merged(data)
    total_size = sum(contig.size for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    if vrs:
        callable_size = pybedtools.BedTool(vrs).total_coverage()
        genome_cov_pct = callable_size / float(total_size)
    else:
        # Without variant regions the whole genome counts as callable.
        callable_size = total_size
        genome_cov_pct = 1.0
    if (not genome_cov_pct > coverage.GENOME_COV_THRESH
          or dd.get_coverage_interval(data) not in ["genome", None, False]):
        return None
    # Read size is the count-weighted mean over the (count, size) pairs reported for the input.
    size_pairs = [(int(count), int(size)) for count, size in bwa.fastq_size_output(fastq_file, 5000)]
    total_counts = sum(count for count, _ in size_pairs)
    total_sizes = sum(size * count for count, size in size_pairs)
    read_size = float(total_sizes) / float(total_counts)
    avg_cov = float(num_reads * read_size) / callable_size
    if avg_cov >= params["min_coverage_for_downsampling"]:
        return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
Exemplo n.º 10
0
def run(bam_file, data, out_dir):
    """Run Qualimap bamqc for a sample and return its result files.

    Output is written to a directory named after the sample inside ``out_dir``.
    The analysis is skipped when ``genome_results.txt`` or the PDF report is
    already present there. Afterwards ``genome_results.txt`` is copied into
    ``out_dir`` and returned as the "base" file, together with the "secondary"
    files located by ``_find_qualimap_secondary_files``.
    """
    sample_name = dd.get_sample_name(data)
    # MultiQC (for parsing additional data) picks the sample name from the directory:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, sample_name)
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    utils.safe_makedir(results_dir)
    pdf_name = "qualimapReport.pdf"
    have_results = (utils.file_exists(results_file)
                    or utils.file_exists(os.path.join(results_dir, pdf_name)))
    if not have_results:
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            # Use the downsampled BAM when downsampling produced one.
            downsampled = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            if downsampled:
                bam_file = downsampled
        if "PDF" in options:
            options = "%s -outfile %s" % (options, pdf_name)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            java_setup = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            template = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                        "--skip-duplicated --skip-dup-mode 0 "
                        "-nt {num_cores} {options}")
            fields = {"export": java_setup, "qualimap": qualimap, "bam_file": bam_file,
                      "tx_results_dir": tx_results_dir, "num_cores": num_cores, "options": options}
            # The genome build is only consulted when no explicit human alias is configured.
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                template += " -gd {species}"
                fields["species"] = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                template += " -gd {species}"
                fields["species"] = "MOUSE"
            # Prefer the coverage BED; fall back to the merged variant regions.
            coverage_bed = dd.get_coverage(data)
            if coverage_bed in [None, False, "None"]:
                regions = dd.get_variant_regions_merged(data)
            else:
                regions = coverage_bed
            if regions:
                merged_regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                fields["bed6_regions"] = _bed_to_bed6(merged_regions, out_dir)
                template += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(template.format(**fields), "Qualimap: %s" % sample_name, env=bcbio_env)
            # Rewrite the "bam file" entry of the results so it carries the sample name.
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            fix_cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (sample_name, tx_results_file)
            do.run(fix_cmd, "Fix Name Qualimap for {}".format(sample_name))
    # The output folder (results_dir) must be named after the sample (see above). To keep the
    # name after upload, the base QC file (results_file) is copied into the root directory (out_dir).
    # NOTE(review): if only the PDF report exists, results_file may be missing here -- confirm.
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    secondary = _find_qualimap_secondary_files(results_dir, base_results_file)
    return {"base": base_results_file,
            "secondary": secondary}
Exemplo n.º 11
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of the callable BED file subset to the variant regions and
    a dictionary mapping each calculated target name to its depth output files.
    The dictionary is empty when the callable file is already up to date.
    """
    min_depth = dd.get_coverage_depth_min(data)
    variant_regions = dd.get_variant_regions_merged(data)
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % sample_name)
    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        vr_quantize = ("0:1:%s:" % min_depth, ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        # (target name, region BED, quantize settings, thresholds)
        targets = [("variant_regions", variant_regions, vr_quantize, None),
                   ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                   ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        for target_name, region_bed, quantize, thresholds in targets:
            if not region_bed:
                continue
            depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
            reported = [(attr, getattr(depth_info, attr, None)) for attr in ("dist", "regions", "thresholds")]
            # Keep only the outputs mosdepth actually produced for this target.
            depth_files[target_name] = {attr: val for attr, val in reported if val}
            if target_name == "variant_regions":
                callable_file = depth_info.quantize
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Exemplo n.º 12
0
def assign_interval(data):
    """Classify a sample's coverage interval when none is configured yet.

    The class is chosen from the fraction of the genome covered and, for
    targeted inputs, from the off-target fraction:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads

    The result is stored under ``data["config"]["algorithm"]["coverage_interval"]``
    and ``data`` is returned. An existing setting is left untouched.
    """
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions_merged(data)
    callable_file = dd.get_sample_callable(data)
    # Measure the variant regions when present, otherwise the callable regions.
    callable_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    total_size = sum(contig.size for contig in contigs)
    genome_cov_pct = callable_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > GENOME_COV_THRESH:
        cov_interval = "genome"
    elif not vrs:
        cov_interval = "regional"
    else:
        # vrs is known to be set here, so it is the region file for the off-target count.
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        offtarget_pct = _count_offtarget(data, in_bam, vrs, "variant_regions")
        cov_interval = "regional" if offtarget_pct > OFFTARGET_THRESH else "amplicon"
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Exemplo n.º 13
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of the depth BED path, the callable BED subset to the
    variant regions, the high depth regions extracted from that subset and
    the average coverage over the variant regions. goleft is only run when
    the callable file is not up to date relative to the BAM file.
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    out_base = os.path.join(align_dir, "%s-coverage" % sample_name)
    depth_file = out_base + ".depth.bed"
    callable_file = out_base + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd.extend(["--maxmeandepth", str(int(max_depth))])
        with file_transaction(data, depth_file) as tx_depth_file, \
                utils.chdir(os.path.dirname(tx_depth_file)):
            tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
            tx_prefix = tx_depth_file.replace(".depth.bed", "")
            # Build a reference index from the BAM file and pass its base name as the reference.
            bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
            bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
            cmd.extend(["--reference", bam_ref_file])
            cmd.extend(["--prefix", tx_prefix, bam_file])
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd, "Calculate coverage: %s" % sample_name, env=bcbio_env)
            # Only the depth file is moved by the transaction; move the callable file by hand.
            shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    highdepth = _extract_highdepth(final_callable, data)
    return depth_file, final_callable, highdepth, variant_regions_avg_cov
Exemplo n.º 14
0
 def _get_variant_regions(data, merged=False):
     """Return the variant regions for a sample, falling back to its callable regions.

     ``merged`` was read by the body without being defined in the visible
     signature; it is now a keyword argument. When true, the pre-computed
     merged variant regions are preferred, and otherwise the selected regions
     are merged with ``merge_overlaps``.
     NOTE(review): the default of False is an assumption -- confirm against callers.
     """
     out = dd.get_variant_regions(data) or dd.get_sample_callable(data)
     if merged:
         merged_out = dd.get_variant_regions_merged(data)
         if merged_out:
             out = merged_out
         else:
             out = merge_overlaps(out, data)
     return out
Exemplo n.º 15
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of the depth BED path, the callable BED path, the high
    depth regions extracted from the callable file and the average coverage
    over the variant regions. goleft is only run when the depth file is not
    up to date relative to the BAM file.
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    sample_name = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample_name))
    out_base = os.path.join(align_dir, "%s-coverage" % sample_name)
    out_file = out_base + ".depth.bed"
    callable_file = out_base + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(
        data, bam_file, variant_regions, "variant_regions", file_prefix=out_base)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth",
               "--windowsize", str(params["window_size"]),
               "--q", "1",
               "--mincov", str(params["min"]),
               "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)),
               "--stats", "--ordered"]
        if variant_regions:
            # Split the variant regions into windows and restrict the calculation to them.
            window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
            if not utils.file_uptodate(window_file, bam_file):
                with file_transaction(data, window_file) as tx_window_file:
                    windows = pybedtools.BedTool().window_maker(
                        w=params["parallel_window_size"], b=pybedtools.BedTool(variant_regions))
                    windows.saveas(tx_window_file)
            cmd.extend(["--bed", window_file])
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd.extend(["--maxmeandepth", str(int(max_depth))])
        with file_transaction(data, out_file) as tx_out_file, \
                utils.chdir(os.path.dirname(tx_out_file)):
            tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
            tx_prefix = tx_out_file.replace(".depth.bed", "")
            cmd.extend(["--prefix", tx_prefix, bam_file])
            do.run(cmd, "Calculate coverage: %s" % sample_name)
            # Only the depth file is moved by the transaction; move the callable file by hand.
            shutil.move(tx_callable_file, callable_file)
    highdepth = _extract_highdepth(callable_file, data)
    return out_file, callable_file, highdepth, variant_regions_avg_cov
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Reads the off-target statistics YAML (when one is available) to derive
    off-target, duplicate and usable-read rates, adds the average coverage
    of the selected target and finally calls the priority/region coverage
    functions in ``cov``.

    Returns a dictionary mapping metric names to values.
    """
    out = dict()
    # Target selection: explicit coverage BED first, then merged variant
    # regions, otherwise no BED file at all ("wgs").
    if dd.get_coverage(data):
        bed_file = bedutils.merge_overlaps(dd.get_coverage(data), data)
        target_name = "coverage"
    elif dd.get_variant_regions_merged(data):
        bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed_file = None
        target_name = "wgs"

    bed_file = clean_file(bed_file, data, prefix="cov-", simple=True)
    offtarget_stats_file = calculate_offtarget_stats(bam_file, data, bed_file,
                                                     target_name)
    if offtarget_stats_file and utils.file_exists(offtarget_stats_file):
        with open(offtarget_stats_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        offtarget = stats.get('offtarget')
        mapped_unique = stats['mapped_unique']
        if offtarget and mapped_unique:
            out['offtarget_rate'] = 1.0 * offtarget / mapped_unique
        mapped = stats['mapped']
        if mapped:
            out['Duplicates'] = mapped - mapped_unique
            out['Duplicates_pct'] = 1.0 * (mapped - mapped_unique) / mapped
        total_reads = stats['total_reads']
        if total_reads:
            # 'offtarget' is optional in the stats file (read with .get());
            # count a missing value as zero instead of failing on int - None.
            out['usable_rate'] = 1.0 * (mapped_unique -
                                        (offtarget or 0)) / total_reads

    avg_coverage = get_average_coverage(data, bam_file, bed_file, target_name)
    out['avg_coverage'] = avg_coverage

    # The return values of these calls are not used by this function.
    cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # Decorating the priority coverage with problem regions is disabled;
    # re-enable with annotations from internally installed problem region
    # directory.

    return out
Exemplo n.º 17
0
 def _get_variant_regions(data):
     """Pick the BED regions for a sample: variant regions, else callable regions.

     ``merged`` is not defined in this block and is taken from an outer scope.
     When it is set and variant regions are configured, the merged variant
     regions are returned, falling back to ``merge_overlaps`` if no merged
     file is available.
     """
     regions = dd.get_variant_regions(data) or dd.get_sample_callable(data)
     # Only need to merge for variant region inputs, not callable BED regions which don't overlap
     if not (merged and dd.get_variant_regions(data)):
         return regions
     return dd.get_variant_regions_merged(data) or merge_overlaps(regions, data)
Exemplo n.º 18
0
 def _get_variant_regions(data):
     """Return variant regions for a sample, or its callable regions if unset.

     ``merged`` comes from an outer scope (it is not defined in this block).
     If it is truthy and variant regions are configured, the result is the
     merged variant regions, computed via ``merge_overlaps`` when missing.
     """
     selected = dd.get_variant_regions(data) or dd.get_sample_callable(data)
     # Only need to merge for variant region inputs, not callable BED regions which don't overlap
     wants_merge = merged and dd.get_variant_regions(data)
     if wants_merge:
         premerged = dd.get_variant_regions_merged(data)
         selected = premerged if premerged else merge_overlaps(selected, data)
     return selected
Exemplo n.º 19
0
def _prep_real_counts(bam_file, data, samtools_stats):
    """Collect genome/target size, read counts and read length as Preseq_* metrics.

    With a target BED (coverage regions, or variant/callable regions when the
    coverage interval is not "genome") counts are restricted to that BED;
    otherwise the values are taken from ``samtools_stats``.

    Returns a dictionary with the ``Preseq_*`` keys.
    """
    if dd.get_coverage(data) and dd.get_coverage(data) != "None":
        target_name = "coverage"
        bed = dd.get_coverage_merged(data)
    elif dd.get_coverage_interval(data) != "genome":
        target_name = "variant_regions"
        bed = (dd.get_variant_regions_merged(data)
               or dd.get_sample_callable(data))
    else:
        target_name = "genome"
        bed = None

    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"),
                            True)
    metrics = {}

    if not bed:  # WGS
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        metrics["Preseq_genome_size"] = sum(contig.size for contig in contigs)
        metrics["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        metrics["Preseq_read_length"] = int(
            samtools_stats["Average_read_length"])
        if dedupped:
            duplicates = int(samtools_stats["Duplicates"])
            metrics["Preseq_unique_count"] = (
                metrics["Preseq_read_count"] - duplicates)
        return metrics

    target_size = pybedtools.BedTool(bed).total_coverage()
    metrics["Preseq_genome_size"] = target_size
    metrics["Preseq_read_count"] = readstats.number_of_mapped_reads(
        data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
    ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data,
                                                  bam_file)
    if dedupped:
        metrics["Preseq_unique_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=bed,
            target_name=target_name)

    # Counting average on-target alignment length, based on the equation:
    #    avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
    total_alignments = (metrics.get("Preseq_unique_count")
                        or metrics["Preseq_read_count"])
    metrics["Preseq_read_length"] = (
        ontrg_unique_depth * target_size // total_alignments)
    return metrics
Exemplo n.º 20
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.

    The qualimap ``bamqc`` command is executed only when neither the HTML nor
    the PDF report is present in the sample results directory. Unless
    ``qualimap_full`` is listed in ``tools_on``, the BAM file is downsampled
    before the run.

    Returns an empty dictionary; parsing of the qualimap report is disabled.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)

        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)

        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        # get_in expects a sequence of keys. A bare string is walked character
        # by character, so the lookup always fell back to the default and the
        # mouse genome builds were never recognised.
        genome_build = tz.get_in(("genome_build",), data, "") or ""
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif genome_build.startswith(("mm", "GRCm")):
            species = "MOUSE"
        if species:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        bed6_regions = None
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        # Explicit arguments instead of **locals() so that renaming a local
        # cannot silently break the command template.
        cmd = cmd.format(export=export, qualimap=qualimap,
                         fixed_bam_fname=fixed_bam_fname,
                         results_dir=results_dir, num_cores=num_cores,
                         max_mem=max_mem, options=options, species=species,
                         bed6_regions=bed6_regions)
        do.run(cmd, "Qualimap: %s" % dd.get_sample_name(data))

    # Parsing via _parse_qualimap_metrics(report_file, data) is disabled.
    return dict()
Exemplo n.º 21
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Counts total, mapped, unique and on-target reads with ``sambamba``,
    derives the percentage metrics from them, adds the average coverage of
    the target and finally calls the priority/region coverage functions in
    ``cov``.

    Returns a dictionary mapping metric names to values.
    """
    out = dict()

    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        # Report the de-duplicated count here, not the total mapped count.
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        # Target selection: explicit coverage BED first, otherwise the merged
        # variant regions.
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"

        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            # Same on-target count against the target padded by 200.
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage

    # The return values of these calls are not used by this function.
    cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # Decorating the priority coverage with problem regions is disabled;
    # re-enable with annotations from internally installed problem region
    # directory.

    return out
Exemplo n.º 22
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns a tuple of (depth BED file, callable BED file, high depth regions
    as produced by ``_extract_highdepth``, average coverage over the variant
    regions).
    """
    params = {
        "window_size": 5000,
        "parallel_window_size": 1e5,
        "min": dd.get_coverage_depth_min(data),
        "high_multiplier": 20,
    }
    sample_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    prefix = os.path.join(sample_dir, "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")

    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        depth_cmd = [
            "goleft", "depth",
            "--windowsize", str(params["window_size"]),
            "--q", "1",
            "--mincov", str(params["min"]),
            "--reference", ref_file,
            "--processes", str(dd.get_num_cores(data)),
            "--stats", "--ordered",
        ]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_window_file:
                if not variant_regions:
                    # No variant regions: write one interval per contig next
                    # to the transactional window file and window over that.
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_window_file)[0]
                    with open(variant_regions, "w") as genome_handle:
                        for contig in shared.get_noalt_contigs(data):
                            genome_handle.write("%s\t%s\t%s\n" % (contig.name, 0, contig.size))
                windows = pybedtools.BedTool().window_maker(
                    w=params["parallel_window_size"],
                    b=pybedtools.BedTool(variant_regions))
                windows.saveas(tx_window_file)
        depth_cmd.extend(["--bed", window_file])
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            depth_cmd.extend(["--maxmeandepth", str(int(max_depth))])
        with file_transaction(data, out_file) as tx_out_file, \
                utils.chdir(os.path.dirname(tx_out_file)):
            tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
            prefix = tx_out_file.replace(".depth.bed", "")
            depth_cmd.extend(["--prefix", prefix, bam_file])
            bcbio_env = utils.get_bcbio_env()
            msg = "Calculate coverage: %s" % dd.get_sample_name(data)
            do.run(depth_cmd, msg, env=bcbio_env)
            # The callable BED is written next to the transactional depth
            # file, so it is moved into place explicitly.
            shutil.move(tx_callable_file, callable_file)

    highdepth = _extract_highdepth(callable_file, data)
    return out_file, callable_file, highdepth, variant_regions_avg_cov
Exemplo n.º 23
0
def calculate_offtarget(bam_file, ref_file, data):
    """Generate file of offtarget read counts for inputs with variant regions.

    Returns the path of the YAML stats file, or None when the sample has no
    merged variant regions. An existing stats file is reused as is.
    """
    vrs_file = dd.get_variant_regions_merged(data)
    if not vrs_file:
        return None

    out_file = "%s-offtarget-stats.yaml" % os.path.splitext(bam_file)[0]
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        # Off-target regions are the reference BED minus the variant regions.
        offtarget_regions = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        reference_bed = get_ref_bedtool(ref_file, data["config"])
        reference_bed.subtract(pybedtools.BedTool(vrs_file), nonamecheck=True).saveas(offtarget_regions)

        count_cmd = ("samtools view -u {bam_file} -L {offtarget_regions} | "
                     "bedtools intersect -abam - -b {offtarget_regions} -f 1.0 -bed | wc -l")
        count_cmd = count_cmd.format(bam_file=bam_file, offtarget_regions=offtarget_regions)
        offtarget_count = int(subprocess.check_output(count_cmd, shell=True))

        mapped_cmd = "samtools idxstats {bam_file} | awk '{{s+=$3}} END {{print s}}'"
        mapped_cmd = mapped_cmd.format(bam_file=bam_file)
        mapped_count = int(subprocess.check_output(mapped_cmd, shell=True))

        counts = {"mapped": mapped_count, "offtarget": offtarget_count}
        with open(tx_out_file, "w") as out_handle:
            yaml.safe_dump(counts, out_handle,
                           allow_unicode=False, default_flow_style=False)
    return out_file
Exemplo n.º 24
0
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.

    Returns the (possibly updated) sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    is_processable = (vmulti.bam_needs_processing(data) and bam_file
                      and bam_file.endswith(".bam"))
    if not is_processable:
        return [[data]]

    ref_file = dd.get_ref_file(data)
    sample_dir = os.path.join(dd.get_work_dir(data), "align",
                              dd.get_sample_name(data))
    out_dir = utils.safe_makedir(sample_dir)
    # Work on a link to the BAM inside the sample alignment directory.
    bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
    if not utils.file_exists(bam_file_ready):
        utils.symlink_plus(bam_file, bam_file_ready)
    bam.index(bam_file_ready, data["config"])

    covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
    callable_region_bed, nblock_bed, callable_bed = callable.block_regions(
        covinfo.callable, bam_file_ready, ref_file, data)
    vrs_file = dd.get_variant_regions_merged(data)
    offtarget_stats = callable.calculate_offtarget_stats(
        bam_file_ready, data, vrs_file, "variant_regions")

    regions = {}
    regions["nblock"] = nblock_bed
    regions["callable"] = callable_bed
    regions["highdepth"] = covinfo.highdepth
    regions["sample_callable"] = covinfo.callable
    regions["coverage_bed"] = covinfo.coverage
    regions["avg_coverage"] = covinfo.avg_coverage
    regions["offtarget_stats"] = offtarget_stats
    data["regions"] = regions

    data = coverage.assign_interval(data)
    # Without configured variant regions, use the callable regions instead.
    if (os.path.exists(callable_region_bed)
            and not data["config"]["algorithm"].get("variant_regions")):
        data["config"]["algorithm"]["variant_regions"] = callable_region_bed
        data = bedutils.clean_inputs(data)
    data = _recal_no_markduplicates(data)
    return [[data]]
Exemplo n.º 25
0
def _prep_real_counts(bam_file, data, samtools_stats):
    """Collect genome/target size, read counts and read length as Preseq_* metrics.

    With a target BED (coverage regions, or merged variant regions when the
    coverage interval is not "genome") counts are restricted to that BED;
    otherwise the values are taken from ``samtools_stats``.

    Returns a dictionary with the ``Preseq_*`` keys.
    """
    if dd.get_coverage(data) and dd.get_coverage(data) != "None":
        target_name, bed = "coverage", dd.get_coverage_merged(data)
    elif dd.get_coverage_interval(data) != "genome":
        target_name, bed = "variant_regions", dd.get_variant_regions_merged(data)
    else:
        target_name, bed = "genome", None

    dedupped = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    counts = {}

    if not bed:  # WGS
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        counts["Preseq_genome_size"] = sum(contig.size for contig in contigs)
        counts["Preseq_read_count"] = int(samtools_stats["Total_reads"])
        counts["Preseq_read_length"] = int(samtools_stats["Average_read_length"])
        if dedupped:
            duplicates = int(samtools_stats["Duplicates"])
            counts["Preseq_unique_count"] = counts["Preseq_read_count"] - duplicates
        return counts

    target_size = pybedtools.BedTool(bed).total_coverage()
    counts["Preseq_genome_size"] = target_size
    counts["Preseq_read_count"] = readstats.number_of_mapped_reads(
        data, bam_file, keep_dups=True, bed_file=bed, target_name=target_name)
    ontrg_unique_depth = cov.get_average_coverage(target_name, bed, data, bam_file)
    if dedupped:
        counts["Preseq_unique_count"] = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=bed, target_name=target_name)

    # Counting average on-target alignment length, based on the equation:
    #    avg depth ~~ num (unique) on-target alignments * avg on-target aln length / target size
    total_alignments = counts.get("Preseq_unique_count") or counts["Preseq_read_count"]
    counts["Preseq_read_length"] = ontrg_unique_depth * target_size // total_alignments
    return counts
Exemplo n.º 26
0
def run(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Returns a dictionary with the collected values under "metrics" and, when
    any output files exist, the first one under "base" and the rest under
    "secondary".
    """
    metrics = dict()

    out_dir = utils.safe_makedir(out_dir)
    # Target selection: coverage BED, then merged variant regions for
    # non-genome intervals, otherwise the whole genome (no BED).
    if dd.get_coverage(data) and dd.get_coverage(data) != "None":
        target_name = "coverage"
        merged_bed_file = bedutils.clean_file(
            dd.get_coverage_merged(data), data, prefix="cov-", simple=True)
    elif dd.get_coverage_interval(data) != "genome":
        target_name = "variant_regions"
        merged_bed_file = dd.get_variant_regions_merged(data)
    else:
        target_name = "genome"
        merged_bed_file = None

    avg_depth = cov.get_average_coverage(target_name, merged_bed_file, data)
    # Detailed per-region statistics are only produced for coverage targets.
    out_files = (cov.coverage_region_detailed_stats(
        target_name, merged_bed_file, data, out_dir)
        if target_name == "coverage" else [])

    metrics['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)["metrics"]

    total_reads = int(samtools_stats["Total_reads"])
    metrics["Total_reads"] = total_reads
    mapped = int(samtools_stats["Mapped_reads"])
    metrics["Mapped_reads"] = mapped
    metrics["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    dups = int(samtools_stats["Duplicates"])
    metrics['Duplicates'] = dups

    if total_reads:
        metrics["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        metrics['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False)
    metrics['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = readstats.number_of_mapped_reads(
            data, bam_file, keep_dups=False,
            bed_file=merged_bed_file, target_name=target_name)
        metrics["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            offtarget = mapped_unique - ontarget
            metrics["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            metrics['Offtarget_pct'] = 100.0 * offtarget / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(
                    out_dir, merged_bed_file, 200, data)
                ontarget_padded = readstats.number_of_mapped_reads(
                    data, bam_file, keep_dups=False,
                    bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                metrics["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            metrics['Usable_pct'] = 100.0 * ontarget / total_reads

    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [fname for fname in indexcov_files
                  if fname and utils.file_exists(fname)]

    result = {"metrics": metrics}
    if out_files:
        result["base"] = out_files[0]
        result["secondary"] = out_files[1:]
    return result
Exemplo n.º 27
0
def _merge_target_information(samples, metrics_dir):
    """Write genome and target region information shared by all samples.

    The information is dumped as YAML to ``target_info.yaml`` inside
    ``metrics_dir``; nothing is done when that file already exists.

    Returns ``samples`` unchanged.
    """
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = {dd.get_genome_build(sample) for sample in samples}
    coverage_beds = {dd.get_coverage(sample) for sample in samples}
    original_variant_regions = {
        dd.get_variant_regions_orig(sample) for sample in samples}

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum(contig.size for contig in contigs),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    has_shared_regions = (len(original_variant_regions) == 1
                          and next(iter(original_variant_regions)) is not None)
    if has_shared_regions:
        vcr_orig = next(iter(original_variant_regions))
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        merged_regions = pybedtools.BedTool(
            dd.get_variant_regions_merged(data))
        regions_info = {"bed": vcr_orig}
        regions_info["size"] = sum(len(region) for region in merged_regions)
        regions_info["regions"] = pybedtools.BedTool(vcr_clean).count()
        info["variants_regions_info"] = regions_info
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            regions_info["genes"] = gene_num
    else:
        info["variants_regions_info"] = {"bed": "callable regions"}

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = next(iter(coverage_beds))
        reuse_variant_info = (cov_bed in [None, "None"]
                              or (vcr_orig and vcr_orig == cov_bed))
        if reuse_variant_info:
            info["coverage_bed_info"] = info["variants_regions_info"]
        else:
            clean_bed = bedutils.clean_file(
                cov_bed, data, prefix="cov-", simple=True)
            coverage_info = {"bed": cov_bed}
            coverage_info["size"] = pybedtools.BedTool(cov_bed).total_coverage()
            coverage_info["regions"] = pybedtools.BedTool(clean_bed).count()
            info["coverage_bed_info"] = coverage_info
            gene_num = annotate.count_genes(clean_bed, data)
            if gene_num is not None:
                coverage_info["genes"] = gene_num

    coverage_intervals = {
        sample["config"]["algorithm"]["coverage_interval"]
        for sample in samples}
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = next(iter(coverage_intervals))

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
Exemplo n.º 28
0
def _merge_target_information(samples, metrics_dir):
    """Summarise genome, variant regions and coverage BED shared by the samples.

    The summary is written as YAML to ``<metrics_dir>/target_info.yaml``. If
    that file is already present the function returns immediately.

    Returns ``samples`` unchanged.
    """
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = {dd.get_genome_build(s) for s in samples}
    coverage_beds = {dd.get_coverage(s) for s in samples}
    original_variant_regions = {dd.get_variant_regions_orig(s) for s in samples}

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        genome_size = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
        info["genome_info"] = {"name": dd.get_genome_build(data), "size": genome_size}

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    shared_vcr = list(original_variant_regions)[0] if len(original_variant_regions) == 1 else None
    if shared_vcr is not None:
        vcr_orig = shared_vcr
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        merged_size = sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data)))
        region_count = pybedtools.BedTool(vcr_clean).count()
        info["variants_regions_info"] = {"bed": vcr_orig, "size": merged_size, "regions": region_count}
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {"bed": "callable regions"}

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed in [None, "None"]:
            # No coverage BED configured: report the variant regions instead.
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif vcr_orig and vcr_orig == cov_bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        else:
            clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
            bed_size = pybedtools.BedTool(cov_bed).total_coverage()
            bed_regions = pybedtools.BedTool(clean_bed).count()
            info["coverage_bed_info"] = {"bed": cov_bed, "size": bed_size, "regions": bed_regions}
            gene_num = annotate.count_genes(clean_bed, data)
            if gene_num is not None:
                info["coverage_bed_info"]["genes"] = gene_num

    coverage_intervals = {s["config"]["algorithm"]["coverage_interval"] for s in samples}
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
Exemplo n.º 29
0
def _merge_target_information(samples):
    """Write genome and target region information shared by all samples.

    The information is dumped as YAML to ``metrics/target_info.yaml``
    (relative to the current directory); nothing is done when that file
    already exists.

    Returns ``samples`` unchanged.
    """
    out_file = os.path.join("metrics", "target_info.yaml")
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    variant_regions = set(dd.get_variant_regions(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across samples
    if len(genomes) == 1:
        contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum(c.size for c in contigs),
        }

    # Reporting in MultiQC only if the target is the same across samples
    vcr = None
    if len(variant_regions) == 1:
        shared_regions = next(iter(variant_regions))
        if shared_regions is None:
            # No sample defines variant regions: the set is {None}, which has
            # length one, so this case has to be detected explicitly rather
            # than by testing for an empty set.
            info["variants_regions_info"] = {"bed": None}
        else:
            vcr = dd.get_variant_regions_orig(data)
            vcr_merged = dd.get_variant_regions_merged(data)
            vcr_ann = annotate.add_genes(vcr, data)
            gene_names = set(r.name for r in pybedtools.BedTool(vcr_ann)
                             if r.name and r.name != ".")
            info["variants_regions_info"] = {
                # Store the shared path itself, not the set that wraps it.
                "bed": shared_regions,
                "size": sum(len(x) for x in pybedtools.BedTool(vcr_merged)),
                "regions": pybedtools.BedTool(vcr).count(),
                "genes": len(gene_names),
            }

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        bed = dd.get_coverage(data)
        if vcr and vcr == bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif bed:
            ann_bed = annotate.add_genes(bed, data)
            gene_names = set(r.name for r in pybedtools.BedTool(ann_bed)
                             if r.name and r.name != ".")
            info["coverage_bed_info"] = {
                "bed": bed,
                "size": pybedtools.BedTool(bed).total_coverage(),
                "regions": pybedtools.BedTool(bed).count(),
                "genes": len(gene_names),
            }

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
Exemplo n.º 30
0
def run(bam_file, data, out_dir):
    """Run coverage QC analysis.

    Returns a dictionary with the collected values under "metrics" and, when
    any output files exist, the first one under "base" and the rest under
    "secondary".
    """
    metrics = dict()

    # Target selection: coverage BED, then merged variant regions for
    # non-genome intervals, otherwise the whole genome (no BED).
    if dd.get_coverage(data) and dd.get_coverage(data) != "None":
        target_name = "coverage"
        merged_bed_file = dd.get_coverage_merged(data)
    elif dd.get_coverage_interval(data) != "genome":
        target_name = "variant_regions"
        merged_bed_file = dd.get_variant_regions_merged(data)
    else:
        target_name = "genome"
        merged_bed_file = None

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    metrics['Avg_coverage'] = avg_depth

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, 'samtools')
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    total_reads = int(samtools_stats["Total_reads"])
    metrics["Total_reads"] = total_reads
    mapped = int(samtools_stats["Mapped_reads"])
    metrics["Mapped_reads"] = mapped
    metrics["Mapped_paired_reads"] = int(samtools_stats["Mapped_paired_reads"])
    dups = int(samtools_stats["Duplicates"])
    metrics['Duplicates'] = dups

    if total_reads:
        metrics["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped:
        metrics['Duplicates_pct'] = 100.0 * dups / mapped

    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
    metrics['Mapped_unique_reads'] = mapped_unique

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        metrics["Ontarget_unique_reads"] = ontarget
        if mapped_unique:
            offtarget = mapped_unique - ontarget
            metrics["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            metrics['Offtarget_pct'] = 100.0 * offtarget / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified
                # the padded statistic makes only sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(out_dir, merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                metrics["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            metrics['Usable_pct'] = 100.0 * ontarget / total_reads

    # One extra cutoff at 80% of the average depth, never below 1.
    extra_cutoffs = {max(1, int(avg_depth * 0.8))}
    out_files = cov.coverage_region_detailed_stats(data, out_dir, extra_cutoffs=extra_cutoffs)
    for ext in ["coverage.bed", "summary.bed"]:
        matches = glob.glob(os.path.join(out_dir, "*%s" % ext))
        out_files += [fname for fname in matches if os.path.isfile(fname)]
    indexcov_files = _goleft_indexcov(bam_file, data, out_dir)
    out_files += [fname for fname in indexcov_files if fname and utils.file_exists(fname)]

    result = {"metrics": metrics}
    if out_files:
        result["base"] = out_files[0]
        result["secondary"] = out_files[1:]
    return result
Exemplo n.º 31
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Gather read-count and on-target coverage QC metrics for a BAM file.

    Read totals are taken from ``bcbio.qc.samtools.run``, unique and on-target
    read counts from ``sambamba.number_of_mapped_reads`` and the average depth
    from ``cov.get_average_coverage``.  ``cov.coverage_region_detailed_stats``
    is also invoked for ``out_dir``; its return value is not used here.

    Returns ``None`` when the samtools statistics lack ``Total_reads``, report
    zero total reads, or lack ``Mapped_reads_raw`` / ``Mapped_reads``.
    Otherwise returns a dict of metrics; when no reads are mapped it contains
    only ``Total_reads``, ``Mapped_reads`` and ``Mapped_reads_pct``.
    """
    # NOTE(review): joining out_dir, "..", out_dir looks unusual -- confirm the
    # intended location of the samtools statistics directory.
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    stats = samtools.run(bam_file, data, samtools_stats_dir)

    metrics = {}

    # Guard clauses: without usable read totals nothing can be reported.
    if "Total_reads" not in stats:
        return None
    total_reads = int(stats["Total_reads"])
    metrics["Total_reads"] = total_reads
    if total_reads == 0:
        return None
    if not ("Mapped_reads_raw" in stats and "Mapped_reads" in stats):
        return None

    mapped = int(stats["Mapped_reads"])
    metrics["Mapped_reads"] = mapped
    metrics["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if mapped == 0:
        return metrics

    # The duplicate rate is expressed relative to the raw mapped read count.
    dups = 0
    if "Duplicates" in stats:
        dups = int(stats["Duplicates"])
        metrics["Duplicates"] = dups
        metrics["Duplicates_pct"] = 100.0 * dups / int(stats["Mapped_reads_raw"])

    # Pick target regions: an explicit coverage BED wins, then the merged
    # variant regions for non-genome runs, otherwise no BED at all.
    target_bed, target_name = None, "genome"
    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cleaned_bed = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        target_bed = bedutils.merge_overlaps(cleaned_bed, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        target_bed = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"

    # Whole genome runs derive the unique count arithmetically (and do not
    # report it); other runs count duplicate-free mapped reads directly.
    if dd.get_coverage_interval(data) != "genome":
        unique_mapped = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        metrics["Mapped_unique_reads"] = unique_mapped
    else:
        unique_mapped = mapped - dups

    if target_bed:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=target_bed, target_name=target_name)
        if unique_mapped:
            metrics["Ontarget_unique_reads"] = ontarget
            metrics["Ontarget_pct"] = 100.0 * ontarget / unique_mapped
            metrics["Offtarget_pct"] = 100.0 * (unique_mapped - ontarget) / unique_mapped
            if dd.get_coverage_interval(data) != "genome":
                # The padded statistic only makes sense for exomes and panels,
                # so it is skipped for WGS even when a coverage file is given.
                padded_bed = bedutils.get_padded_bed_file(target_bed, 200, data)
                padded_hits = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed,
                    target_name=target_name + "_padded")
                metrics["Ontarget_padded_pct"] = 100.0 * padded_hits / unique_mapped
        # total_reads is guaranteed non-zero here by the early return above.
        metrics["Usable_pct"] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, target_bed, target_name)
    metrics["Avg_coverage"] = avg_depth

    # Request one extra cutoff at 80% of the average depth, never below 1.
    depth_cutoff = max(1, int(avg_depth * 0.8))
    cov.coverage_region_detailed_stats(data, out_dir, extra_cutoffs={depth_cutoff})

    return metrics
Exemplo n.º 32
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Compute read-level and on-target coverage QC metrics for one BAM file.

    Totals are read from ``bcbio.qc.samtools.run``; duplicate-free and
    on-target counts come from ``sambamba.number_of_mapped_reads``; the average
    depth comes from ``cov.get_average_coverage``.  The function also calls
    ``cov.coverage_region_detailed_stats`` for ``out_dir`` and discards the
    result.

    Returns ``None`` if the samtools statistics have no ``Total_reads`` entry,
    a zero ``Total_reads``, or are missing ``Mapped_reads_raw`` or
    ``Mapped_reads``.  Otherwise a dict of metrics is returned; with zero
    mapped reads it holds only ``Total_reads``, ``Mapped_reads`` and
    ``Mapped_reads_pct``.
    """
    def _pct(part, whole):
        # Percentage of `part` in `whole`, always computed in floating point.
        return 100.0 * part / whole

    # NOTE(review): out_dir appears on both ends of this join -- confirm the
    # intended samtools statistics directory.
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    stats = samtools.run(bam_file, data, samtools_stats_dir)

    result = {}

    if "Total_reads" not in stats:
        return None
    n_total = int(stats["Total_reads"])
    result["Total_reads"] = n_total
    if n_total == 0:
        return None

    if not ("Mapped_reads_raw" in stats and "Mapped_reads" in stats):
        return None
    n_mapped = int(stats["Mapped_reads"])
    result["Mapped_reads"] = n_mapped
    result["Mapped_reads_pct"] = _pct(n_mapped, n_total)
    if n_mapped == 0:
        return result

    # Duplicates are reported against the raw mapped read count.
    n_dups = 0
    if "Duplicates" in stats:
        n_dups = int(stats["Duplicates"])
        result["Duplicates"] = n_dups
        result["Duplicates_pct"] = _pct(n_dups, int(stats["Mapped_reads_raw"]))

    # Target regions: explicit coverage BED first, then merged variant regions
    # for non-genome runs; whole genome runs use no BED file.
    regions_bed, target_name = None, "genome"
    if dd.get_coverage(data):
        cleaned_bed = bedutils.clean_file(dd.get_coverage(data), data,
                                          prefix="cov-", simple=True)
        regions_bed = bedutils.merge_overlaps(cleaned_bed, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        regions_bed = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"

    # Whole genome runs do not need a dedicated unique-read count: it is
    # derived from the totals and not added to the reported metrics.
    if dd.get_coverage_interval(data) != "genome":
        n_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        result["Mapped_unique_reads"] = n_unique
    else:
        n_unique = n_mapped - n_dups

    if regions_bed:
        n_ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False,
            bed_file=regions_bed, target_name=target_name)
        if n_unique:
            result["Ontarget_unique_reads"] = n_ontarget
            result["Ontarget_pct"] = _pct(n_ontarget, n_unique)
            result["Offtarget_pct"] = _pct(n_unique - n_ontarget, n_unique)
            if dd.get_coverage_interval(data) != "genome":
                # Padded on-target rate is only meaningful for exomes and
                # panels, so WGS skips it even when a coverage file is set.
                padded_bed = bedutils.get_padded_bed_file(regions_bed, 200, data)
                n_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False,
                    bed_file=padded_bed, target_name=target_name + "_padded")
                result["Ontarget_padded_pct"] = _pct(n_padded, n_unique)
        # n_total cannot be zero here: that case returned early above.
        result["Usable_pct"] = _pct(n_ontarget, n_total)

    avg_depth = cov.get_average_coverage(data, bam_file, regions_bed, target_name)
    result["Avg_coverage"] = avg_depth

    # One extra cutoff at 80% of the average depth, with a floor of 1.
    extra_cutoff = max(1, int(avg_depth * 0.8))
    cov.coverage_region_detailed_stats(data, out_dir, extra_cutoffs={extra_cutoff})

    return result
Exemplo n.º 33
0
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis and return a dict of read/coverage metrics.

    Read counts are obtained through the ``sambamba`` helpers.  The returned
    dict always contains ``Total_reads`` and ``Mapped_reads``;
    ``Mapped_reads_pct`` is added when there are any reads.  When reads are
    mapped, it also holds duplicate, on-target and average-coverage metrics
    for the selected target regions (the configured coverage BED if present,
    otherwise the merged variant regions).

    The ``cov`` priority/region coverage helpers are invoked for ``out_dir``
    regardless of the read counts; their return values are not used here.
    """
    out = dict()

    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data,
                                                        bam_file,
                                                        keep_dups=False)
        # Report the duplicate-free count under this key; storing `mapped`
        # here would simply repeat the Mapped_reads metric.
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped

        # An explicit coverage BED takes precedence over the variant regions.
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data),
                                      data,
                                      prefix="cov-",
                                      simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            # NOTE(review): presumably this can be empty when no variant
            # regions are configured -- confirm the helpers below accept that.
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"

        ontarget = sambamba.number_mapped_reads_on_target(
            data,
            merged_bed_file,
            bam_file,
            keep_dups=False,
            target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique -
                                            ontarget) / mapped_unique
            # Same on-target count, on the target BED padded via
            # bedutils.get_padded_bed_file with a value of 200.
            padded_bed_file = bedutils.get_padded_bed_file(
                merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data,
                padded_bed_file,
                bam_file,
                keep_dups=False,
                target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file,
                                            target_name)
        out['Avg_coverage'] = avg_coverage

    # These calls are kept as-is; their return values are not needed here.
    cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    cov.coverage_region_detailed_stats(data, out_dir)
    # TODO: re-enable problem-region annotation of the priority coverage
    # output (cov.decorate_problem_regions) once annotations come from the
    # internally installed problem region directory.

    return out