Example #1
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
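The function above shells out to sambamba depth region, asking for the share of bases covered at each of ten thresholds (-T 10 through -T 100) inside the decommented priority BED. A minimal standalone sketch of the same call, outside the bcbio wrappers and with hypothetical paths and thread count:

import subprocess

def sambamba_priority_coverage(bam, bed, out_bed, threads=4):
    # Mirror the flags used above: restrict to the BED, drop unmapped reads,
    # and report the fraction of bases at 10x, 20x, ... 100x per region.
    thresholds = " ".join("-T %d" % t for t in range(10, 101, 10))
    cmd = ("sambamba depth region -t {threads} -L {bed} "
           "-F \"not unmapped\" {thresholds} {bam} -o {out_bed}")
    subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_bed

# Hypothetical usage:
# sambamba_priority_coverage("sample.bam", "priority.bed", "sample_priority_total_coverage.bed")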
Example #2
def priority_total_coverage(data):
    """
    calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data

    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
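This variant asks sambamba for a single threshold (-T 20), so the output table carries one percentage column per region rather than ten. A rough parsing sketch for that BED-like output, assuming a single commented header line; exact column names and order depend on the sambamba version, so the header handling here is an assumption:

def read_depth_table(path):
    # Collect one dict per region, keyed by the names in sambamba's '#' header
    # line when present; otherwise fall back to a plain list of fields.
    rows = []
    with open(path) as handle:
        header = None
        for line in handle:
            if line.startswith("#"):
                header = line.lstrip("# ").split()
                continue
            fields = line.rstrip("\n").split("\t")
            rows.append(dict(zip(header, fields)) if header else fields)
    return rows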
Example #4
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #5
def summary(items):
    cutoff = DEFAULT_COVERAGE_CUTOFF
    data = items[0]
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    combined_bed = bed.concat([coverage_bed, priority_bed])
    clean_bed = bedutils.clean_file(combined_bed.fn, data) if len(combined_bed) > 0 else combined_bed.fn
    bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))

    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file) and utils.file_exists(bed_file):
        with file_transaction(data, out_file) as tx_out_file:
            chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
            cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
            do.run(cmd.format(**locals()), "Prep chanjo database")
            for data in items:
                sample = dd.get_sample_name(data)
                bam_file = data["work_bam"]
                cmd = ("{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                       "{bam_file} {bed_file} | "
                       "{chanjo} --db {tx_out_file} import")
                do.run(cmd.format(**locals()), "Chanjo coverage", data)
    incomplete = incomplete_regions(out_file, batch, out_dir)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file,
                                "incomplete": incomplete}
        out.append([data])
    return out
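This summary builds a per-batch SQLite coverage database with chanjo: one build call to load the regions, then, for every sample in the batch, chanjo annotate piped into chanjo --db ... import. A condensed sketch of those two steps with hypothetical paths, sample names, group name, and cutoff; the sub-commands are copied from the example, and newer chanjo releases expose a different CLI, so adjust for the installed version:

from subprocess import check_call

db, bed = "batch1-coverage.db", "batch1-regions.bed"
# Create the database schema and load the target regions.
check_call("chanjo --db {db} build {bed}".format(db=db, bed=bed), shell=True)
for sample, bam in [("S1", "S1.bam"), ("S2", "S2.bam")]:
    # Annotate each BAM at the chosen cutoff and stream the result into the
    # shared batch database.
    cmd = ("chanjo annotate -s {sample} -g batch1 -c 10 {bam} {bed} | "
           "chanjo --db {db} import")
    check_call(cmd.format(sample=sample, bam=bam, bed=bed, db=db), shell=True)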
Example #6
def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, "Did not find batch for samples: %s" % ",".join([dd.get_sample_name(x) for x in items])
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:  # no coverage or priority file has been set
            return items
        clean_bed = bedutils.clean_file(combined_bed, data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)

        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import"
                    )
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        if bed_file:
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
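Before the chanjo step, this variant minimizes each BED and concatenates whichever of the coverage and priority files are configured. bed.minimize and bed.concat are bcbio helpers; as a loose, generic stand-in (not a reproduction of their exact behavior), a pybedtools sketch for combining two BED files:

import pybedtools

def combine_beds(coverage_bed, priority_bed, out_file):
    # Sort and merge the coverage regions, then append the priority regions
    # without re-merging, and write the combined, sorted result.
    merged = pybedtools.BedTool(coverage_bed).sort().merge()
    combined = merged.cat(pybedtools.BedTool(priority_bed), postmerge=False)
    return combined.sort().saveas(out_file).fn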
Example #7
def _add_scatter_plot(out, data):
    out_file = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    priority_regions = dd.get_priority_regions(data)
    if not priority_regions:
        return None
    priority_bed = plot._prioritize_plot_regions(pybedtools.BedTool(priority_regions), data)
    if utils.file_exists(out_file):
        return out_file
    cnr = _remove_haplotype_chroms(out["cnr"], data)
    cns = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        cmd = [_get_cmd(), "scatter", "-s", cns, "-o", tx_out_file, "-l", priority_bed, cnr]
        do.run(cmd, "CNVkit scatter plot")
    return out_file
Example #8
def _add_scatter_plot(out, data):
    out_file = "%s-scatter.pdf" % os.path.splitext(out["cnr"])[0]
    priority_regions = dd.get_priority_regions(data)
    if not priority_regions:
        return None
    priority_bed = plot._prioritize_plot_regions(
        pybedtools.BedTool(priority_regions), data)
    if utils.file_exists(out_file):
        return out_file
    cnr = _remove_haplotype_chroms(out["cnr"], data)
    cns = _remove_haplotype_chroms(out["cns"], data)
    with file_transaction(data, out_file) as tx_out_file:
        cmd = [
            _get_cmd(), "scatter", "-s", cns, "-o", tx_out_file, "-l",
            priority_bed, cnr
        ]
        do.run(cmd, "CNVkit scatter plot")
    return out_file
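Both scatter examples build the same CNVkit command: plot the bin-level copy number ratios, overlay the segmented calls, and limit the view to the prioritized regions. An equivalent direct invocation, assuming _get_cmd() resolves to the cnvkit.py executable (an assumption here) and using hypothetical file names:

import subprocess

# -s supplies the segmented calls (.cns), -l restricts plotting to the ranges
# in the priority BED, and the positional argument is the bin-level .cnr file.
cmd = ["cnvkit.py", "scatter", "-s", "sample.cns",
       "-o", "sample-scatter.pdf", "-l", "priority.bed", "sample.cnr"]
subprocess.check_call(cmd)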
Example #9
def priority_coverage(data):
    AVERAGE_REGION_STRING_LENGTH = 100
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    batch_size = max_command_length() / AVERAGE_REGION_STRING_LENGTH

    sample = dd.get_sample_name(data)
    out_file = os.path.join(sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = data['work_bam']
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        with file_transaction(out_file) as tx_out_file:
            lcount = 0
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    lcount += 1
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += str(line)
                if not coord_batch:
                    continue
                region_file = pybedtools.BedTool(line_batch,
                                                 from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
                cmd = ("samtools view -b {in_bam} {coord_string} | "
                       "bedtools coverage -d -a {region_file} -b - | "
                       "awk {awk_string} >> {tx_out_file}")
                _silence_run(cmd.format(**locals()))
        data['priority_coverage'] = os.path.abspath(out_file)
    return data
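For each batch of regions, the function streams the reads overlapping those coordinates, computes per-base depth, and reshapes the result into one BED line per position tagged with the sample name. A single-batch sketch of that pipeline with hypothetical inputs and output file:

import subprocess

sample = "S1"
in_bam = "S1.bam"
region_file = "batch.bed"                        # BED lines for this batch
coord_string = "chr1:1000-2000 chr1:5000-6000"   # the same regions, as samtools ranges
# Copied from the example: emit chrom, per-base position, name, depth, sample.
awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
cmd = ("samtools view -b {in_bam} {coord_string} | "
       "bedtools coverage -d -a {region_file} -b - | "
       "awk {awk_string} >> out_priority_depth.bed")
subprocess.check_call(cmd.format(in_bam=in_bam, coord_string=coord_string,
                                 region_file=region_file, awk_string=awk_string),
                      shell=True)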
Example #11
def summary(items):
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        clean_bed = bedutils.clean_file(
            combined_bed.fn,
            data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                chanjo = os.path.join(os.path.dirname(sys.executable),
                                      "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example #12
def _needs_coverage(data):
    return dd.get_coverage_regions(data) or dd.get_priority_regions(data)
Example #13
def _needs_coverage(data):
    return dd.get_coverage_regions(data) or dd.get_priority_regions(data)
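_needs_coverage simply reports whether either a coverage BED or a priority BED is configured, and so whether any of the coverage steps above have work to do. As hypothetical wiring only (the real pipeline drives these calls through its own run graph), a sketch of using it as a gate:

def maybe_run_coverage(data):
    # Skip everything when neither BED is configured.
    if not _needs_coverage(data):
        return data
    data = priority_total_coverage(data)
    data = priority_coverage(data)
    return data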