def run_dragen(args):
    to_run = []
    outdir = utils.safe_makedir(args.outdir)
    for fnames in fastq.combine_pairs(sorted(args.files)):
        to_run.append(fnames)
    for r1, r2 in to_run:
        out1_fq = os.path.join(outdir, os.path.basename(r1))
        out2_fq = os.path.join(outdir, os.path.basename(r2))
        n = 0
        with utils.open_gzipsafe(r1) as r1_handle, \
             utils.open_gzipsafe(r2) as r2_handle, \
             gzip.open(out1_fq, "wb") as out1_handle, \
             gzip.open(out2_fq, "wb") as out2_handle:
            for line1, line2 in itertools.zip_longest(r1_handle, r2_handle):
                if line1 is None or line2 is None:
                    raise ValueError("Paired FASTQ files %s and %s differ in length" % (r1, r2))
                if n % 4 == 0:
                    # parse the header line, adding the UMI to both mates
                    new_header1 = _add_umi_str(line1) + "\n"
                    new_header2 = _add_umi_str(line2) + "\n"
                    out1_handle.write(new_header1.encode())
                    out2_handle.write(new_header2.encode())
                else:
                    out1_handle.write(line1.encode())
                    out2_handle.write(line2.encode())
                n += 1
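A note on the shared helper: every example on this page reads input through utils.open_gzipsafe, which returns a text-mode handle whether or not the file is gzipped. A minimal sketch of what such a helper can look like, assuming it simply dispatches on the .gz extension (the name here is hypothetical and bcbio's actual implementation handles more cases):

import gzip

def open_gzipsafe_sketch(fname):
    # Return a text-mode handle for either a plain or a gzip-compressed file.
    if fname.endswith(".gz"):
        return gzip.open(fname, "rt")
    return open(fname)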
Example #2
def _depth_to_seq2cov(input_fpath, output_fpath, sample_name):
    """Args:
        input_fpath: output of "mosdepth":
            chr22           14250   15500   name3   5.54
            chrM            100     1000    name1   916.08

        output_fpath: path to write results - input for Seq2C's cov2lr.pl, e.g.:
            seq2cov:
            chr20_tumor_1   DEFB125   chr20   68346   68413   Amplicon    68   28.0
            chr20_tumor_1   DEFB125   chr20   76641   77301   Amplicon    661  24.0
            chr20_tumor_1   DEFB125   chr20   68346   77301   Whole-Gene  729  24.3731138546

        sample_name:
            sample name (e.g. chr20_tumor_1)
    """
    # First round: collecting gene ends
    gene_end_by_gene = defaultdict(lambda: -1)
    with utils.open_gzipsafe(input_fpath) as f:
        for xs in (l.rstrip().split() for l in f if not l.startswith("#")):
            xs = [x for x in xs if x.strip()]
            if any(x == "." for x in xs): continue
            end = int(xs[2])
            gene_name = xs[3]
            gene_end_by_gene[gene_name] = max(gene_end_by_gene[gene_name], end)

    # Second round: calculating gene level coverage, and writing file for Seq2C
    total_cov_by_gene = dict()
    gene_start_by_gene = dict()
    total_size_by_gene = dict()

    with utils.open_gzipsafe(input_fpath) as f, open(output_fpath, 'w') as out:
        for xs in (l.rstrip().split() for l in f if not l.startswith("#")):
            xs = [x for x in xs if x.strip()]
            if any(x == "." for x in xs): continue
            chrom, start, end, gene_name = xs[:4]
            start, end = int(start), int(end)
            ave_depth = float(xs[-1])

            if gene_name not in gene_start_by_gene:
                gene_start_by_gene[gene_name] = start
                total_cov_by_gene[gene_name] = 0
                total_size_by_gene[gene_name] = 0
            else:
                gene_start_by_gene[gene_name] = min(start, gene_start_by_gene[gene_name])
            total_cov_by_gene[gene_name] += ave_depth * (end - start)
            total_size_by_gene[gene_name] += end - start

            fs = [sample_name, gene_name, chrom, str(start + 1), str(end), 'Amplicon', str(end - start), str(ave_depth)]
            out.write('\t'.join(fs) + '\n')

            if end >= gene_end_by_gene[gene_name]:
                assert end == gene_end_by_gene[gene_name], (end, gene_end_by_gene[gene_name])
                start = gene_start_by_gene[gene_name]
                ave_depth = total_cov_by_gene[gene_name] / total_size_by_gene[gene_name]
                size = total_size_by_gene[gene_name]
                fs = [sample_name, gene_name, chrom, str(start + 1), str(end), 'Whole-Gene', str(size), str(ave_depth)]
                out.write('\t'.join(fs) + '\n')
    return output_fpath
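The Whole-Gene row written above is a length-weighted average of the per-interval depths. A standalone check of that arithmetic, using the 0-based BED intervals implied by the docstring output (the printed starts are start + 1):

# (start, end, average depth) for the two DEFB125 amplicons from the docstring
regions = [(68345, 68413, 28.0), (76640, 77301, 24.0)]
total_cov = sum(depth * (end - start) for start, end, depth in regions)
total_size = sum(end - start for start, end, depth in regions)
print(total_size, total_cov / total_size)  # 729 and ~24.3731, matching the Whole-Gene row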
Example #4
    def _calc_sizes(self, cnv_file, items):
        """Retrieve target and antitarget bin sizes based on depth.

        Similar to CNVkit's do_autobin but tries to have a standard set of
        ranges (50bp intervals for target and 10kb intervals for antitarget).
        """
        bp_per_bin = 100000  # same target as CNVkit
        range_map = {"target": (100, 250), "antitarget": (10000, 1000000)}
        target_bps = []
        anti_bps = []
        checked_beds = set([])
        for data in items:
            region_bed = tz.get_in(["depth", "variant_regions", "regions"],
                                   data)
            if region_bed and region_bed not in checked_beds:
                with utils.open_gzipsafe(region_bed) as in_handle:
                    for r in pybedtools.BedTool(in_handle).intersect(cnv_file):
                        if r.stop - r.start > range_map["target"][0]:
                            target_bps.append(float(r.name))
                with utils.open_gzipsafe(region_bed) as in_handle:
                    for r in pybedtools.BedTool(in_handle).intersect(cnv_file,
                                                                     v=True):
                        if r.stop - r.start > range_map["target"][1]:
                            anti_bps.append(float(r.name))
                checked_beds.add(region_bed)

        def scale_in_boundary(raw, round_interval, range_targets):
            min_val, max_val = range_targets
            out = int(math.ceil(raw / float(round_interval)) * round_interval)
            if out > max_val:
                return max_val
            elif out < min_val:
                return min_val
            else:
                return out

        if target_bps and np.median(target_bps) > 0:
            raw_target_bin = bp_per_bin / float(np.median(target_bps))
            target_bin = scale_in_boundary(raw_target_bin, 50,
                                           range_map["target"])
        else:
            target_bin = range_map["target"][1]

        if anti_bps and np.median(anti_bps) > 0:
            raw_anti_bin = bp_per_bin / float(np.median(anti_bps))
            anti_bin = scale_in_boundary(raw_anti_bin, 10000,
                                         range_map["antitarget"])
        else:
            anti_bin = range_map["antitarget"][1]
        return target_bin, anti_bin
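scale_in_boundary rounds a raw bin size up to the nearest interval and clamps it into the allowed range. A small standalone illustration using the same ranges as range_map above (the clamping is written with min/max but behaves like the if/elif chain):

import math

def scale_in_boundary(raw, round_interval, range_targets):
    min_val, max_val = range_targets
    out = int(math.ceil(raw / float(round_interval)) * round_interval)
    return max(min_val, min(max_val, out))

print(scale_in_boundary(137.2, 50, (100, 250)))   # 150: rounded up to the 50bp grid
print(scale_in_boundary(12.0, 50, (100, 250)))    # 100: below range, clamped up
print(scale_in_boundary(90000.0, 10000, (10000, 1000000)))  # 90000: within range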
Example #5
def filter_vcf_by_sex(vcf_file, items):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Removes Y chromosomes from batches with all female samples.
    """
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        genders = list(_configured_ploidy_sex(items)[-1])
        is_female = len(genders) == 1 and genders[0] and genders[0] in ["female", "f"]
        if is_female:
            orig_out_file = out_file
            out_file = orig_out_file.replace(".vcf.gz", ".vcf")
            with file_transaction(items[0], out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    with utils.open_gzipsafe(vcf_file) as in_handle:
                        for line in in_handle:
                            if line.startswith("#"):
                                out_handle.write(line)
                            else:
                                chrom = chromosome_special_cases(line.split("\t")[0])
                                if chrom != "Y":
                                    out_handle.write(line)
            if orig_out_file.endswith(".gz"):
                out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
        else:
            out_file = vcf_file
    return out_file
Example #6
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    highdepth_beds = []
    from bcbio.variation import bedutils
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for highdepth_bed in highdepth_beds:
                            with utils.open_gzipsafe(highdepth_bed) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Example #7
def _delly_count_evidence_filter(in_file, data):
    """Filter delly outputs based on read support (DV) and evidence (split and paired).

    We require DV > 4 and either both paired end and split read evidence or
    5 or more evidence for either individually.
    """
    filtname = "DVSupport"
    filtdoc = "FMT/DV < 4 || (SR < 1 && PE < 5) || (SR < 5 && PE < 1)"
    out_file = "%s-filter%s" % utils.splitext_plus(in_file)
    cur_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        with file_transaction(data, cur_out_file) as tx_out_file:
            with utils.open_gzipsafe(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    inp = vcf.Reader(in_handle, in_file)
                    inp.filters["DVSupport"] = vcf.parser._Filter(
                        filtname, filtdoc)
                    outp = vcf.Writer(out_handle, inp)
                    for rec in inp:
                        sr = rec.INFO.get("SR", 0)
                        pe = rec.INFO.get("PE", 0)
                        call = rec.samples[0].data
                        dv = call.DV if hasattr(call, "DV") else 0
                        if dv < 4 or (sr < 1 and pe < 5) or (sr < 5
                                                             and pe < 1):
                            rec.add_filter(filtname)
                        outp.write_record(rec)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(cur_out_file, data["config"])
    return out_file
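The filter expression can be read as: a call passes only with DV >= 4 and either both split-read and paired-end support, or at least 5 reads of one evidence type alone. A quick standalone check of the predicate (the function name is illustrative):

def fails_dv_support(dv, sr, pe):
    # mirrors the condition used in _delly_count_evidence_filter above
    return dv < 4 or (sr < 1 and pe < 5) or (sr < 5 and pe < 1)

assert fails_dv_support(dv=3, sr=5, pe=5)      # too little DV support
assert not fails_dv_support(dv=4, sr=1, pe=1)  # both evidence types present
assert not fails_dv_support(dv=4, sr=5, pe=0)  # strong split-read evidence alone
assert fails_dv_support(dv=4, sr=0, pe=4)      # one type absent, the other too weak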
Example #8
def has_regions(in_file):
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(
                ("#", "track", "browser", "@")) and line.strip():
                return True
    return False
Example #9
def _remove_regions(in_file, remove_beds, ext, data):
    """Subtract a list of BED files from an input BED.

    General approach handling none, one and more remove_beds.
    """
    from bcbio.variation import bedutils
    out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with bedtools_tmpdir(data):
                if len(remove_beds) == 0:
                    to_remove = None
                elif len(remove_beds) == 1:
                    to_remove = remove_beds[0]
                else:
                    to_remove = "%s-all.bed" % utils.splitext_plus(
                        tx_out_file)[0]
                    with open(to_remove, "w") as out_handle:
                        for b in remove_beds:
                            with utils.open_gzipsafe(b) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write(
                                        "\t".join(parts[:4]).rstrip() + "\n")
                    if utils.file_exists(to_remove):
                        to_remove = bedutils.sort_merge(to_remove, data)
                if to_remove and utils.file_exists(to_remove):
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()),
                           "Remove problematic regions: %s" % ext)
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Example #10
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on pooled samples; this still needs to be implemented.
    """
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example #11
def add_umis_to_fastq(out_base, read1_fq, read2_fq, umi_fq, tags=None, cores=1):
    print("Processing", read1_fq, read2_fq, umi_fq)
    out1_fq = out_base + "_R1.fq.gz"
    out2_fq = out_base + "_R2.fq.gz"
    transform_json_file = out_base + "-transform.json"
    with open(transform_json_file, "w") as out_handle:
        if tags:
            tag1, tag2 = tags
            out_handle.write(duplex_transform % (tag1, tag1, tag2, tag2))
        else:
            out_handle.write(transform_json)
    with utils.open_gzipsafe(read1_fq) as in_handle:
        ex_name = in_handle.readline().split(" ")
        fastq_tags_arg = "--keep_fastq_tags" if len(ex_name) == 2 else ""
    tag_arg = "--separate_cb" if tags else ""
    cmd = ("umis fastqtransform {fastq_tags_arg} {tag_arg} "
           "--fastq1out >(bgzip --threads {cores} -c > {out1_fq}) "
           "--fastq2out >(bgzip --threads {cores} -c > {out2_fq}) "
           "{transform_json_file} {read1_fq} "
           "{read2_fq}")
    if umi_fq:
        cmd += " {umi_fq}"
    do.run(cmd.format(**locals()), "Add UMIs to paired fastq files")

    os.remove(transform_json_file)
Example #13
def _fix_mutect_output(orig_file, config, out_file, is_paired):
    """Adjust MuTect output to match other callers.

    - Rename allelic fraction field in MuTect output from FA to FREQ to standardize with other tools
    - Remove extra 'none' samples introduced when calling tumor-only samples
    """
    out_file_noc = out_file.replace(".vcf.gz", ".vcf")
    none_index = -1
    with file_transaction(config, out_file_noc) as tx_out_file:
        with open_gzipsafe(orig_file) as in_handle:
            with open(tx_out_file, 'w') as out_handle:
                for line in in_handle:
                    if not is_paired and line.startswith("#CHROM"):
                        parts = line.rstrip().split("\t")
                        none_index = parts.index("none")
                        del parts[none_index]
                        line = "\t".join(parts) + "\n"
                    elif line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    elif not line.startswith("#"):
                        if none_index > 0:
                            parts = line.rstrip().split("\t")
                            del parts[none_index]
                            line = "\t".join(parts) + "\n"
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    return bgzip_and_index(out_file_noc, config)
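The 'none' sample removal is purely positional: the column index found on the #CHROM line is reused to drop the same column from every record. A toy trace of that slicing (the sample names are invented):

header = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttumor\tnone"
parts = header.rstrip().split("\t")
none_index = parts.index("none")               # 10
record = "chr1\t100\t.\tA\tT\t50\tPASS\t.\tGT:FREQ\t0/1:0.3\t./."
rec_parts = record.rstrip().split("\t")
del rec_parts[none_index]                      # drops './.' for the 'none' sample
print("\t".join(rec_parts))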
Example #14
def _filter_by_background(base_samples, back_samples, gt_vcfs, data):
    """Filter base samples, marking any also present in the background.
    """
    filtname = "InBackground"
    filtdoc = "Variant also present in background samples with same genotype"
    for base_name in base_samples:
        orig_vcf = gt_vcfs[base_name]
        out_file = "%s-backfilter.vcf" % (utils.splitext_plus(orig_vcf)[0])
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(data, out_file) as tx_out_file:
                with utils.open_gzipsafe(orig_vcf) as in_handle:
                    with _vcf_readers([gt_vcfs[n] for n in back_samples]) as back_readers:
                        inp = vcf.Reader(in_handle, orig_vcf)
                        inp.filters[filtname] = vcf.parser._Filter(filtname, filtdoc)
                        with open(tx_out_file, "w") as out_handle:
                            outp = vcf.Writer(out_handle, inp)
                            for rec in inp:
                                back_recs = [next(r) for r in back_readers]
                                if _genotype_in_background(rec, back_recs):
                                    rec.add_filter(filtname)
                                outp.write_record(rec)
        if utils.file_exists(out_file + ".gz"):
            out_file = out_file + ".gz"
        gt_vcfs[base_name] = vcfutils.bgzip_and_index(out_file, data["config"])
    return gt_vcfs
Example #15
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Example #16
def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing
    """
    to_remove = ["ANN", "LOF"]
    to_remove_str = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        cur_remove = []
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    break
                elif line.startswith(to_remove_str):
                    cur_id = line.split("ID=")[-1].split(",")[0]
                    cur_remove.append("INFO/%s" % cur_id)
        with file_transaction(data, out_file) as tx_out_file:
            if cur_remove:
                cur_remove = ",".join(cur_remove)
                cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                       "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
            else:
                cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Create slim VCF")
    return out_file
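The header scan stops at the first non-# line, so only the VCF header is read even on large files. A standalone trace of how matching INFO IDs become bcftools annotate targets (these header lines are invented but follow the VCF spec):

to_remove_str = ("##INFO=<ID=ANN", "##INFO=<ID=LOF")
header_lines = ['##INFO=<ID=ANN,Number=.,Type=String,Description="Annotation">',
                '##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">',
                '##INFO=<ID=LOF,Number=.,Type=String,Description="Loss of function">']
cur_remove = []
for line in header_lines:
    if line.startswith(to_remove_str):
        cur_id = line.split("ID=")[-1].split(",")[0]
        cur_remove.append("INFO/%s" % cur_id)
print(",".join(cur_remove))  # INFO/ANN,INFO/LOF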
Example #17
def check_bed_coords(in_file, data):
    """Ensure BED file coordinates match reference genome.

    Catches errors like using a hg38 BED file for an hg19 genome run.
    """
    if dd.get_ref_file(data):
        contig_sizes = {}
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            contig_sizes[contig.name] = contig.size
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(
                    ("#", "track", "browser", "@")) and line.strip():
                    parts = line.split()
                    if len(parts) > 3:
                        try:
                            end = int(parts[2])
                        except ValueError:
                            continue
                        contig = parts[0]
                        check_size = contig_sizes.get(contig)
                        if check_size and end > check_size:
                            raise ValueError(
                                "Found BED coordinate off the end of the chromosome:\n%s%s\n"
                                "Is the input BED from the right genome build?"
                                % (line, in_file))
Example #18
def coverage_interval_from_bed(bed_file):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and non
    which depend on the local context.
    """
    total_bases = 0
    bed_bases = 0
    cur_chr = None
    cur_start = None
    last_end = None
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                start = int(start)
                end = int(end)
                bed_bases += (end - start)
                if chrom != cur_chr:
                    if cur_chr and last_end and cur_start is not None:
                        total_bases += (last_end - cur_start)
                    cur_chr = chrom
                    cur_start = int(start)
                last_end = end
        if cur_chr and last_end and cur_start is not None:
            total_bases += (last_end - cur_start)
    # Should be importing GENOME_COV_THRESH but get circular imports
    if float(bed_bases) / float(total_bases) >= 0.40:
        return "genome"
    else:
        return "targeted"
Example #19
def _calculate_comparison_stats(truth_vcf):
    """Identify calls to validate from the input truth VCF.
    """
    # Avoid very small events for average calculations
    min_stat_size = 50
    min_median_size = 250
    sizes = []
    svtypes = set([])
    with utils.open_gzipsafe(truth_vcf) as in_handle:
        for call in (l.rstrip().split("\t") for l in in_handle
                     if not l.startswith("#")):
            stats = _summarize_call(call)
            if stats["size"] > min_stat_size:
                sizes.append(stats["size"])
            svtypes.add(stats["svtype"])
    pct10 = int(np.percentile(sizes, 10))
    pct25 = int(np.percentile(sizes, 25))
    pct50 = int(np.percentile(sizes, 50))
    pct75 = int(np.percentile(sizes, 75))
    ranges_detailed = [(int(min(sizes)), pct10), (pct10, pct25),
                       (pct25, pct50), (pct50, pct75), (pct75, max(sizes))]
    ranges_split = [(int(min(sizes)), pct50), (pct50, max(sizes))]
    return {"min_size": int(min(sizes) * 0.95),
            "max_size": int(max(sizes) + 1.05),
            "svtypes": svtypes,
            "merge_size": int(np.percentile([x for x in sizes if x > min_median_size], 50)),
            "ranges": []}
Example #20
def _callable_intersect(in_file, callable_bed, data):
    """Return list of original VCF SVs intersected by callable regions.

    Does not try to handle BNDs. We should resolve these and return where possible.
    """
    with tx_tmpdir(data) as tmpdir:
        in_bed = os.path.join(
            tmpdir, "%s-convert.bed" %
            utils.splitext_plus(os.path.basename(in_file))[0])
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(in_bed, "w") as out_handle:
                for parts in (l.split("\t") for l in in_handle
                              if not l.startswith("#")):
                    start, end = _get_start_end(parts)
                    if end:
                        out_handle.write("\t".join([parts[0], start, end] +
                                                   parts) + "\n")
        out_file = os.path.join(
            tmpdir, "%s-subset.tsv" %
            utils.splitext_plus(os.path.basename(in_file))[0])
        cmd = "bedtools intersect -a {in_bed} -b {callable_bed} -wa -wb > {out_file}"
        do.run(cmd.format(**locals()), "Intersect VCF by callable")
        with open(out_file) as in_handle:
            for line in in_handle:
                yield line.rstrip().split("\t")[3:]
Example #21
def vcf_has_variants(in_file):
    if os.path.exists(in_file):
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if line.strip() and not line.startswith("#"):
                    return True
    return False
Example #22
def open_fastq(in_file):
    """ open a fastq file, using gzip if it is gzipped
    """
    if objectstore.is_remote(in_file):
        return objectstore.open_file(in_file)
    else:
        return utils.open_gzipsafe(in_file)
Example #25
def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir
    """
    if is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if file_exists(out_file):
        return out_file
    bed_files = _find_bed_files(problem_bed_dir)
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with open_gzipsafe(query_bed) as in_handle:
        header = in_handle.readline().strip().split()
    header = "\t".join(header + names)
    cmd = (
        "bedtools annotate -i {query_bed} -files {bed_file_string} "
        "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}"
    )
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file
Example #29
def _calculate_comparison_stats(truth_vcf):
    """Identify calls to validate from the input truth VCF.
    """
    sizes = []
    svtypes = set([])
    with utils.open_gzipsafe(truth_vcf) as in_handle:
        for call in (l.rstrip().split("\t") for l in in_handle
                     if not l.startswith("#")):
            stats = _summarize_call(tuple(call[:5] + call[7:8]))
            sizes.append(stats["size"])
            svtypes.add(stats["svtype"])
    pct10 = int(np.percentile(sizes, 10))
    pct25 = int(np.percentile(sizes, 25))
    pct50 = int(np.percentile(sizes, 50))
    pct75 = int(np.percentile(sizes, 75))
    ranges_detailed = [(int(min(sizes)), pct10), (pct10, pct25),
                       (pct25, pct50), (pct50, pct75), (pct75, max(sizes))]
    ranges_split = [(int(min(sizes)), pct50), (pct50, max(sizes))]
    return {
        "min_size": int(min(sizes) * 0.95),
        "max_size": int(max(sizes) + 1.05),
        "svtypes": svtypes,
        "merge_size": int(np.percentile(sizes, 10)),
        "ranges": []
    }
Example #30
def tx2genedict(gtf, keep_version=False):
    """
    produce a tx2gene dictionary from a GTF file
    """
    d = {}
    with open_gzipsafe(gtf) as in_handle:
        for line in in_handle:
            if "gene_id" not in line or "transcript_id" not in line:
                continue
            geneid = line.split("gene_id")[1].split(" ")[1]
            geneid = _strip_non_alphanumeric(geneid)
            if not geneid:
                continue
            txid = line.split("transcript_id")[1].split(" ")[1]
            txid = _strip_non_alphanumeric(txid)
            if keep_version and "transcript_version" in line:
                txversion = line.split("transcript_version")[1].split(" ")[1]
                txversion = _strip_non_alphanumeric(txversion)
                txid += "." + txversion
            if has_transcript_version(line) and not keep_version:
                txid = _strip_feature_version(txid)
                geneid = _strip_feature_version(geneid)
            txid = txid.strip()
            geneid = geneid.strip()
            if not txid or not geneid:
                continue
            d[txid] = geneid
    return d
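The parsing leans on GTF attribute formatting, where each key is followed by a space and a quoted value. A standalone trace on one invented GTF line, with a simple stand-in for _strip_non_alphanumeric (the real helper may differ):

import re

def _strip_non_alphanumeric(s):
    # stand-in: keep word characters and dots, drop quotes and semicolons
    return re.sub(r"[^\w.]", "", s)

line = ('1\thavana\ttranscript\t11869\t14409\t.\t+\t.\t'
        'gene_id "ENSG00000223972"; transcript_id "ENST00000456328";')
geneid = _strip_non_alphanumeric(line.split("gene_id")[1].split(" ")[1])
txid = _strip_non_alphanumeric(line.split("transcript_id")[1].split(" ")[1])
print(geneid, txid)  # ENSG00000223972 ENST00000456328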
Example #31
def _add_log2_depth(in_file, out_file, data):
    """Create a CNVkit cnn file with depths
    http://cnvkit.readthedocs.io/en/stable/fileformats.html?highlight=cnn#target-and-antitarget-bin-level-coverages-cnn
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with utils.open_gzipsafe(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        "chromosome\tstart\tend\tgene\tlog2\tdepth\n")
                    for line in in_handle:
                        parts = line.rstrip().split()
                        if len(parts) > 4:
                            # Handle inputs unannotated with gene names
                            if len(parts) == 5:
                                chrom, start, end, orig_name, depth = parts
                                gene_name = "."
                            else:
                                assert len(parts) == 6, parts
                                chrom, start, end, orig_name, depth, gene_name = parts
                            depth = float(depth)
                            log2_depth = math.log(float(depth),
                                                  2) if depth else -20.0
                            out_handle.write("%s\t%s\t%s\t%s\t%.3f\t%.2f\n" %
                                             (chrom, start, end, gene_name,
                                              log2_depth, depth))
    return out_file
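Zero-depth bins cannot go through log2, so the code substitutes a floor of -20.0, effectively "no coverage" on CNVkit's log2 scale. A minimal check of that mapping:

import math

for depth in [0.0, 1.0, 8.0, 100.0]:
    log2_depth = math.log(depth, 2) if depth else -20.0
    print("%.2f -> %.3f" % (depth, log2_depth))
# 0.00 -> -20.000, 1.00 -> 0.000, 8.00 -> 3.000, 100.00 -> 6.644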
Example #32
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                            line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
Example #33
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.

    Also removes problem calls in the output VCF with missing alleles.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(data, nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            # Problem breakends can have empty alleles when at contig ends
                            if not parts[3].strip():
                                parts[3] = "N"
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                            line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
Example #36
def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir
    """
    if utils.is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(out_file):
        return out_file
    bed_files = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with utils.open_gzipsafe(query_bed) as in_handle:
        header = in_handle.readline().strip().split()
    header = "\t".join(header + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file
Example #37
def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on cancer samples; since these will be pooled, they need
    special handling.
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example #38
def get_normal_sample(in_file):
    """Retrieve normal sample if normal/turmor
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##PEDIGREE"):
                parts = line.strip().split("Original=")[1][:-1]
                return parts
Example #39
def vcf_has_nonfiltered_variants(in_file):
    if os.path.exists(in_file):
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if line.strip() and not line.startswith("#"):
                    parts = line.split("\t")
                    if parts[6] in set(["PASS", "."]):
                        return True
    return False
Example #40
def get_samples(in_file):
    """Retrieve samples present in a VCF file
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("#CHROM"):
                parts = line.strip().split("\t")
                return parts[9:]
    raise ValueError("Did not find sample header in VCF file %s" % in_file)
Example #41
def _subset_to_variant_regions(callable_file, variant_regions, data):
    """Subset output callable file to only variant regions of interest.
    """
    out_file = "%s-vrsubset.bed" % utils.splitext_plus(callable_file)[0]
    if not utils.file_uptodate(out_file, callable_file):
        with file_transaction(data, out_file) as tx_out_file:
            with utils.open_gzipsafe(callable_file) as in_handle:
                pybedtools.BedTool(in_handle).intersect(variant_regions).saveas(tx_out_file)
    return out_file
Example #44
def _find_filtered(fname, extra):
    """Identify the filtered inputs in the original VCF file.
    """
    filtered = 0
    with utils.open_gzipsafe(fname) as in_handle:
        for rec in vcf.Reader(in_handle, fname):
            if "LowPriority" in rec.FILTER:
                filtered += 1
    enrichment = "%sx" % (int((extra + filtered) / float(extra)))
    return enrichment, filtered
Example #45
def is_gene_list(bed_file):
    """Check if the file is only a list of genes, not a BED
    """
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            if not line.startswith("#"):
                if len(line.split()) == 1:
                    return True
                else:
                    return False
Example #46
@contextlib.contextmanager
def _vcf_readers(vcf_files):
    handles = []
    readers = []
    for vcf_file in vcf_files:
        in_handle = utils.open_gzipsafe(vcf_file)
        handles.append(in_handle)
        readers.append(vcf.Reader(in_handle, vcf_file))
    yield readers
    for handle in handles:
        handle.close()
Example #48
def _vcf_to_bed(in_file, caller, out_file):
    if in_file.endswith((".vcf", "vcf.gz")):
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(out_file, "w") as out_handle:
                for rec in vcf.Reader(in_handle, in_file):
                    if not rec.FILTER:
                        if not (hasattr(rec.samples[0].data, "FT") and rec.samples[0].data.FT):
                            out_handle.write("\t".join([rec.CHROM, str(rec.start - 1), str(rec.INFO["END"]),
                                                        "%s_%s" % (rec.INFO["SVTYPE"], caller)])
                                             + "\n")
Example #49
def _average_called_depth(in_file):
    """Retrieve the average depth of called reads in the provided VCF.
    """
    depths = []
    with utils.open_gzipsafe(in_file) as in_handle:
        reader = vcf.Reader(in_handle, in_file)
        for rec in reader:
            d = rec.INFO.get("DP")
            if d is not None:
                depths.append(d)
    return int(math.ceil(numpy.mean(depths)))
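The mean is rounded up so the result can be used directly as an integer depth threshold. A tiny check of the rounding:

import math
import numpy
print(int(math.ceil(numpy.mean([10, 14, 17]))))  # mean is ~13.67, so 14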
Example #50
def _add_umis_with_fastp(read_fq, umi_fq, out_fq, cores):
    """Add UMIs to reads from separate UMI file using fastp.
    """
    with utils.open_gzipsafe(umi_fq) as in_handle:
        in_handle.readline()  # name
        umi_size = len(in_handle.readline().strip())
    cmd = ("fastp -Q -A -L -G -w 1 --in1 {read_fq} --in2 {umi_fq} "
           "--umi --umi_prefix UMI --umi_loc read2 --umi_len {umi_size} "
           "--out1 >(bgzip --threads {cores} -c > {out_fq}) --out2 /dev/null "
           "-j /dev/null -h /dev/null")
    do.run(cmd.format(**locals()), "Add UMIs to fastq file with fastp")
Example #51
def _add_contig_cl(in_file, items):
    has_contigs = False
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##contig"):
                has_contigs = True
                break
            elif not line.startswith("##"):
                break
    if not has_contigs:
        return vcfutils.add_contig_to_header_cl(items[0])
Example #52
def _is_small_vcf(vcf_file):
    """Check for small VCFs which we want to analyze quicker.
    """
    count = 0
    small_thresh = 250
    with utils.open_gzipsafe(vcf_file) as in_handle:
        for line in in_handle:
            if not line.startswith("#"):
                count += 1
            if count > small_thresh:
                return False
    return True
Example #53
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n"
                         % (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in BED file and reference.")
Example #54
def _civic_regions(civic_file, variant_types=None, diseases=None, drugs=None):
    """Retrieve gene regions and names filtered by variant_types and diseases.
    """
    if isinstance(diseases, six.string_types):
        diseases = [diseases]
    with utils.open_gzipsafe(civic_file) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        for chrom, start, end, info_str in reader:
            info = edn_loads(info_str)
            if not variant_types or _matches(info["support"]["variants"], variant_types):
                if not diseases or _matches(info["support"]["diseases"], diseases):
                    if not drugs or _matches(info["support"]["drugs"], drugs):
                        yield (chrom, int(start), int(end), list(info["name"])[0])