def run_dragen(args):
    to_run = []
    outdir = utils.safe_makedir(args.outdir)
    for fnames in fastq.combine_pairs(sorted(args.files)):
        to_run.append(fnames)
    for r1, r2 in to_run:
        out1_fq = os.path.join(outdir, r1)
        out2_fq = os.path.join(outdir, r2)
        n = 0
        with utils.open_gzipsafe(r1) as r1_handle, \
             utils.open_gzipsafe(r2) as r2_handle, \
             gzip.open(out1_fq, "wb") as out1_handle, \
             gzip.open(out2_fq, "wb") as out2_handle:
            for line1, line2 in itertools.zip_longest(r1_handle, r2_handle):
                if line1 is not None:
                    if n % 4 == 0:  # parse header line
                        new_header1 = _add_umi_str(line1) + "\n"
                        new_header2 = _add_umi_str(line2) + "\n"
                        out1_handle.write(new_header1.encode())
                        out2_handle.write(new_header2.encode())
                    else:
                        out1_handle.write(line1.encode())
                        out2_handle.write(line2.encode())
                    n += 1

def _depth_to_seq2cov(input_fpath, output_fpath, sample_name):
    """Args:
        input_fpath: output of "mosdepth":
            chr22  14250  15500  name3  5.54
            chrM   100    1000   name1  916.08
        output_fpath: path to write results - input for Seq2C's cov2lr.pl, e.g. seq2cov:
            chr20_tumor_1  DEFB125  chr20  68346  68413  Amplicon    68   28.0
            chr20_tumor_1  DEFB125  chr20  76641  77301  Amplicon    661  24.0
            chr20_tumor_1  DEFB125  chr20  68346  77301  Whole-Gene  729  24.3731138546
        sample_name: sample name (e.g. chr20_tumor_1)
    """
    # First round: collecting gene ends
    gene_end_by_gene = defaultdict(lambda: -1)
    with utils.open_gzipsafe(input_fpath) as f:
        for xs in (l.rstrip().split() for l in f if not l.startswith("#")):
            xs = [x for x in xs if x.strip()]
            if any(x == "." for x in xs):
                continue
            end = int(xs[2])
            gene_name = xs[3]
            gene_end_by_gene[gene_name] = max(gene_end_by_gene[gene_name], end)

    # Second round: calculating gene level coverage, and writing file for Seq2C
    total_cov_by_gene = dict()
    gene_start_by_gene = dict()
    total_size_by_gene = dict()
    with utils.open_gzipsafe(input_fpath) as f, open(output_fpath, 'w') as out:
        for xs in (l.rstrip().split() for l in f if not l.startswith("#")):
            xs = [x for x in xs if x.strip()]
            if any(x == "." for x in xs):
                continue
            chrom, start, end, gene_name = xs[:4]
            start, end = int(start), int(end)
            ave_depth = float(xs[-1])
            if gene_name not in gene_start_by_gene:
                gene_start_by_gene[gene_name] = start
                total_cov_by_gene[gene_name] = 0
                total_size_by_gene[gene_name] = 0
            else:
                gene_start_by_gene[gene_name] = min(start, gene_start_by_gene[gene_name])
            total_cov_by_gene[gene_name] += ave_depth * (end - start)
            total_size_by_gene[gene_name] += end - start
            fs = [sample_name, gene_name, chrom, str(start + 1), str(end),
                  'Amplicon', str(end - start), str(ave_depth)]
            out.write('\t'.join(fs) + '\n')
            if end >= gene_end_by_gene[gene_name]:
                assert end == gene_end_by_gene[gene_name], (end, gene_end_by_gene[gene_name])
                start = gene_start_by_gene[gene_name]
                ave_depth = total_cov_by_gene[gene_name] / total_size_by_gene[gene_name]
                size = total_size_by_gene[gene_name]
                fs = [sample_name, gene_name, chrom, str(start + 1), str(end),
                      'Whole-Gene', str(size), str(ave_depth)]
                out.write('\t'.join(fs) + '\n')
    return output_fpath

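# Worked check of the Whole-Gene row in the docstring above (plain arithmetic,
# no bcbio dependencies): the gene-level depth is the size-weighted mean of the
# per-amplicon average depths.
#   DEFB125 amplicons: size 68 at 28.0x and size 661 at 24.0x
#   (68 * 28.0 + 661 * 24.0) / (68 + 661) = 17768 / 729 ~= 24.3731
# which matches the 24.3731138546 Whole-Gene value shown in the docstring.
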
def _calc_sizes(self, cnv_file, items):
    """Retrieve target and antitarget bin sizes based on depth.

    Similar to CNVkit's do_autobin but tries to have a standard set of
    ranges (50bp intervals for target and 10kb intervals for antitarget).
    """
    bp_per_bin = 100000  # same target as CNVkit
    range_map = {"target": (100, 250), "antitarget": (10000, 1000000)}
    target_bps = []
    anti_bps = []
    checked_beds = set([])
    for data in items:
        region_bed = tz.get_in(["depth", "variant_regions", "regions"], data)
        if region_bed and region_bed not in checked_beds:
            with utils.open_gzipsafe(region_bed) as in_handle:
                for r in pybedtools.BedTool(in_handle).intersect(cnv_file):
                    if r.stop - r.start > range_map["target"][0]:
                        target_bps.append(float(r.name))
            with utils.open_gzipsafe(region_bed) as in_handle:
                for r in pybedtools.BedTool(in_handle).intersect(cnv_file, v=True):
                    if r.stop - r.start > range_map["target"][1]:
                        anti_bps.append(float(r.name))
            checked_beds.add(region_bed)

    def scale_in_boundary(raw, round_interval, range_targets):
        min_val, max_val = range_targets
        out = int(math.ceil(raw / float(round_interval)) * round_interval)
        if out > max_val:
            return max_val
        elif out < min_val:
            return min_val
        else:
            return out

    if target_bps and np.median(target_bps) > 0:
        raw_target_bin = bp_per_bin / float(np.median(target_bps))
        target_bin = scale_in_boundary(raw_target_bin, 50, range_map["target"])
    else:
        target_bin = range_map["target"][1]
    if anti_bps and np.median(anti_bps) > 0:
        raw_anti_bin = bp_per_bin / float(np.median(anti_bps))
        anti_bin = scale_in_boundary(raw_anti_bin, 10000, range_map["antitarget"])
    else:
        anti_bin = range_map["antitarget"][1]
    return target_bin, anti_bin

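# Example of the scale_in_boundary rounding above (hypothetical numbers): with
# a median target depth of 730, raw_target_bin = 100000 / 730 ~= 136.99, which
# rounds up to the next 50bp interval (150) and already sits inside the
# (100, 250) target range, so target_bin = 150. A raw value of 40 would be
# clamped up to 100; a raw value of 900 would be clamped down to 250.
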
def filter_vcf_by_sex(vcf_file, items):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Removes Y chromosomes from batches with all female samples.
    """
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        genders = list(_configured_ploidy_sex(items)[-1])
        is_female = len(genders) == 1 and genders[0] and genders[0] in ["female", "f"]
        if is_female:
            orig_out_file = out_file
            out_file = orig_out_file.replace(".vcf.gz", ".vcf")
            with file_transaction(items[0], out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    with utils.open_gzipsafe(vcf_file) as in_handle:
                        for line in in_handle:
                            if line.startswith("#"):
                                out_handle.write(line)
                            else:
                                chrom = chromosome_special_cases(line.split("\t")[0])
                                if chrom != "Y":
                                    out_handle.write(line)
            if orig_out_file.endswith(".gz"):
                out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
        else:
            out_file = vcf_file
    return out_file

def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.
    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    highdepth_beds = []
    from bcbio.variation import bedutils
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for highdepth_bed in highdepth_beds:
                            with utils.open_gzipsafe(highdepth_bed) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file

def _delly_count_evidence_filter(in_file, data):
    """Filter delly outputs based on read support (DV) and evidence (split and paired).

    We require DV >= 4 and either both paired end and split read evidence,
    or 5 or more supporting reads for either individually.
    """
    filtname = "DVSupport"
    filtdoc = "FMT/DV < 4 || (SR < 1 && PE < 5) || (SR < 5 && PE < 1)"
    out_file = "%s-filter%s" % utils.splitext_plus(in_file)
    cur_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        with file_transaction(data, cur_out_file) as tx_out_file:
            with utils.open_gzipsafe(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    inp = vcf.Reader(in_handle, in_file)
                    inp.filters["DVSupport"] = vcf.parser._Filter(filtname, filtdoc)
                    outp = vcf.Writer(out_handle, inp)
                    for rec in inp:
                        sr = rec.INFO.get("SR", 0)
                        pe = rec.INFO.get("PE", 0)
                        call = rec.samples[0].data
                        dv = call.DV if hasattr(call, "DV") else 0
                        if dv < 4 or (sr < 1 and pe < 5) or (sr < 5 and pe < 1):
                            rec.add_filter(filtname)
                        outp.write_record(rec)
        if out_file.endswith(".vcf.gz"):
            out_file = vcfutils.bgzip_and_index(cur_out_file, data["config"])
    return out_file

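# How the evidence filter above behaves on a few hypothetical calls:
#   DV=5, SR=2, PE=3 -> kept (DV >= 4 with both split and paired support)
#   DV=5, SR=0, PE=6 -> kept (no split reads, but paired evidence >= 5)
#   DV=5, SR=0, PE=3 -> filtered (SR < 1 and PE < 5)
#   DV=3, SR=9, PE=9 -> filtered (DV < 4)
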
def has_regions(in_file):
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser", "@")) and line.strip():
                return True
    return False

def _remove_regions(in_file, remove_beds, ext, data):
    """Subtract a list of BED files from an input BED.

    General approach handling none, one and more remove_beds.
    """
    from bcbio.variation import bedutils
    out_file = "%s-%s.bed" % (utils.splitext_plus(in_file)[0], ext)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with bedtools_tmpdir(data):
                if len(remove_beds) == 0:
                    to_remove = None
                elif len(remove_beds) == 1:
                    to_remove = remove_beds[0]
                else:
                    to_remove = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(to_remove, "w") as out_handle:
                        for b in remove_beds:
                            with utils.open_gzipsafe(b) as in_handle:
                                for line in in_handle:
                                    parts = line.split("\t")
                                    out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                    if utils.file_exists(to_remove):
                        to_remove = bedutils.sort_merge(to_remove, data)
                if to_remove and utils.file_exists(to_remove):
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove problematic regions: %s" % ext)
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file

def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on pooled (multi-sample) VCFs; handling those still needs
    to be implemented.
    """
    if len(vcfutils.get_samples(vcf_file)) > 1:
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file

def add_umis_to_fastq(out_base, read1_fq, read2_fq, umi_fq, tags=None, cores=1):
    print("Processing", read1_fq, read2_fq, umi_fq)
    out1_fq = out_base + "_R1.fq.gz"
    out2_fq = out_base + "_R2.fq.gz"
    transform_json_file = out_base + "-transform.json"
    with open(transform_json_file, "w") as out_handle:
        if tags:
            tag1, tag2 = tags
            out_handle.write(duplex_transform % (tag1, tag1, tag2, tag2))
        else:
            out_handle.write(transform_json)
    with utils.open_gzipsafe(read1_fq) as in_handle:
        ex_name = in_handle.readline().split(" ")
        fastq_tags_arg = "--keep_fastq_tags" if len(ex_name) == 2 else ""
    tag_arg = "--separate_cb" if tags else ""
    cmd = ("umis fastqtransform {fastq_tags_arg} {tag_arg} "
           "--fastq1out >(bgzip --threads {cores} -c > {out1_fq}) "
           "--fastq2out >(bgzip --threads {cores} -c > {out2_fq}) "
           "{transform_json_file} {read1_fq} "
           "{read2_fq}")
    if umi_fq:
        cmd += " {umi_fq}"
    do.run(cmd.format(**locals()), "Add UMIs to paired fastq files")
    os.remove(transform_json_file)

def _fix_mutect_output(orig_file, config, out_file, is_paired):
    """Adjust MuTect output to match other callers.

    - Rename allelic fraction field in MuTect output from FA to FREQ to
      standardize with other tools
    - Remove extra 'none' samples introduced when calling tumor-only samples
    """
    out_file_noc = out_file.replace(".vcf.gz", ".vcf")
    none_index = -1
    with file_transaction(config, out_file_noc) as tx_out_file:
        with open_gzipsafe(orig_file) as in_handle:
            with open(tx_out_file, 'w') as out_handle:
                for line in in_handle:
                    if not is_paired and line.startswith("#CHROM"):
                        parts = line.rstrip().split("\t")
                        none_index = parts.index("none")
                        del parts[none_index]
                        line = "\t".join(parts) + "\n"
                    elif line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    elif not line.startswith("#"):
                        if none_index > 0:
                            parts = line.rstrip().split("\t")
                            del parts[none_index]
                            line = "\t".join(parts) + "\n"
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    return bgzip_and_index(out_file_noc, config)

def _filter_by_background(base_samples, back_samples, gt_vcfs, data):
    """Filter base samples, marking any also present in the background.
    """
    filtname = "InBackground"
    filtdoc = "Variant also present in background samples with same genotype"
    for base_name in base_samples:
        orig_vcf = gt_vcfs[base_name]
        out_file = "%s-backfilter.vcf" % (utils.splitext_plus(orig_vcf)[0])
        if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
            with file_transaction(data, out_file) as tx_out_file:
                with utils.open_gzipsafe(orig_vcf) as in_handle:
                    with _vcf_readers([gt_vcfs[n] for n in back_samples]) as back_readers:
                        inp = vcf.Reader(in_handle, orig_vcf)
                        inp.filters[filtname] = vcf.parser._Filter(filtname, filtdoc)
                        with open(tx_out_file, "w") as out_handle:
                            outp = vcf.Writer(out_handle, inp)
                            for rec in inp:
                                back_recs = [next(r) for r in back_readers]
                                if _genotype_in_background(rec, back_recs):
                                    rec.add_filter(filtname)
                                outp.write_record(rec)
        if utils.file_exists(out_file + ".gz"):
            out_file = out_file + ".gz"
        gt_vcfs[base_name] = vcfutils.bgzip_and_index(out_file, data["config"])
    return gt_vcfs

def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)

def slim_vcf(in_file, data):
    """Remove larger annotations which slow down VCF processing.
    """
    to_remove = ["ANN", "LOF"]
    to_remove_str = tuple(["##INFO=<ID=%s" % x for x in to_remove])
    in_file = vcfutils.bgzip_and_index(in_file, data, remove_orig=False)
    out_file = "%s-slim.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        cur_remove = []
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith("#"):
                    break
                elif line.startswith(to_remove_str):
                    cur_id = line.split("ID=")[-1].split(",")[0]
                    cur_remove.append("INFO/%s" % cur_id)
        with file_transaction(data, out_file) as tx_out_file:
            if cur_remove:
                cur_remove = ",".join(cur_remove)
                cmd = ("bcftools view -f 'PASS,.' {in_file} | "
                       "bcftools annotate -x {cur_remove} -O z -o {tx_out_file}")
            else:
                cmd = ("bcftools view -f 'PASS,.' {in_file} -O z -o {tx_out_file}")
            do.run(cmd.format(**locals()), "Create slim VCF")
    return out_file

def check_bed_coords(in_file, data):
    """Ensure BED file coordinates match reference genome.

    Catches errors like using a hg38 BED file for an hg19 genome run.
    """
    if dd.get_ref_file(data):
        contig_sizes = {}
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            contig_sizes[contig.name] = contig.size
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser", "@")) and line.strip():
                    parts = line.split()
                    if len(parts) > 3:
                        try:
                            end = int(parts[2])
                        except ValueError:
                            continue
                        contig = parts[0]
                        check_size = contig_sizes.get(contig)
                        if check_size and end > check_size:
                            raise ValueError(
                                "Found BED coordinate off the end of the chromosome:\n%s%s\n"
                                "Is the input BED from the right genome build?" % (line, in_file))

def coverage_interval_from_bed(bed_file):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and
    non-targeted runs which depend on the local context.
    """
    total_bases = 0
    bed_bases = 0
    cur_chr = None
    cur_start = None
    last_end = None
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                start = int(start)
                end = int(end)
                bed_bases += (end - start)
                if chrom != cur_chr:
                    if cur_chr and last_end and cur_start is not None:
                        total_bases += (last_end - cur_start)
                    cur_chr = chrom
                    cur_start = int(start)
                last_end = end
    if cur_chr and last_end and cur_start is not None:
        total_bases += (last_end - cur_start)
    # Should be importing GENOME_COV_THRESH but get circular imports
    if float(bed_bases) / float(total_bases) >= 0.40:
        return "genome"
    else:
        return "targeted"

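# Illustration of the 40% decision above (hypothetical numbers): a BED whose
# intervals cover 45Mb (bed_bases) out of a 100Mb span between the first and
# last interval on each chromosome (total_bases) gives 0.45 >= 0.40 and is
# treated as "genome"; a 2Mb exome panel over the same span gives 0.02 and
# is treated as "targeted".
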
def _calculate_comparison_stats(truth_vcf):
    """Identify calls to validate from the input truth VCF.
    """
    # Avoid very small events for average calculations
    min_stat_size = 50
    min_median_size = 250
    sizes = []
    svtypes = set([])
    with utils.open_gzipsafe(truth_vcf) as in_handle:
        for call in (l.rstrip().split("\t") for l in in_handle if not l.startswith("#")):
            stats = _summarize_call(call)
            if stats["size"] > min_stat_size:
                sizes.append(stats["size"])
                svtypes.add(stats["svtype"])
    pct10 = int(np.percentile(sizes, 10))
    pct25 = int(np.percentile(sizes, 25))
    pct50 = int(np.percentile(sizes, 50))
    pct75 = int(np.percentile(sizes, 75))
    ranges_detailed = [(int(min(sizes)), pct10), (pct10, pct25), (pct25, pct50),
                       (pct50, pct75), (pct75, max(sizes))]
    ranges_split = [(int(min(sizes)), pct50), (pct50, max(sizes))]
    return {"min_size": int(min(sizes) * 0.95),
            "max_size": int(max(sizes) + 1.05),
            "svtypes": svtypes,
            "merge_size": int(np.percentile([x for x in sizes if x > min_median_size], 50)),
            "ranges": []}

def _callable_intersect(in_file, callable_bed, data):
    """Return list of original VCF SVs intersected by callable regions.

    Does not try to handle BNDs. We should resolve these and return where possible.
    """
    with tx_tmpdir(data) as tmpdir:
        in_bed = os.path.join(tmpdir, "%s-convert.bed" %
                              utils.splitext_plus(os.path.basename(in_file))[0])
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(in_bed, "w") as out_handle:
                for parts in (l.split("\t") for l in in_handle if not l.startswith("#")):
                    start, end = _get_start_end(parts)
                    if end:
                        out_handle.write("\t".join([parts[0], start, end] + parts) + "\n")
        out_file = os.path.join(tmpdir, "%s-subset.tsv" %
                                utils.splitext_plus(os.path.basename(in_file))[0])
        cmd = "bedtools intersect -a {in_bed} -b {callable_bed} -wa -wb > {out_file}"
        do.run(cmd.format(**locals()), "Intersect VCF by callable")
        with open(out_file) as in_handle:
            for line in in_handle:
                yield line.rstrip().split("\t")[3:]

def vcf_has_variants(in_file):
    if os.path.exists(in_file):
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if line.strip() and not line.startswith("#"):
                    return True
    return False

def open_fastq(in_file):
    """Open a fastq file, using gzip if it is gzipped.
    """
    if objectstore.is_remote(in_file):
        return objectstore.open_file(in_file)
    else:
        return utils.open_gzipsafe(in_file)

def decorate_problem_regions(query_bed, problem_bed_dir):
    """Decorate query_bed with percentage covered by BED files of regions
    specified in the problem_bed_dir.
    """
    if is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if file_exists(out_file):
        return out_file
    bed_files = _find_bed_files(problem_bed_dir)
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with open_gzipsafe(query_bed) as in_handle:
        header = next(in_handle).strip().split()
        header = "\t".join(header + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file

def _calculate_comparison_stats(truth_vcf):
    """Identify calls to validate from the input truth VCF.
    """
    sizes = []
    svtypes = set([])
    with utils.open_gzipsafe(truth_vcf) as in_handle:
        for call in (l.rstrip().split("\t") for l in in_handle if not l.startswith("#")):
            stats = _summarize_call(tuple(call[:5] + call[7:8]))
            sizes.append(stats["size"])
            svtypes.add(stats["svtype"])
    pct10 = int(np.percentile(sizes, 10))
    pct25 = int(np.percentile(sizes, 25))
    pct50 = int(np.percentile(sizes, 50))
    pct75 = int(np.percentile(sizes, 75))
    ranges_detailed = [(int(min(sizes)), pct10), (pct10, pct25), (pct25, pct50),
                       (pct50, pct75), (pct75, max(sizes))]
    ranges_split = [(int(min(sizes)), pct50), (pct50, max(sizes))]
    return {"min_size": int(min(sizes) * 0.95),
            "max_size": int(max(sizes) + 1.05),
            "svtypes": svtypes,
            "merge_size": int(np.percentile(sizes, 10)),
            "ranges": []}

def tx2genedict(gtf, keep_version=False):
    """Produce a tx2gene dictionary from a GTF file.
    """
    d = {}
    with open_gzipsafe(gtf) as in_handle:
        for line in in_handle:
            if "gene_id" not in line or "transcript_id" not in line:
                continue
            geneid = line.split("gene_id")[1].split(" ")[1]
            geneid = _strip_non_alphanumeric(geneid)
            if not geneid:
                continue
            txid = line.split("transcript_id")[1].split(" ")[1]
            txid = _strip_non_alphanumeric(txid)
            if keep_version and "transcript_version" in line:
                txversion = line.split("transcript_version")[1].split(" ")[1]
                txversion = _strip_non_alphanumeric(txversion)
                txid += "." + txversion
            if has_transcript_version(line) and not keep_version:
                txid = _strip_feature_version(txid)
                geneid = _strip_feature_version(geneid)
            txid = txid.strip()
            geneid = geneid.strip()
            if not txid or not geneid:
                continue
            d[txid] = geneid
    return d

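# How the attribute parsing above behaves on a typical GTF attribute block
# (toy example, assuming _strip_non_alphanumeric removes the quotes and
# trailing semicolon):
#   attrs = 'gene_id "ENSG01"; transcript_id "ENST01"; transcript_version "2";'
#   attrs.split("gene_id")[1].split(" ")[1]        -> '"ENSG01";' -> ENSG01
#   attrs.split("transcript_id")[1].split(" ")[1]  -> '"ENST01";' -> ENST01
# With keep_version=True this line yields d["ENST01.2"] = "ENSG01".
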
def _add_log2_depth(in_file, out_file, data):
    """Create a CNVkit cnn file with depths.

    http://cnvkit.readthedocs.io/en/stable/fileformats.html?highlight=cnn#target-and-antitarget-bin-level-coverages-cnn
    """
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with utils.open_gzipsafe(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write("chromosome\tstart\tend\tgene\tlog2\tdepth\n")
                    for line in in_handle:
                        parts = line.rstrip().split()
                        if len(parts) > 4:
                            # Handle inputs unannotated with gene names
                            if len(parts) == 5:
                                chrom, start, end, orig_name, depth = parts
                                gene_name = "."
                            else:
                                assert len(parts) == 6, parts
                                chrom, start, end, orig_name, depth, gene_name = parts
                            depth = float(depth)
                            log2_depth = math.log(float(depth), 2) if depth else -20.0
                            out_handle.write("%s\t%s\t%s\t%s\t%.3f\t%.2f\n" %
                                             (chrom, start, end, gene_name, log2_depth, depth))
    return out_file

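# Example of the log2 transform above: a bin with depth 8.0 is written as
# log2 = 3.000, while a zero-depth bin avoids math.log(0, 2) by using the
# -20.0 sentinel (2**-20, effectively no coverage).
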
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                                line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file

def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.

    Also removes problem calls in the output VCF with missing alleles.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(data, nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            # Problem breakends can have empty alleles when at contig ends
                            if not parts[3].strip():
                                parts[3] = "N"
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                            line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file

def decorate_problem_regions(query_bed, problem_bed_dir):
    """Decorate query_bed with percentage covered by BED files of regions
    specified in the problem_bed_dir.
    """
    if utils.is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(out_file):
        return out_file
    bed_files = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with utils.open_gzipsafe(query_bed) as in_handle:
        header = next(in_handle).strip().split()
        header = "\t".join(header + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file

def filter_vcf_by_sex(vcf_file, data):
    """Post-filter a single sample VCF, handling sex chromosomes.

    Handles sex chromosomes and mitochondrial. Does not try to resolve called
    hets into potential homozygotes when converting diploid to haploid.

    Skips filtering on cancer samples: since these will be pooled, we need
    special functionality to handle them.
    """
    if vcfutils.get_paired_phenotype(data):
        return vcf_file
    _, sexes = _configured_ploidy_sex([data])
    sex = sexes.pop()
    out_file = "%s-ploidyfix%s" % utils.splitext_plus(vcf_file)
    if not utils.file_exists(out_file):
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if line.startswith("#"):
                            out_handle.write(line)
                        else:
                            line = _fix_line_ploidy(line, sex)
                            if line:
                                out_handle.write(line)
        if orig_out_file.endswith(".gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file

def get_normal_sample(in_file):
    """Retrieve normal sample if normal/tumor.
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##PEDIGREE"):
                parts = line.strip().split("Original=")[1][:-1]
                return parts

def vcf_has_nonfiltered_variants(in_file):
    if os.path.exists(in_file):
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if line.strip() and not line.startswith("#"):
                    parts = line.split("\t")
                    if parts[6] in set(["PASS", "."]):
                        return True
    return False

def get_samples(in_file):
    """Retrieve samples present in a VCF file.
    """
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("#CHROM"):
                parts = line.strip().split("\t")
                return parts[9:]
    raise ValueError("Did not find sample header in VCF file %s" % in_file)

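# Sample columns in a VCF start at index 9, after the eight fixed fields and
# FORMAT, so for a header line like
#   #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tumor normal
# get_samples returns ["tumor", "normal"].
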
def _subset_to_variant_regions(callable_file, variant_regions, data):
    """Subset output callable file to only variant regions of interest.
    """
    out_file = "%s-vrsubset.bed" % utils.splitext_plus(callable_file)[0]
    if not utils.file_uptodate(out_file, callable_file):
        with file_transaction(data, out_file) as tx_out_file:
            with utils.open_gzipsafe(callable_file) as in_handle:
                pybedtools.BedTool(in_handle).intersect(variant_regions).saveas(tx_out_file)
    return out_file

def _find_filtered(fname, extra):
    """Identify the filtered inputs in the original VCF file.
    """
    filtered = 0
    with utils.open_gzipsafe(fname) as in_handle:
        for rec in vcf.Reader(in_handle, fname):
            if "LowPriority" in rec.FILTER:
                filtered += 1
    enrichment = "%sx" % (int((extra + filtered) / float(extra)))
    return enrichment, filtered

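# Example of the enrichment arithmetic above (hypothetical counts): with
# extra=100 records kept and filtered=250 marked LowPriority, the original
# set was (100 + 250) / 100 = 3.5x the retained size, truncated to "3x".
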
def is_gene_list(bed_file):
    """Check if the file is only a list of genes, not a BED.
    """
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            if not line.startswith("#"):
                if len(line.split()) == 1:
                    return True
                else:
                    return False

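# Behavior sketch for the check above: a first non-comment line of "TP53"
# (one token) classifies the file as a gene list (True), while
# "chr17 7571720 7590868" (three tokens) classifies it as a BED (False).
# A file containing only comment lines falls through and returns None.
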
@contextlib.contextmanager
def _vcf_readers(vcf_files):
    # Used in `with` statements (see _filter_by_background), so this generator
    # needs the contextmanager decorator to close the handles on exit.
    handles = []
    readers = []
    for vcf_file in vcf_files:
        in_handle = utils.open_gzipsafe(vcf_file)
        handles.append(in_handle)
        readers.append(vcf.Reader(in_handle, vcf_file))
    yield readers
    for handle in handles:
        handle.close()

def _vcf_to_bed(in_file, caller, out_file):
    if in_file.endswith((".vcf", "vcf.gz")):
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(out_file, "w") as out_handle:
                for rec in vcf.Reader(in_handle, in_file):
                    if not rec.FILTER:
                        if not (hasattr(rec.samples[0].data, "FT") and rec.samples[0].data.FT):
                            out_handle.write("\t".join([rec.CHROM, str(rec.start - 1),
                                                        str(rec.INFO["END"]),
                                                        "%s_%s" % (rec.INFO["SVTYPE"], caller)])
                                             + "\n")

def _average_called_depth(in_file):
    """Retrieve the average depth of called reads in the provided VCF.
    """
    depths = []
    with utils.open_gzipsafe(in_file) as in_handle:
        reader = vcf.Reader(in_handle, in_file)
        for rec in reader:
            d = rec.INFO.get("DP")
            if d is not None:
                depths.append(d)
    return int(math.ceil(numpy.mean(depths)))

def _add_umis_with_fastp(read_fq, umi_fq, out_fq, cores):
    """Add UMIs to reads from separate UMI file using fastp.
    """
    with utils.open_gzipsafe(umi_fq) as in_handle:
        in_handle.readline()  # name
        umi_size = len(in_handle.readline().strip())
    cmd = ("fastp -Q -A -L -G -w 1 --in1 {read_fq} --in2 {umi_fq} "
           "--umi --umi_prefix UMI --umi_loc read2 --umi_len {umi_size} "
           "--out1 >(bgzip --threads {cores} -c > {out_fq}) --out2 /dev/null "
           "-j /dev/null -h /dev/null")
    do.run(cmd.format(**locals()), "Add UMIs to fastq file with fastp")

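# The UMI length above is inferred from the first sequence line of the UMI
# fastq: for a record like
#   @read1
#   ACGTACGT
# umi_size is 8, so (assuming fastp's documented --umi behavior) an 8bp UMI
# with the "UMI" prefix is transferred into each read name; the UMI reads
# themselves are discarded via --out2 /dev/null.
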
def _add_contig_cl(in_file, items):
    has_contigs = False
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith("##contig"):
                has_contigs = True
                break
            elif not line.startswith("##"):
                break
    if not has_contigs:
        return vcfutils.add_contig_to_header_cl(items[0])

def _is_small_vcf(vcf_file):
    """Check for small VCFs which we want to analyze quicker.
    """
    count = 0
    small_thresh = 250
    with utils.open_gzipsafe(vcf_file) as in_handle:
        for line in in_handle:
            if not line.startswith("#"):
                count += 1
            if count > small_thresh:
                return False
    return True

def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n" %
                         (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in BED file and reference.")

def _civic_regions(civic_file, variant_types=None, diseases=None, drugs=None):
    """Retrieve gene regions and names filtered by variant_types and diseases.
    """
    if isinstance(diseases, six.string_types):
        diseases = [diseases]
    with utils.open_gzipsafe(civic_file) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        for chrom, start, end, info_str in reader:
            info = edn_loads(info_str)
            if not variant_types or _matches(info["support"]["variants"], variant_types):
                if not diseases or _matches(info["support"]["diseases"], diseases):
                    if not drugs or _matches(info["support"]["drugs"], drugs):
                        yield (chrom, int(start), int(end), list(info["name"])[0])