def get_contigs(data):
    """Return the names of the standard chromosomes available for a sample.

    Contigs come from shared.get_noalt_contigs(data); only names that
    chromhacks reports as autosomal or sex chromosomes are kept, in the
    order they were supplied.
    """
    def _is_standard(name):
        return chromhacks.is_autosomal(name) or chromhacks.is_sex(name)

    names = [contig.name for contig in shared.get_noalt_contigs(data)]
    return [name for name in names if _is_standard(name)]
def _get_autosomal_bed(data, base_file):
    """Write a BED file covering each autosomal reference contig end to end.

    The output path is the root of base_file (as split by utils.splitext_plus)
    with a "-stdchroms.bed" suffix. One line is written per contig of the
    sample's reference file that chromhacks considers autosomal, spanning
    position 0 to the contig size. Returns the output path.
    """
    root = utils.splitext_plus(base_file)[0]
    out_file = "%s-stdchroms.bed" % root
    with open(out_file, "w") as out_handle:
        contigs = ref.file_contigs(dd.get_ref_file(data))
        out_handle.writelines("%s\t0\t%s\n" % (contig.name, contig.size)
                              for contig in contigs
                              if chromhacks.is_autosomal(contig.name))
    return out_file
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics.

    in_file is the variant file to read, vcaller a label used in the output
    file name and work_dir the directory receiving the output. params falls
    back to the module level PARAMS when empty. seg_file is kept for interface
    compatibility; it is unused while heterogeneity block detection is disabled.

    Only records on autosomal chromosomes for which _is_possible_loh returns
    statistics are written. The work is skipped when utils.file_uptodate
    reports the output as current relative to in_file.

    Returns the path to the prepared output file.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Block detection is currently disabled, so the subset branch below is inert.
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        max_depth = max_normal_germline_depth(sub_file, params, somatic_info)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                # Context manager guarantees the variant file handle is closed,
                # including when processing a record raises.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        stats = _is_possible_loh(rec, bcf_in, params, somatic_info, max_normal_depth=max_depth)
                        if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                            writer.write_row(rec, stats)
    return out_file
def _prep_vrn_file(in_file, vcaller, seg_file, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    Writes a CSV with chrom, start, end and freq columns to work_dir, named
    from in_file and vcaller. A row is written for each record on an autosomal
    chromosome for which _is_possible_loh returns a value; that value fills the
    freq column. seg_file is unused while heterogeneity block detection is
    disabled. The work is skipped when utils.file_uptodate reports the output
    as current relative to in_file.

    Returns the path to the CSV file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4, "max_freq": 0.6,
              "tumor_only": {"min_freq": 0.10, "max_freq": 0.90},
              "min_depth": 20, "hetblock": {"min_alleles": 25, "allowed_misses": 2}}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Block detection is currently disabled, so the subset branch below is inert.
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Context manager guarantees the variant file handle is closed,
                # including when processing a record raises.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, bcf_in, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Convert a tab-delimited segment file into a CSV of autosomal CNV calls.

    Output columns are chrom, start, end, num.mark and seg.mean, filled from
    the input's chromosome, start, end, probes and log2 columns, which are
    looked up by header name. Rows on non-autosomal chromosomes are dropped.
    Returns the output path; nothing is rewritten when utils.file_uptodate
    reports the output as current.
    """
    prefix = utils.splitext_plus(os.path.basename(cns_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (prefix, svcaller))
    if utils.file_uptodate(out_file, cns_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        with open(cns_file) as in_handle, open(tx_out_file, "w") as out_handle:
            rows = csv.reader(in_handle, dialect="excel-tab")
            writer = csv.writer(out_handle)
            writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
            header = next(rows)
            for fields in rows:
                call = dict(zip(header, fields))
                if not chromhacks.is_autosomal(call["chromosome"]):
                    continue
                writer.writerow([_to_ucsc_style(call["chromosome"]), call["start"], call["end"],
                                 call["probes"], call["log2"]])
    return out_file
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Yield tumor frequencies grouped by chromosome as inputs to HMM.

    Generator producing (chromosome, frequencies, coordinates) tuples. Only
    biallelic SNPs on autosomal chromosomes that satisfy _passes_plus_germline
    are considered. A new group starts whenever the chromosome differs from
    that of the previously kept record, and groups without any collected
    frequency are not emitted.
    """
    chrom = None
    chrom_freqs, chrom_coords = [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            usable = (_is_biallelic_snp(rec) and _passes_plus_germline(rec)
                      and chromhacks.is_autosomal(rec.chrom))
            if not usable:
                continue
            if chrom is None or rec.chrom != chrom:
                # Chromosome changed: hand back what was gathered, then start over
                if chrom and chrom_freqs:
                    yield chrom, chrom_freqs, chrom_coords
                chrom = rec.chrom
                chrom_freqs, chrom_coords = [], []
            stats = _tumor_normal_stats(rec, somatic_info)
            if not (tz.get_in(["tumor", "depth"], stats, 0) > params["min_depth"]):
                continue
            # not a ref only call
            if len(rec.samples) == 0 or sum(rec.samples[somatic_info.tumor_name].allele_indices) > 0:
                chrom_freqs.append(tz.get_in(["tumor", "freq"], stats))
                chrom_coords.append(rec.start)
        # Flush the final chromosome
        if chrom and chrom_freqs:
            yield chrom, chrom_freqs, chrom_coords
def prep_vrn_file(in_file, vcaller, work_dir, somatic_info, writer_class, seg_file=None, params=None):
    """Select heterozygous variants in the normal sample with sufficient depth.

    writer_class implements write_header and write_row to write VCF outputs
    from a record and extracted tumor/normal statistics.

    in_file is the variant file to read, vcaller a label used in the output
    file name and work_dir the directory receiving the output. params falls
    back to the module level PARAMS when empty. seg_file is kept for interface
    compatibility; it is unused while heterogeneity block detection is disabled.

    Only records on autosomal chromosomes for which _is_possible_loh returns
    statistics are written. The work is skipped when utils.file_uptodate
    reports the output as current relative to in_file.

    Returns the path to the prepared output file.
    """
    data = somatic_info.tumor_data
    if not params:
        params = PARAMS
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        # Block detection is currently disabled, so the subset branch below is inert.
        # ready_bed = _identify_heterogeneity_blocks_seg(in_file, seg_file, params, work_dir, somatic_info)
        ready_bed = None
        if ready_bed and utils.file_exists(ready_bed):
            sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        else:
            sub_file = in_file
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = writer_class(out_handle)
                writer.write_header()
                # Context manager guarantees the variant file handle is closed,
                # including when processing a record raises.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        stats = _is_possible_loh(rec, bcf_in, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and stats is not None:
                            writer.write_row(rec, stats)
    return out_file
def _prep_cnv_file(cns_file, svcaller, calls_by_name, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    The segment file is first passed through theta.subset_by_supported with
    calls_by_name; the result is read as tab-delimited text. After skipping the
    header line, each row is unpacked positionally as chrom, start, end, an
    ignored column, log2 and probes, so rows must have exactly six columns.
    Rows on autosomal chromosomes are written with chrom, start, end, num.mark
    and seg.mean columns.

    Returns the output path; nothing is rewritten when utils.file_uptodate
    reports the output as current relative to the subset file.
    """
    in_file = theta.subset_by_supported(cns_file, _cns_to_coords, calls_by_name, work_dir, data,
                                        headers=("chromosome", "#"))
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                    # Skip the header; the next() builtin works on Python 2 and 3,
                    # while reader.next() only exists on Python 2.
                    next(reader)
                    for chrom, start, end, _, log2, probes in reader:
                        if chromhacks.is_autosomal(chrom):
                            writer.writerow([_to_ucsc_style(chrom), start, end, probes, log2])
    return out_file
def _prep_vrn_file(in_file, vcaller, work_dir, calls_by_name, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    The input is first restricted to the regions returned by
    _identify_heterogenity_blocks. A CSV with chrom, start, end and freq
    columns is then written to work_dir, named from in_file and vcaller, with a
    row for each record on an autosomal chromosome for which _is_possible_loh
    returns a value. calls_by_name is currently unused because removal of
    structural variant calls is disabled. The work is skipped when
    utils.file_uptodate reports the output as current relative to in_file.

    Returns the path to the CSV file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4, "max_freq": 0.6, "min_depth": 15,
              "hetblock": {"min_alleles": 25, "allowed_misses": 2}}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        ready_bed = _identify_heterogenity_blocks(in_file, params, work_dir, somatic_info)
        # ready_bed = _remove_sv_calls(het_bed, calls_by_name, somatic_info.tumor_data)
        sub_file = _create_subset_file(in_file, ready_bed, work_dir, data)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Context manager guarantees the variant file handle is closed,
                # including when processing a record raises.
                with pysam.VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
def coverage_interval_from_bed(bed_file, per_chrom=True):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and non
    which depend on the local context.

    Checks coverage per chromosome, avoiding non-standard chromosomes, if per_chrom is set.
    Otherwise does a global check over all regions. The global check performs better for
    strelka2 but not for DeepVariant:

    https://github.com/bcbio/bcbio_validations/tree/master/deepvariant#deepvariant-v06-release-strelka2-stratification-and-initial-gatk-cnn

    Coverage is the number of BED bases divided by the extent between the
    smallest start and largest end seen on autosomal chromosomes. Lines with
    fewer than three whitespace separated fields are ignored. Chromosomes
    (or, for the global check, a total) without a positive extent are skipped
    instead of causing a division by zero.

    Returns "genome" if any computed fraction reaches 0.40, otherwise "targeted".
    """
    total_starts = {}
    total_ends = {}
    bed_bases = collections.defaultdict(int)
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) >= 3:
                chrom, start, end = parts[:3]
                if chromhacks.is_autosomal(chrom):
                    start = int(start)
                    end = int(end)
                    bed_bases[chrom] += (end - start)
                    total_starts[chrom] = min(start, total_starts.get(chrom, sys.maxsize))
                    total_ends[chrom] = max(end, total_ends.get(chrom, 0))
    # Extent spanned by the regions on each chromosome
    spans = dict((c, total_ends[c] - total_starts[c]) for c in bed_bases)
    # can check per chromosome -- any one chromosome with larger, or over all regions
    if per_chrom:
        freqs = [float(bed_bases[c]) / float(spans[c]) for c in sorted(spans) if spans[c] > 0]
    else:
        total_span = sum(spans.values())
        if total_span > 0:
            freqs = [sum(bed_bases.values()) / float(total_span)]
        else:
            # No autosomal regions, or only zero-length ones
            freqs = []
    # Should be importing GENOME_COV_THRESH but get circular imports
    if any(f >= 0.40 for f in freqs):
        return "genome"
    else:
        return "targeted"
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    cns_file is read as tab-delimited text. After skipping the header line,
    the first six fields of each row are unpacked positionally as chrom, start,
    end, an ignored column, log2 and probes; any further columns are dropped.
    Rows on autosomal chromosomes are written with chrom, start, end, num.mark
    and seg.mean columns.

    Returns the output path; nothing is rewritten when utils.file_uptodate
    reports the output as current relative to cns_file.
    """
    in_file = cns_file
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          svcaller))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                    # Skip the header; the next() builtin works on Python 2 and 3,
                    # while reader.next() only exists on Python 2.
                    next(reader)
                    for chrom, start, end, _, log2, probes in (xs[:6] for xs in reader):
                        if chromhacks.is_autosomal(chrom):
                            writer.writerow([_to_ucsc_style(chrom), start, end, probes, log2])
    return out_file
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Collect tumor frequencies per chromosome as inputs to HMM.

    Returns three parallel lists: chromosome names, and for each chromosome a
    list of frequencies and a list of start coordinates. Only biallelic SNPs
    on autosomal chromosomes are considered; a new chromosome entry is opened
    whenever the chromosome differs from the last one recorded.
    """
    chroms, freqs, coords = [], [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not (_is_biallelic_snp(rec) and chromhacks.is_autosomal(rec.chrom)):
                continue
            if not chroms or rec.chrom != chroms[-1]:
                chroms.append(rec.chrom)
                freqs.append([])
                coords.append([])
            stats = _tumor_normal_stats(rec, somatic_info)
            deep_enough = tz.get_in(["tumor", "depth"], stats, 0) > params["min_depth"]
            # not a ref only call
            if deep_enough and sum(rec.samples[somatic_info.tumor_name].allele_indices) > 0:
                freqs[-1].append(tz.get_in(["tumor", "freq"], stats))
                coords[-1].append(rec.start)
    return chroms, freqs, coords
def _prep_cnv_file(cns_file, svcaller, work_dir, data):
    """Write autosomal CNV segments as CSV with log2 and number of marks.

    Reads cns_file as tab-delimited text and resolves columns through its
    header line: chromosome, start, end, probes and log2 become chrom, start,
    end, num.mark and seg.mean in the output. Returns the output path; the
    file is only written when utils.file_uptodate reports it as stale.
    """
    base = utils.splitext_plus(os.path.basename(cns_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (base, svcaller))
    needs_update = not utils.file_uptodate(out_file, cns_file)
    if needs_update:
        with file_transaction(data, out_file) as tx_out_file:
            with open(cns_file) as in_handle, open(tx_out_file, "w") as out_handle:
                segments = csv.reader(in_handle, dialect="excel-tab")
                csv_out = csv.writer(out_handle)
                csv_out.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                columns = next(segments)
                for values in segments:
                    seg = dict(zip(columns, values))
                    if chromhacks.is_autosomal(seg["chromosome"]):
                        out_row = [_to_ucsc_style(seg["chromosome"])]
                        out_row.extend([seg["start"], seg["end"], seg["probes"], seg["log2"]])
                        csv_out.writerow(out_row)
    return out_file
def _prep_vrn_file(in_file, vcaller, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    The input is first reduced with _create_subset_file. A CSV with chrom,
    start, end and freq columns is then written to work_dir, named from in_file
    and vcaller, with a row for each record on an autosomal chromosome for
    which _is_possible_loh returns a value. The work is skipped when
    utils.file_uptodate reports the output as current relative to in_file.

    Returns the path to the CSV file.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4, "max_freq": 0.6, "min_depth": 15}
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          vcaller))
    if not utils.file_uptodate(out_file, in_file):
        sub_file = _create_subset_file(in_file, work_dir, data)
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["chrom", "start", "end", "freq"])
                # Context manager guarantees the variant file handle is closed,
                # including when processing a record raises.
                with VariantFile(sub_file) as bcf_in:
                    for rec in bcf_in:
                        tumor_freq = _is_possible_loh(rec, params, somatic_info)
                        if chromhacks.is_autosomal(rec.chrom) and tumor_freq is not None:
                            writer.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
def coverage_interval_from_bed(bed_file, per_chrom=True):
    """Calculate a coverage interval for the current region BED.

    This helps correctly work with cases of uneven coverage across an analysis
    genome. strelka2 and other model based callers have flags for targeted and non
    which depend on the local context.

    Checks coverage per chromosome, avoiding non-standard chromosomes, if per_chrom is set.
    Otherwise does a global check over all regions. The global check performs better for
    strelka2 but not for DeepVariant:

    https://github.com/bcbio/bcbio_validations/tree/master/deepvariant#deepvariant-v06-release-strelka2-stratification-and-initial-gatk-cnn

    Coverage is the number of BED bases divided by the extent between the
    smallest start and largest end seen on autosomal chromosomes. Lines with
    fewer than three whitespace separated fields are ignored. Chromosomes
    (or, for the global check, a total) without a positive extent are skipped
    instead of causing a division by zero.

    Returns "genome" if any computed fraction reaches 0.40, otherwise "targeted".
    """
    total_starts = {}
    total_ends = {}
    bed_bases = collections.defaultdict(int)
    with utils.open_gzipsafe(bed_file) as in_handle:
        for line in in_handle:
            parts = line.split()
            if len(parts) < 3:
                continue
            chrom, start, end = parts[:3]
            if not chromhacks.is_autosomal(chrom):
                continue
            start = int(start)
            end = int(end)
            bed_bases[chrom] += (end - start)
            total_starts[chrom] = min(start, total_starts.get(chrom, sys.maxsize))
            total_ends[chrom] = max(end, total_ends.get(chrom, 0))
    chroms = sorted(bed_bases.keys())
    # can check per chromosome -- any one chromosome with larger, or over all regions
    if per_chrom:
        freqs = []
        for c in chroms:
            extent = total_ends[c] - total_starts[c]
            # A zero-length extent carries no coverage information
            if extent > 0:
                freqs.append(float(bed_bases[c]) / float(extent))
    else:
        total_extent = sum(total_ends[c] - total_starts[c] for c in chroms)
        if total_extent > 0:
            freqs = [sum(bed_bases[c] for c in chroms) / float(total_extent)]
        else:
            # No autosomal regions, or only zero-length ones
            freqs = []
    # Should be importing GENOME_COV_THRESH but get circular imports
    if any(f >= 0.40 for f in freqs):
        return "genome"
    else:
        return "targeted"
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Collect per-chromosome frequencies as inputs to HMM.

    Returns three parallel lists: chromosome names, and for each chromosome a
    list of frequencies and a list of start coordinates. Only biallelic SNPs
    on autosomal chromosomes are considered. Depth and frequency are read from
    the "normal" entry of the tumor/normal statistics, while the non-reference
    check uses the tumor sample's allele indices.
    """
    def _open_chromosome(name):
        chroms.append(name)
        freqs.append([])
        coords.append([])

    chroms, freqs, coords = [], [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not _is_biallelic_snp(rec) or not chromhacks.is_autosomal(rec.chrom):
                continue
            if len(chroms) == 0 or chroms[-1] != rec.chrom:
                _open_chromosome(rec.chrom)
            stats = _tumor_normal_stats(rec, somatic_info)
            normal_depth = tz.get_in(["normal", "depth"], stats, 0)
            if not (normal_depth > params["min_depth"]):
                continue
            # not a ref only call
            tumor_alleles = rec.samples[somatic_info.tumor_name].allele_indices
            if sum(tumor_alleles) > 0:
                freqs[-1].append(tz.get_in(["normal", "freq"], stats))
                coords[-1].append(rec.start)
    return chroms, freqs, coords
def _freqs_by_chromosome(in_file, params, somatic_info):
    """Stream tumor frequencies and positions per chromosome as inputs to HMM.

    Generator producing (chromosome, frequencies, coordinates) tuples from
    biallelic SNPs on autosomal chromosomes that satisfy _passes_plus_germline.
    A group ends when a kept record lies on a different chromosome than the
    previous one; groups that collected no frequency are not emitted.
    """
    def _keep(rec):
        return (_is_biallelic_snp(rec) and _passes_plus_germline(rec)
                and chromhacks.is_autosomal(rec.chrom))

    def _has_alt_call(rec):
        # not a ref only call
        if len(rec.samples) == 0:
            return True
        return sum(rec.samples[somatic_info.tumor_name].allele_indices) > 0

    active = None
    values, positions = [], []
    with pysam.VariantFile(in_file) as bcf_in:
        for rec in bcf_in:
            if not _keep(rec):
                continue
            if active is None or active != rec.chrom:
                if active and len(values) > 0:
                    yield active, values, positions
                active, values, positions = rec.chrom, [], []
            stats = _tumor_normal_stats(rec, somatic_info)
            if tz.get_in(["tumor", "depth"], stats, 0) > params["min_depth"] and _has_alt_call(rec):
                values.append(tz.get_in(["tumor", "freq"], stats))
                positions.append(rec.start)
        # Emit whatever was gathered for the last chromosome
        if active and len(values) > 0:
            yield active, values, positions