def sort_length(b1, b2):
    """
    Order entries from longest to shortest SVLEN, ties are by alphanumeric of REF
    """
    s1 = truvari.entry_size(b1)
    s2 = truvari.entry_size(b2)
    if s1 < s2:
        return 1
    if s1 > s2:
        return -1
    if b1.ref < b2.ref:
        return 1
    if b1.ref > b2.ref:
        return -1
    return 0

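# Usage sketch (an illustration, not part of the original module): cmp-style
# comparators like sort_length plug into sorted() via functools.cmp_to_key in
# Python 3. The VCF file name here is hypothetical.
from functools import cmp_to_key

def sorted_by_length(vcf_fn):
    """ Return the entries of a VCF ordered longest-first """
    entries = list(pysam.VariantFile(vcf_fn))
    return sorted(entries, key=cmp_to_key(sort_length))
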
def filter_call(self, entry, base=False):
    """
    Returns True if the call should be filtered
    Base has different filtering requirements, so let the method know
    """
    size = truvari.entry_size(entry)
    if size > self.params.sizemax:
        return True

    if base and size < self.params.sizemin:
        return True

    if not base and size < self.params.sizefilt:
        return True

    samp = self.params.bSample if base else self.params.cSample
    prefix = 'b' if base else 'c'
    if self.params.no_ref in ["a", prefix] and not truvari.entry_is_present(entry, samp):
        return True

    if self.params.passonly and truvari.entry_is_filtered(entry):
        return True

    return False

def run(self):
    """
    Find neighbors through a vcf
    """
    last_pos = None
    for entry in self.in_vcf:
        size = truvari.entry_size(entry)
        if not last_pos:
            last_pos = [entry.chrom, entry.start]

        if last_pos[0] == entry.chrom and last_pos[1] > entry.start:
            logging.error("File is not sorted %s:%d before %s:%d",
                          last_pos[0], last_pos[1], entry.chrom, entry.start)
            sys.exit(1)

        if entry.chrom != last_pos[0]:
            self.chrom_end_flush()
            self.neigh_id += 1

        last_pos = [entry.chrom, entry.start]

        if size < self.sizemin or (self.passonly and truvari.filter_value(entry)):
            self.out_vcf.write(entry)
            continue

        # Make new range
        start, end = truvari.entry_boundaries(entry)
        cur_range = [start, end, entry, 0]
        self.flush_push_stack(cur_range)
    self.chrom_end_flush()

def process_entries(ref_section):
    """
    Process vcf lines from a reference section
    """
    chrom, start, stop = ref_section
    logging.debug(f"Starting region {chrom}:{start}-{stop}")
    setproctitle(f"trf {chrom}:{start}-{stop}")
    vcf = pysam.VariantFile(trfshared.args.input)
    to_consider = []
    for entry in vcf.fetch(chrom, start, stop):
        # Prevent duplication
        if not start <= entry.start < stop:
            continue
        if truvari.entry_size(entry) >= trfshared.args.min_length:
            to_consider.append(entry)

    if not to_consider:
        return (chrom, start, stop, "")

    tanno = TRFAnno(executable=trfshared.args.executable,
                    trf_params=trfshared.args.trf_params)
    tanno.run_trf(to_consider)

    v = pysam.VariantFile(trfshared.args.input)
    new_header = edit_header(v.header)
    out = StringIO()
    decimal.getcontext().prec = 1
    for entry in v.fetch(chrom, start, stop):
        # Prevent duplication
        if not start <= entry.start < stop:
            continue
        if truvari.entry_size(entry) >= trfshared.args.min_length:
            key = f"{entry.chrom}:{entry.start}-{entry.stop}.{hash(entry.alts[0])}"
            entry = tanno.annotate(entry, key, new_header)
        out.write(str(entry))

    out.seek(0)
    setproctitle(f"trf done {chrom}:{start}-{stop}")
    logging.debug(f"Done region {chrom}:{start}-{stop}")
    return (chrom, start, stop, out.read())

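# Dispatch sketch (an assumption, not shown above): because process_entries
# takes a picklable (chrom, start, stop) tuple and reads shared state from
# trfshared.args, it can be fanned out with a standard multiprocessing.Pool.
# The `regions` iterable and worker count are hypothetical inputs.
import multiprocessing

def run_regions(regions, threads=4):
    """ Map process_entries over regions, emitting annotated text as workers finish """
    with multiprocessing.Pool(threads) as pool:
        for chrom, start, stop, text in pool.imap_unordered(process_entries, regions):
            logging.debug("collected %s:%d-%d", chrom, start, stop)
            sys.stdout.write(text)
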
def annotate_entry(self, entry):
    """
    Annotates entries in the vcf and writes to new vcf
    """
    if truvari.entry_size(entry) >= self.min_length:
        entry.translate(self.n_header)
        remap, hits = self.remap_entry(entry)
        entry.info["REMAP"] = remap
        if self.anno_hits and hits:
            entry.info["REMAPHits"] = [_[1] for _ in hits[-self.anno_hits:]]
    return entry

def annotate_entry(self, entry, hits):
    """
    Annotates a single entry with given hits
    """
    best_hit_pct = 0
    best_hit = None
    entry_size = truvari.entry_size(entry)
    for hit in hits:
        size_aln = abs(hit["RM_qstart"] - hit["RM_qend"]) + 1
        pct = size_aln / entry_size
        # The TR that covers the most of the sequence
        # I'm taking the single best... So I might be 'under annotating'
        # Also, I might want to consider the score?
        if pct >= self.threshold and pct > best_hit_pct:
            best_hit_pct = pct
            best_hit = hit
    return self.edit_entry(entry, best_hit)

def svinfo_main(cmdargs):
    """
    Main method
    """
    args = parse_args(cmdargs)
    vcf = pysam.VariantFile(args.input)
    n_header = edit_header(vcf)
    with pysam.VariantFile(args.output, 'w', header=n_header) as out:
        for entry in vcf:
            sz = truvari.entry_size(entry)
            if sz >= args.minsize:
                entry.translate(n_header)
                svtype = truvari.entry_variant_type(entry)
                entry.info["SVTYPE"] = svtype
                entry.info["SVLEN"] = sz
            out.write(entry)
    logging.info("Finished svinfo")

def bpovl_main(cmdargs):
    """
    Main method
    """
    args = parse_args(cmdargs)
    in_vcf = pysam.VariantFile(args.input)
    anno_tree, anno_cnt = truvari.build_anno_tree(args.anno, *args.anno_psets)
    out_rows = []
    logging.info("Loaded %d annotations", anno_cnt)

    hit_cnt = 0
    for entry in in_vcf:
        has_hit = False
        start, end = truvari.entry_boundaries(entry)
        span = abs(end - start)
        if span > args.spanmax:
            continue
        svlen = truvari.entry_size(entry)
        if svlen < args.sizemin:
            continue
        key = truvari.entry_to_key(entry)
        for anno_idx in anno_tree[entry.chrom].at(start):
            has_hit = True
            out_rows.append([key, 'start_bnd', anno_idx.data])

        for anno_idx in anno_tree[entry.chrom].at(end):
            has_hit = True
            out_rows.append([key, 'end_bnd', anno_idx.data])

        for anno_idx in anno_tree[entry.chrom].overlap(start, end):
            if start <= anno_idx.begin and anno_idx.end <= end:
                has_hit = True
                out_rows.append([key, 'overlaps', anno_idx.data])
            elif anno_idx.begin <= start and end <= anno_idx.end:
                has_hit = True
                out_rows.append([key, 'contains', anno_idx.data])
        hit_cnt += has_hit

    logging.info("%d SVs hit annotations", hit_cnt)
    out = pd.DataFrame(out_rows, columns=["vcf_key", "intersection", "anno_key"])
    joblib.dump(out, args.output)
    logging.info("Finished bpovl")

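# Downstream sketch (hypothetical file name): the joblib artifact written by
# bpovl_main reloads as a pandas DataFrame, so intersection types can be
# tallied directly.
import joblib

def summarize_bpovl(jl_fn):
    """ Count hits per intersection type from a bpovl output file """
    hits = joblib.load(jl_fn)  # columns: vcf_key, intersection, anno_key
    return hits.groupby("intersection").size()
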
def hompct_main(cmd_args):
    """
    Main
    """
    args = parse_args(cmd_args)

    v = pysam.VariantFile(args.input)

    def get_pct(chrom, start, end):
        tot = 0
        homs = 0
        for entry in v.fetch(chrom,
                             max(0, start - args.buffer),
                             min(v.header.contigs[chrom].length, end + args.buffer)):
            if truvari.entry_size(entry) > args.maxgt:
                continue
            if truvari.get_gt(entry.samples[0]["GT"]).name == "HOM":
                homs += 1
            tot += 1
        if tot < args.mincount:
            return None
        if tot == 0:  # only reachable when mincount is 0; guards division by zero
            return float('nan')
        return float(format((homs / tot) * 100, ".1f"))

    header = v.header.copy()
    header.add_line(('##INFO=<ID=HOMPCT,Number=1,Type=Float,'  # pylint: disable=consider-using-f-string
                     'Description="Percent of calls < %dbp long within %dbp that are homozygous">'
                     ) % (args.maxgt, args.buffer))

    out = pysam.VariantFile(args.output, 'w', header=header)
    v2 = pysam.VariantFile(args.input)
    for entry in v2:
        if truvari.entry_size(entry) >= args.minanno:
            entry.translate(header)
            anno = get_pct(entry.chrom, *truvari.entry_boundaries(entry))
            if anno is not None:
                entry.info["HOMPCT"] = anno
        out.write(entry)
    logging.info("Finished hompct")

def run(self):
    """
    The work
    """
    logging.info("Annotating VCF")
    # should probably 'batch' it instead of running individually
    with pysam.VariantFile(self.in_vcf) as vcf, \
         pysam.VariantFile(self.out_vcf, 'w', header=self.n_header) as output:
        for entry in vcf:
            sz = truvari.entry_size(entry)
            if sz < self.min_length:
                output.write(entry)
                continue
            svtype = truvari.entry_variant_type(entry)
            if svtype == "INS":
                entry = self.annotate_entry(entry, entry.alts[0])
            elif svtype == "DEL":
                entry = self.annotate_entry(entry, entry.ref)
            output.write(entry)

def output_writer(call, outs, sizemin):
    """
    Annotate a MatchResult's entries, write to the appropriate file in outs
    and do the stats counting.
    Writer is responsible for handling FPs between sizefilt-sizemin
    """
    box = outs["stats_box"]
    if call.base:
        box["base cnt"] += 1
        annotate_entry(call.base, call, outs['n_base_header'])
        if call.state:
            gtBase = str(call.base_gt)
            gtComp = str(call.comp_gt)
            box["gt_matrix"][gtBase][gtComp] += 1

            box["TP-base"] += 1
            outs["tpb_out"].write(call.base)
            if call.gt_match:
                box["TP-base_TP-gt"] += 1
            else:
                box["TP-base_FP-gt"] += 1
        else:
            box["FN"] += 1
            outs["fn_out"].write(call.base)

    if call.comp:
        annotate_entry(call.comp, call, outs['n_comp_header'])
        if call.state:
            box["call cnt"] += 1
            box["TP-call"] += 1
            outs["tpc_out"].write(call.comp)
            if call.gt_match:
                box["TP-call_TP-gt"] += 1
            else:
                box["TP-call_FP-gt"] += 1
        elif truvari.entry_size(call.comp) >= sizemin:
            # The if is because we don't count FPs between sizefilt-sizemin
            box["call cnt"] += 1
            box["FP"] += 1
            outs["fp_out"].write(call.comp)

def extract_seqs(self):
    """
    Create the fasta file of all the sequences
    Returns the fasta file name
    """
    ret = tempfile.NamedTemporaryFile(mode='w', delete=False)  # pylint: disable=consider-using-with
    tot_cnt = 0
    cnt = 0
    cntbp = 0
    with pysam.VariantFile(self.in_vcf) as fh:
        for pos, entry in enumerate(fh):
            tot_cnt += 1
            entry_size = truvari.entry_size(entry)
            if self.min_length <= entry_size <= self.max_length:
                cnt += 1
                cntbp += entry_size
                if truvari.entry_variant_type(entry) == "INS":
                    ret.write(f">{pos}\n{entry.alts[0]}\n")
                else:
                    ret.write(f">{pos}\n{entry.ref}\n")
    logging.info(f"Extracted {cnt} sequences ({cntbp}bp) from {tot_cnt} entries")
    return ret.name

def generate_stat_table(vcf_fn, args):
    """
    Given a vcf filename, create per-sample numpy arrays counting over
    [SVTYPE, SZBINS, QUALBINS, GT], plus a "total" array that drops the
    GT dimension
    """
    vcf = pysam.VariantFile(vcf_fn)
    ret = {}
    for i in vcf.header.samples:
        ret[i] = numpy.zeros((len(SV), len(SZBINS), len(QUALBINS), len(GT)))
    ret["total"] = numpy.zeros((len(SV), len(SZBINS), len(QUALBINS)))
    for entry in vcf:
        sv = get_svtype(truvari.entry_variant_type(entry))
        sz = get_sizebin(truvari.entry_size(entry))
        if entry.qual is not None:
            qual, idx = get_scalebin(entry.qual, args.qmin, args.qmax)
        else:
            qual, idx = 0, 0
        for i in vcf.header.samples:
            gt = get_gt(entry.samples[i]["GT"])
            ret[i][sv.value, SZBINS.index(sz), idx, gt.value] += 1
        ret["total"][sv.value, SZBINS.index(sz), idx] += 1
    return ret

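# Query sketch (assuming the SV/SZBINS/GT helpers defined alongside this
# function): each per-sample array indexes as [svtype, sizebin, qualbin, gt],
# so marginal counts come from numpy sums over the unwanted axes.
def count_svtype_by_size(table, sample):
    """ Per-size-bin counts for a sample, summed over quality and genotype """
    return table[sample].sum(axis=(2, 3))  # shape: (len(SV), len(SZBINS))
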
def vcf_to_df(fn, with_info=True, with_fmt=True, sample=None):
    """
    Parse a vcf file and turn it into a dataframe.
    Tries its best to pull info/format tags from the sample information.
    For Formats with Number=G, append _ref, _het, _hom. For things with
    Number=R, append _ref, _alt.
    Specify which sample with its name or index in the VCF.

    :param `fn`: File name of VCF to open and turn into a DataFrame
    :type `fn`: string
    :param `with_info`: Add the INFO fields from the VCF to the DataFrame columns
    :type `with_info`: boolean, optional
    :param `with_fmt`: Add the FORMAT fields from the VCF to the DataFrame columns
    :type `with_fmt`: boolean, optional
    :param `sample`: Sample from the VCF to parse. Only used when with_fmt==True
    :type `sample`: int/string, optional

    :return: Converted VCF
    :rtype: pandas.DataFrame

    Example
        >>> import truvari
        >>> df = truvari.vcf_to_df("repo_utils/test_files/input2.vcf.gz", True, True)
        >>> df.columns
        Index(['id', 'svtype', 'svlen', 'szbin', 'qual', 'filter', 'is_pass',
               'QNAME', 'QSTART', 'QSTRAND', 'SVTYPE', 'SVLEN', 'GT', 'PL_ref',
               'PL_het', 'PL_hom', 'AD_ref', 'AD_alt'],
              dtype='object')
    """
    v = pysam.VariantFile(fn)
    header = ["key", "id", "svtype", "svlen", "szbin", "qual", "filter", "is_pass"]
    info_ops = []
    if with_info:
        info_header, info_ops = tags_to_ops(v.header.info.items())
        logging.debug(info_header)
        header.extend(info_header)

    fmt_ops = []
    if with_fmt:
        # get all the format fields, and how to parse them from header, add them to the header
        fmt_header, fmt_ops = tags_to_ops(v.header.formats.items())
        logging.debug(fmt_header)
        if isinstance(sample, list):
            header.extend([f'{s}_{f}' for s, f in itertools.product(sample, fmt_header)])
        else:
            header.extend(fmt_header)

        if sample is None:
            sample = v.header.samples[0]
        if not isinstance(sample, list):
            sample = [sample]
    else:
        sample = []

    rows = []
    for entry in v:
        varsize = truvari.entry_size(entry)
        filt = list(entry.filter)
        cur_row = [
            truvari.entry_to_key(entry),
            entry.id,
            truvari.entry_variant_type(entry),
            varsize,
            truvari.get_sizebin(varsize),
            entry.qual,
            filt,
            not filt or filt[0] == 'PASS'
        ]

        for i, op in info_ops:
            # Need to make OPs for INFOS..
            cur_row.extend(op(entry.info, i))

        for samp in sample:
            for i, op in fmt_ops:
                cur_row.extend(op(entry.samples[samp], i))
        rows.append(cur_row)

    ret = pd.DataFrame(rows, columns=header)
    ret["szbin"] = ret["szbin"].astype(SZBINTYPE)
    ret["svtype"] = ret["svtype"].astype(SVTYTYPE)
    return ret.set_index("key")