def _af_filter(data, in_file, out_file):
    """Soft-filter MuTect2 calls by allele fraction (appends "MinAF" to FILTER).

    The threshold is ``min_allele_fraction`` from the algorithm section of
    ``data["config"]``, read as a percentage (10 is passed as the fallback).
    A record is flagged when every FORMAT/AF value of the sample named by
    ``dd.get_sample_name(data)`` is below the threshold; all records are
    written either way.

    The uncompressed VCF derived from ``out_file`` is only produced when
    neither it nor its ``.gz`` companion exists.  Returns the result of
    ``vcfutils.bgzip_and_index`` on that uncompressed path.
    """
    af_key = ("algorithm", "min_allele_fraction")
    min_freq = float(utils.get_in(data["config"], af_key, 10)) / 100.0
    logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    already_present = (utils.file_exists(ungz_out_file)
                       or utils.file_exists(ungz_out_file + ".gz"))
    if not already_present:
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            # Tell the reader of the header where the threshold came from.
            if utils.get_in(data["config"], af_key):
                threshold_origin = '(configured in bcbio as min_allele_fraction)'
            else:
                threshold_origin = ('(default threshold in bcbio; override with '
                                    'min_allele_fraction in the algorithm section)')
            description = 'Allele frequency is lower than %s%% ' % (min_freq*100) + threshold_origin
            vcf.add_filter_to_header({'ID': 'MinAF', 'Description': description})
            w = Writer(tx_out_file, vcf)
            # GATK 3.x can produce VCFs without sample names for empty VCFs
            try:
                tumor_index = vcf.samples.index(dd.get_sample_name(data))
            except ValueError:
                tumor_index = None
            for rec in vcf:
                if tumor_index is not None:
                    tumor_af = rec.format('AF')[tumor_index]
                    if np.all(tumor_af < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _af_filter(data, in_file, out_file):
    """Soft-filter MuTect2 calls by allele fraction (appends "MinAF" to FILTER).

    The threshold is ``min_allele_fraction`` from the algorithm section of
    ``data["config"]``, read as a percentage (10 is passed as the fallback).
    A record is flagged when every FORMAT/AF value of the sample named
    ``data['description']`` is below the threshold; all records are written
    either way.

    When the sample name is not present in the VCF (e.g. an empty VCF written
    without sample columns) no record is flagged instead of failing with a
    ``ValueError``.

    The uncompressed VCF derived from ``out_file`` is only produced when
    neither it nor its ``.gz`` companion exists.  Returns the result of
    ``vcfutils.bgzip_and_index`` on that uncompressed path.
    """
    min_freq = float(
        utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info(
        "Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(
            ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq * 100) +
                               ('(configured in bcbio as min_allele_fraction)'
                                if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                                else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')
            })
            w = Writer(tx_out_file, vcf)
            try:
                # list.index raises ValueError when the VCF carries no column
                # for this sample; treat that as "nothing to filter".
                try:
                    tumor_index = vcf.samples.index(data['description'])
                except ValueError:
                    tumor_index = None
                for rec in vcf:
                    if tumor_index is not None and np.all(rec.format('AF')[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                    w.write_record(rec)
            finally:
                # Always release the writer, even if a record fails mid-way.
                w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def mark_missing_sites(vcffile, region, missing_threshold, soft_filter):
    """Annotate sites in ``region`` with their missingness and stream them out.

    Adds a ``MISSING`` FILTER and a ``MISSINGPCT`` INFO definition to the
    header, then for every variant computes its missingness, derives a verdict
    via ``variant_missing_criteria`` and updates the record through
    ``update_variant``.  Records with verdict "pass" are always written to
    '-'; records with verdict "fail" are written only when ``soft_filter`` is
    truthy.  A one-line summary is printed to stderr at the end.
    """
    vcf = VCF(vcffile)
    vcf.add_filter_to_header({
        'ID': 'MISSING',
        'Description': 'failed variant site missingness threshold ({} %)'.format(
            missing_threshold),
    })
    vcf.add_info_to_header({
        'ID': 'MISSINGPCT',
        'Description': 'site missingness percentage',
        'Type': 'Float',
        'Number': '1',
    })

    out = Writer('-', vcf)
    total_sites = 0
    noted_sites = 0
    for variant in vcf(region):
        total_sites += 1
        missing_pct, missing, total = compute_missingness(variant)
        verdict = variant_missing_criteria(missing_threshold, missing_pct)
        variant = update_variant(variant, verdict, missing_pct)

        passed = verdict == "pass"
        if passed:
            noted_sites += 1
        # Failing sites are kept in the output only in soft-filter mode.
        if passed or (verdict == "fail" and soft_filter):
            out.write_record(variant)
    out.close()

    summary = "After filtering, passed {} out of a possible {} Sites ({})".format(
        noted_sites, total_sites, 'pass')
    print(summary, file=sys.stderr)
def setUp(self):
    """Load ``test.vcf`` (next to this file) and cache its variants for the tests."""
    # load test data
    # store every variant object on the instance for testing
    test_directory = os.path.dirname(os.path.abspath(__file__))
    vcf_path = os.path.join(test_directory, "test.vcf")
    reader = VCF(vcf_path)

    self.test_filter = refilter.Filter(0.3, 0.7, 'AB', 'VAR_DP', 5, ['MISSING'], ['DB'])
    # The header must know the filter/info tags before records are read.
    reader.add_filter_to_header(self.test_filter.filtered_header())
    reader.add_info_to_header(self.test_filter.rescued_header())

    self.variants = list(reader)
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter somatic calls with AF<min_allele_fraction

    Records are never dropped: in paired (somatic) mode a record whose tumor
    AF is below the threshold gets "MinAF" appended to FILTER.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    The uncompressed VCF derived from ``out_file`` is only produced when
    neither it nor its ``.gz`` companion exists.  Returns the result of
    ``vcfutils.bgzip_and_index`` on that uncompressed path.
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:, 0]
                else:
                    ad = rec.format("AD")
                    if ad is not None:  # germline?
                        alt_counts = ad[:, 1:]  # AD=REF,ALT1,ALT2,...
                        # keepdims gives dp one column per sample row, so the
                        # division below is per-sample regardless of how many
                        # samples or ALT alleles the record has.
                        dp = np.sum(ad, axis=1, keepdims=True)
                    else:  # germline gVCF record
                        alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _af_annotate_and_filter(paired, items, in_file, out_file):
    """Populate FORMAT/AF and soft-filter somatic calls with AF<min_allele_fraction

    Records are never dropped: in paired (somatic) mode a record whose tumor
    AF is below the threshold gets "MinAF" appended to FILTER.

    Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields:
    somatic
      snps:    GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU                 dp=DP                {ALT}U[0] = alt_counts(tier1,tier2)
      indels:  GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50  dp=DP                TIR = alt_counts(tier1,tier2)
    germline
      snps:    GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS)       dp=sum(AD)           AD = ref_count,alt_counts
      indels:  GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS)             dp=sum(AD)           AD = ref_count,alt_counts

    The uncompressed VCF derived from ``out_file`` is only produced when
    neither it nor its ``.gz`` companion exists.  Returns the result of
    ``vcfutils.bgzip_and_index`` on that uncompressed path.
    """
    data = paired.tumor_data if paired else items[0]
    min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
    logger.info("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
    ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"):
        with file_transaction(data, ungz_out_file) as tx_out_file:
            vcf = VCF(in_file)
            vcf.add_format_to_header({
                'ID': 'AF',
                'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), '
                               'TIR/DPI (somatic indels)',
                'Type': 'Float',
                'Number': '.'})
            vcf.add_filter_to_header({
                'ID': 'MinAF',
                'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + (
                    '(configured in bcbio as min_allele_fraction)'
                    if utils.get_in(data["config"], ("algorithm", "min_allele_fraction"))
                    else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')})
            w = Writer(tx_out_file, vcf)
            tumor_index = vcf.samples.index(data['description'])
            for rec in vcf:
                if paired:  # somatic?
                    if rec.is_snp:  # snps?
                        alt_counts = rec.format(rec.ALT[0] + 'U')[:, 0]  # {ALT}U=tier1_depth,tier2_depth
                    else:  # indels
                        alt_counts = rec.format('TIR')[:, 0]  # TIR=tier1_depth,tier2_depth
                    dp = rec.format('DP')[:, 0]
                else:
                    ad = rec.format("AD")
                    if ad is not None:  # germline?
                        alt_counts = ad[:, 1:]  # AD=REF,ALT1,ALT2,...
                        # keepdims gives dp one column per sample row, so the
                        # division below is per-sample regardless of how many
                        # samples or ALT alleles the record has.
                        dp = np.sum(ad, axis=1, keepdims=True)
                    else:  # germline gVCF record
                        alt_counts, dp = (None, None)
                if dp is not None:
                    with np.errstate(divide='ignore', invalid='ignore'):  # ignore division by zero and put AF=.0
                        af = np.true_divide(alt_counts, dp)
                        af[~np.isfinite(af)] = .0  # -inf inf NaN -> .0
                    rec.set_format('AF', af)
                    if paired and np.all(af[tumor_index] < min_freq):
                        vcfutils.cyvcf_add_filter(rec, 'MinAF')
                w.write_record(rec)
            w.close()
    return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def main(min_allele_balance, max_allele_balance, allele_balance_tag,
         variant_sample_depth_tag, min_depth, exclude_filters, exclude_fields,
         vcf):
    """Re-filter every record of ``vcf`` and stream the result to stdout ('-').

    A ``Filter`` is built from the threshold/tag arguments; its FILTER and
    INFO header definitions are registered on the reader before the writer is
    created from it.  Each variant is passed through the filter (which
    updates the record in place) and then written.

    The writer and reader are closed explicitly, also on error, so the header
    and any buffered records are flushed even when the input has no variants.
    """
    reader = VCF(vcf)
    refilter = Filter(min_allele_balance, max_allele_balance,
                      allele_balance_tag, variant_sample_depth_tag, min_depth,
                      exclude_filters, exclude_fields)
    reader.add_filter_to_header(refilter.filtered_header())
    reader.add_info_to_header(refilter.rescued_header())

    writer = Writer('-', reader)
    try:
        for variant in reader:
            refilter(variant)  # Modifies variant filter status in place
            writer.write_record(variant)
    finally:
        writer.close()
        reader.close()
def main():
    """Annotate the FILTER column of a VCF from a per-locus statistics table.

    Command-line arguments give the VCF, the tab-separated stats file and an
    output prefix, plus the filter thresholds.  ``GetFilters`` computes a
    filter string per stats row; the augmented table is saved as
    ``<out>.tab``.  Each VCF record is then looked up by its INFO/START value:
    "." becomes PASS, anything else is split on ";" into FILTER entries, and
    records without a stats row get "MissingInfo".  Output goes to
    /dev/stdout.
    """
    parser = argparse.ArgumentParser(__doc__)
    required_strings = (
        ("--vcf", "VCF file"),
        ("--statsfile", "File with chrom, start, locus stats"),
        ("--out", "Prefix for output files"),
    )
    for flag, help_text in required_strings:
        parser.add_argument(flag, help=help_text, type=str, required=True)
    float_minimums = (
        ("--min-hwep", "Minimum HWE p-value"),
        ("--min-callrate", "Minimum call rate"),
        ("--min-het", "Minimum heterozygosity"),
    )
    for flag, help_text in float_minimums:
        parser.add_argument(flag, help=help_text, type=float, default=0)
    parser.add_argument("--max-hrun-offset",
                        help="For periods 5+, discard if the ref has "
                             "homopolymer run > period+offset",
                        type=int, default=100000)
    parser.add_argument("--filter-segdup", help="Filter loci overlapping a segdup",
                        action="store_true")
    args = parser.parse_args()

    # Get VCF reader
    reader = VCF(args.vcf)

    # Load locus filters
    sys.stderr.write("Getting filters...\n")
    locstats = pd.read_csv(args.statsfile, sep="\t")
    num_samples = len(reader.samples)
    locstats["FILTER"] = locstats.apply(
        lambda row: GetFilters(row, args, num_samples), axis=1)
    locstats.to_csv(args.out + ".tab", sep="\t", index=False)

    # Get filter dictionary (locus start -> filter string)
    sys.stderr.write("Getting filter dictionary...\n")
    filter_dict = dict(zip(locstats["start"], locstats["FILTER"]))

    # Set filter field
    sys.stderr.write("Setting filter field in VCFs...\n")
    filter_descriptions = {
        "HWE": "HWE less than %s" % args.min_hwep,
        "Callrate": "Callrate less than %s" % args.min_callrate,
        "Het": "Het less than %s" % args.min_het,
        "Hrun": "Hrun greater than %s" % args.max_hrun_offset,
        "Segdup": "Locus in a segmental duplication",
        "MissingInfo": "No stats provided for the locus",
    }
    for filter_id, description in filter_descriptions.items():
        reader.add_filter_to_header({"ID": filter_id, "Description": description})

    writer = Writer("/dev/stdout", reader)
    for record in reader:
        filters = filter_dict.get(record.INFO["START"], "MissingInfo")
        record.FILTER = "PASS" if filters == "." else filters.split(";")
        writer.write_record(record)
    writer.close()
    reader.close()
def test_add_filter_to_header():
    """A FILTER added to the reader's header round-trips through a Writer."""
    reader = VCF(VCF_PATH)
    # NOTE that we have to add the filter to the header of the reader,
    # not the writer because the record will be associated with the reader
    reader.add_filter_to_header({'ID': 'abcdefg', 'Description': 'abcdefg'})

    # mkstemp creates the file atomically (mktemp only returns a name and is
    # race-prone); the Writer reopens the path, so the descriptor is closed.
    fd, path = tempfile.mkstemp(suffix=".vcf")
    os.close(fd)
    atexit.register(os.unlink, path)

    w = Writer(path, reader)
    rec = next(reader)  # builtin next(), matching the read-back below
    rec.FILTER = ["abcdefg"]
    w.write_record(rec)
    w.close()

    written = next(VCF(path))
    assert written.FILTER == "abcdefg", written.FILTER
def filter_cli(input, output, trash, params_file, index_sample, immediate_return):
    """Split the records of ``input`` into a kept file and a trash file.

    Every member of ``FilterClass`` contributes a FILTER header definition.
    ``Filterer`` yields ``(record, failed_filters)`` pairs: records with no
    failed filters (``None`` or empty) are written unchanged to ``output``;
    the others get their FILTER set to the failed filter names and are
    written to ``trash``.
    """
    vcf = VCF(input, gts012=True)
    idx = vcf.samples.index(index_sample)
    for member in FilterClass:
        vcf.add_filter_to_header(member.value)

    kept_writer = Writer(output, vcf)
    trash_writer = Writer(trash, vcf)
    params = FilterParams(params_file)
    for record, failed in Filterer(vcf, params, idx, immediate_return):
        if failed is not None and len(failed) != 0:
            record.FILTER = [flt.name for flt in failed]
            trash_writer.write_record(record)
        else:
            kept_writer.write_record(record)
    kept_writer.close()
    trash_writer.close()
pass_threshold = 0 if args.conditions is None: wantFilters = False else: conditions = formatString(args.INFO_conditions) pass_threshold += 1 if args.GTconditions is None: wantGTFilters = False else: pass_threshold += 1 vcf = VCF(args.inputvcf) vcf.add_filter_to_header({ 'ID': args.filtername, 'Description': 'SV filter ' + args.filtername }) for v in vcf: #Apply variant filters if wantFilters == True: evaluateINFO(args.conditions_str, v) #result = vt.evaluateINFO(line,cond,args.logic) #if result == True: # is_filtered += 1 #Apply per-sample filters if wantGTFilters == True: cond = args.GTconditions.split(",")
# API and example # https://brentp.github.io/cyvcf2/ # https://brentp.github.io/cyvcf2/docstrings.html#api invcf = VCF( '/dev/stdin', lazy=True, gts012=True ) # if gts012=True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. # invcf = VCF('test.vcf.gz', lazy=True) # adjust the header to contain the new field # the keys 'ID', 'Description', 'Type', and 'Number' are required. invcf.add_filter_to_header({ 'ID': 'VCFSiteMissingFilter.py', 'Description': 'Exclude the site with missing rate higher than > ' + str(MISS_THRESHOLD) }) # create a new vcf Writer using the input vcf as a template. # Only need to write out updated VCF header. # Other parts output as string. outvcf = Writer('/dev/stdout', invcf) outvcf.close() for variant in invcf: missing_rate = 1 - variant.call_rate if missing_rate <= MISS_THRESHOLD: ss = str(variant).split() sys.stdout.write(str(variant))
def fix_vcf(vcf, output, fai, jasmine=False):
    """Clean up a structural-variant VCF and write a sorted copy.

    Args:
        vcf: path of the input VCF.
        output: path of the sorted result; when falsy it is derived from
            ``vcf`` by replacing ".vcf" with "_fixed.vcf".
        fai: path of a FASTA index; the first two whitespace-separated
            columns are read as chromosome name and size.
        jasmine: when true, STRANDS and AF INFO definitions are added to the
            header as well.

    Per record: chrM records are skipped; a start of -1 is reset to position
    0; END is removed from BND records whose CHR2 differs from CHROM; records
    whose END lies beyond the CHR2 chromosome size, or whose SVLEN is
    999999999, get SVLEN=1 and the TRUNCATED flag (the former also get
    END=start+1).  Records are written to a temporary VCF that is then passed
    to ``vcf_sort`` together with ``output``.  Counts of the changes are
    reported on stderr.
    """
    import os  # local import: only needed to release the mkstemp descriptor

    # Close the index file deterministically instead of leaking the handle.
    with open(fai) as fai_handle:
        chromsizes = {}
        for line in fai_handle:
            fields = line.split()
            chromsizes[fields[0]] = int(fields[1])

    vcf_in = VCF(vcf)
    if not output:
        output = vcf.replace(".vcf", "_{}.vcf".format("fixed"))
    vcf_in.add_info_to_header({
        'ID': 'TRUNCATED',
        'Description': "SVLEN truncated",
        'Type': 'Flag',
        'Number': '0'
    })
    vcf_in.add_info_to_header({
        'ID': 'STRANDS2',
        'Description': "alt reads first +,alt reads first -,alt reads second +,alt reads second -.",
        'Type': 'Integer',
        'Number': '4'
    })
    vcf_in.add_info_to_header({
        'ID': 'Strandbias_pval',
        'Description': "P-value for fisher exact test for strand bias.",
        'Type': 'Float',
        'Number': 'A'
    })
    vcf_in.add_filter_to_header({
        'ID': 'STRANDBIAS',
        'Description': "Strand is biased if Strandbias_pval< 0.01."
    })
    if jasmine:
        vcf_in.add_info_to_header({
            'ID': 'STRANDS',
            'Description': "foo",
            'Type': 'String',
            'Number': '1'
        })
        vcf_in.add_info_to_header({
            'ID': 'AF',
            'Description': "foo",
            'Type': 'Float',
            'Number': '1'
        })

    handle, interm_output = tempfile.mkstemp(suffix=".vcf")
    # The Writer opens the path itself; the descriptor returned by mkstemp
    # would otherwise stay open for the lifetime of the process.
    os.close(handle)
    vcf_out = Writer(interm_output, vcf_in)

    records_fixed = 0
    records_truncated = 0
    mito_variants = 0
    interchromosomal_bnds = 0
    for v in vcf_in:
        if v.CHROM == 'chrM':
            mito_variants += 1
            continue
        if v.start == -1:
            v.set_pos(0)
            records_fixed += 1
        try:
            if (v.INFO.get('SVTYPE') == 'BND') and (v.CHROM != v.INFO.get('CHR2')):
                del v.INFO['END']
                interchromosomal_bnds += 1
        except KeyError:
            pass
        try:
            end = v.INFO.get('END')
            # A record without END cannot run past the chromosome end;
            # comparing an int with None would raise TypeError, which the
            # KeyError handler below does not cover.
            if end is not None and chromsizes[v.INFO.get('CHR2')] < end:
                v.INFO['SVLEN'] = 1
                v.INFO['END'] = v.start + 1
                v.INFO['TRUNCATED'] = True
                records_truncated += 1
        except KeyError:
            pass
        if v.INFO.get('SVLEN') == 999999999:
            v.INFO['SVLEN'] = 1
            v.INFO['TRUNCATED'] = True
        vcf_out.write_record(v)
    vcf_out.close()

    vcf_sort(interm_output, output)

    if mito_variants != 0:
        sys.stderr.write(f"Removed {mito_variants} records on chrM.\n")
    if records_fixed != 0:
        sys.stderr.write(f"Fixed {records_fixed} records.\n")
    if records_truncated != 0:
        sys.stderr.write(
            f"Truncated {records_truncated} records where END > chromosome size\n"
        )
    if interchromosomal_bnds != 0:
        sys.stderr.write(
            f"Dropped END for {interchromosomal_bnds} interchromosomal BNDs\n")
ShowFormat() sys.exit(-1) mingq = int(args['--mingq']) # API and example # https://brentp.github.io/cyvcf2/ # https://brentp.github.io/cyvcf2/docstrings.html#api invcf = VCF('/dev/stdin', lazy=True, gts012=True) # invcf = VCF('test.vcf.gz', lazy=True) # adjust the header to contain the new field # the keys 'ID', 'Description', 'Type', and 'Number' are required. invcf.add_filter_to_header({ 'ID': 'VCFGQFilter.py', 'Description': 'Mask genotype as missing if GQ value < ' + str(mingq) }) # create a new vcf Writer using the input vcf as a template. # Only need to write out updated VCF header. # Other parts output as string. sys.stdout.write('%s' % (invcf.raw_header)) # outvcf = Writer('/dev/stdout', invcf) # outvcf.close() # Cache data for faster process. DATA_COL = 9 FMT_COL = 8 FMT_STRING_CACHE = '' DP_COL = -1
for name, filt in filters.items(): if name in BUILTIN_FILTERS: if not isinstance(filt, tuple): filt = (filt, ) filters[name] = lambda variant: BUILTIN_FILTERS[name](variant, *filt) filters[name].__doc__ = BUILTIN_FILTERS[name].__doc__ else: filters[name] = eval(filt) filters[name].__doc__ = filter_descs.get(name, filt) invcf = VCF(infile) for name, filt in filters.items(): invcf.add_filter_to_header({ 'ID': name, 'Description': filt.__doc__, }) if outfile.endswith(".gz"): outvcf = Writer(outfile, invcf, "wz") else: outvcf = Writer(outfile, invcf) for variant in invcf: for name, filt in filters.items(): if not filt(variant): if not variant.FILTER: variant.FILTER = name else: variant.FILTER = f"{variant.FILTER};{name}" if variant.FILTER and not keep:
def add_filters_to_header(self, vcf: VCF):
    """Register a FILTER header line on ``vcf`` for every enabled filter.

    A filter counts as enabled when its threshold attribute is ``> 0``
    (depth, FED, mapping quality, QUAL, strand bias, FRS and the BQB / MQB /
    RPB / VDB biases), ``is not None`` (the z-test thresholds) or ``!= 0``
    (segregation bias).  Header lines are added in the order the checks
    appear below, and each one is also logged at debug level.
    """
    # (label used in the debug message, header dict), in registration order.
    pending = []

    if self.min_depth > 0:
        pending.append(("min. depth", {
            "ID": str(Tags.LowDepth),
            "Description": (
                f"Depth ({Tags.Depth}) less than {self.min_depth} - i.e., {Tags.Depth}<{self.min_depth:.1f}"
            ),
        }))
    if self.min_fed > 0:
        pending.append(("min. FED", {
            "ID": str(Tags.LowFed),
            "Description": (
                f"High-quality depth of the called allele as a fraction of the expected (median; {self.expected_depth}) is "
                f"less than {self.min_fed}"
            ),
        }))
    if self.min_mq > 0:
        # The debug label used to read "min. depth" here (copy-paste slip).
        pending.append(("min. mapping quality", {
            "ID": str(Tags.LowMapQual),
            "Description": (
                f"Mapping quality ({Tags.MapQual.value}) less than {self.min_mq} - i.e., {Tags.MapQual.value}<{self.min_mq:.0f}"
            ),
        }))
    if self.max_depth > 0:
        pending.append(("max. depth", {
            "ID": str(Tags.HighDepth),
            "Description": (
                f"Depth ({Tags.Depth}) more than {self.max_depth} - i.e., {Tags.Depth}>{self.max_depth:.1f}"
            ),
        }))
    if self.min_qual > 0:
        pending.append(("min. QUAL", {
            "ID": str(Tags.LowQual),
            "Description": f"QUAL less than {self.min_qual}",
        }))
    if self.min_strand_bias > 0:
        pending.append(("strand bias", {
            "ID": str(Tags.StrandBias),
            "Description": (
                f"A strand on the called allele has less than "
                f"{self.min_strand_bias:.2%} of the high-quality depth for that "
                f"allele. This is judged on the {Tags.StrandDepth} tag."
            ),
        }))
    if self.min_frs > 0:
        pending.append(("min. FRS", {
            "ID": str(Tags.LowSupport),
            "Description": f"Fraction of read support on called allele is less than {self.min_frs}",
        }))
    if self.min_bqb > 0:
        pending.append(("min. base quality bias", {
            "ID": str(Tags.LowBaseQualBias),
            "Description": (
                f"Base Quality Bias ({Tags.BaseQualBias}) is less than "
                f"{self.min_bqb}."
            ),
        }))
    if self.min_mqb > 0:
        pending.append(("min. mapping quality bias", {
            "ID": str(Tags.LowMapQualBias),
            "Description": (
                f"Mapping Quality Bias ({Tags.MapQualBias}) is less than "
                f"{self.min_mqb}."
            ),
        }))
    if self.min_rpb > 0:
        pending.append(("min. read position bias", {
            "ID": str(Tags.LowReadPosBias),
            "Description": (
                f"Read Position Bias ({Tags.ReadPosBias}) is less than "
                f"{self.min_rpb}."
            ),
        }))
    if self.min_rpbz is not None:
        pending.append(("min. read position bias z-test", {
            "ID": str(Tags.LowReadPosBiasZ),
            "Description": (
                f"Read Position Bias z-test score ({Tags.ReadPosBiasZ}) is less than "
                f"{self.min_rpbz}."
            ),
        }))
    if self.max_rpbz is not None:
        pending.append(("max. read position bias z-test", {
            "ID": str(Tags.HighReadPosBiasZ),
            "Description": (
                f"Read Position Bias z-test score ({Tags.ReadPosBiasZ}) is more than "
                f"{self.max_rpbz}."
            ),
        }))
    if self.max_scbz is not None:
        pending.append(("max. soft-clip length bias z-test", {
            "ID": str(Tags.HighSoftClipBiasZ),
            "Description": (
                f"Soft-Clip Length Bias z-test score ({Tags.SoftClipBiasZ}) is more than "
                f"{self.max_scbz}."
            ),
        }))
    if self.max_sgb != 0:
        pending.append(("max. segregation bias", {
            "ID": str(Tags.HighSegBias),
            "Description": (
                f"Segregation-based metric ({Tags.SegregationBias}) is greater "
                f"than {self.max_sgb}."
            ),
        }))
    if self.min_vdb > 0:
        pending.append(("min. variant distance bias", {
            "ID": str(Tags.LowVarDistBias),
            "Description": (
                f"Variant distance bias ({Tags.VariantDistanceBias}) is less "
                f"than {self.min_vdb}."
            ),
        }))

    for label, header in pending:
        vcf.add_filter_to_header(header)
        logging.debug(f"Header for {label}: {header}")
ShowFormat() sys.exit(-1) MRR_THRESHOLD = float(args['-c']) # the threshold minor reads ratio. # API and example # https://brentp.github.io/cyvcf2/ # https://brentp.github.io/cyvcf2/docstrings.html#api invcf = VCF('/dev/stdin', lazy=True, gts012=True) # invcf = VCF('test.vcf.gz', lazy=True) # adjust the header to contain the new field # the keys 'ID', 'Description', 'Type', and 'Number' are required. invcf.add_filter_to_header({ 'ID': 'VCFHOMOMinorReadsRatioFilter.py', 'Description': 'Mask the genotype as missing if the MRR >= ' + args['-c'] }) # create a new vcf Writer using the input vcf as a template. # Only need to write out updated VCF header. # Other parts output as string. sys.stdout.write('%s' % (invcf.raw_header)) # outvcf = Writer('/dev/stdout', invcf) # outvcf.close() # Cache data for faster process. DATA_COL = 9 FMT_COL = 8 FMT_STRING_CACHE = '' DP_COL = -1
def add_filters_to_header(self, vcf: VCF):
    """Register a FILTER header line on ``vcf`` for every enabled filter.

    A filter counts as enabled when its threshold attribute is ``> 0``
    (``!= 0`` for the segregation bias).  Header lines are added in the order
    the checks appear below, and each one is also logged at debug level.
    """
    # (label used in the debug message, header dict), in registration order.
    pending = []

    if self.min_depth > 0:
        pending.append(("min. depth", {
            "ID": str(Tags.LowDepth),
            "Description": (f"Depth ({Tags.Depth}) less than {self.min_depth_frac:.1%} the "
                            f"expected depth of {self.expected_depth:.1f}. "
                            f"{Tags.Depth}<{self.min_depth:.1f}"),
        }))
    if self.max_depth > 0:
        pending.append(("max. depth", {
            "ID": str(Tags.HighDepth),
            "Description": (f"Depth ({Tags.Depth}) more than {self.max_depth_frac:.1%} the "
                            f"expected depth of {self.expected_depth:.1f}. "
                            f"{Tags.Depth}>{self.max_depth:.1f}"),
        }))
    if self.min_qual > 0:
        pending.append(("min. QUAL", {
            "ID": str(Tags.LowQual),
            "Description": f"QUAL less than {self.min_qual}",
        }))
    if self.min_strand_bias > 0:
        pending.append(("strand bias", {
            "ID": str(Tags.StrandBias),
            "Description": (f"A strand on the called allele has less than "
                            f"{self.min_strand_bias:.2%} of the high-quality depth for that "
                            f"allele. This is judged on the {Tags.StrandDepth} tag."),
        }))
    if self.min_bqb > 0:
        pending.append(("min. base quality bias", {
            "ID": str(Tags.LowBaseQualBias),
            "Description": (f"Base Quality Bias ({Tags.BaseQualBias}) is less than "
                            f"{self.min_bqb}."),
        }))
    if self.min_mqb > 0:
        pending.append(("min. mapping quality bias", {
            "ID": str(Tags.LowMapQualBias),
            "Description": (f"Mapping Quality Bias ({Tags.MapQualBias}) is less than "
                            f"{self.min_mqb}."),
        }))
    if self.min_rpb > 0:
        pending.append(("min. read position bias", {
            "ID": str(Tags.LowReadPosBias),
            "Description": (f"Read Position Bias ({Tags.ReadPosBias}) is less than "
                            f"{self.min_rpb}."),
        }))
    if self.max_sgb != 0:
        pending.append(("max. segregation bias", {
            "ID": str(Tags.HighSegBias),
            "Description": (f"Segregation-based metric ({Tags.SegregationBias}) is greater "
                            f"than {self.max_sgb}."),
        }))
    if self.min_vdb > 0:
        pending.append(("min. variant distance bias", {
            "ID": str(Tags.LowVarDistBias),
            "Description": (f"Variant distance bias ({Tags.VariantDistanceBias}) is less "
                            f"than {self.min_vdb}."),
        }))

    for label, header in pending:
        vcf.add_filter_to_header(header)
        logging.debug(f"Header for {label}: {header}")