def main(): """The main function """ parser = cmdline_parser() args = parser.parse_args() if args.verbose: LOG.setLevel(logging.INFO) if args.debug: LOG.setLevel(logging.DEBUG) assert os.path.exists(args.bam), ("BAM file %s does not exist" % args.bam) samfh = pysam.Samfile(args.bam) # setup vcf_reader # if args.vcfin == '-': vcf_reader = vcf.VCFReader(sys.stdin) else: vcf_reader = vcf.VCFReader(filename=args.vcfin) variants = [r for r in vcf_reader] LOG.info("Loaded %d variants" % len(variants)) if args.mtc.lower() != 'None': LOG.info("Will use %s for MTC on %s with alpha %f" % (args.mtc, args.mtc_tag, args.mtc_alpha)) else: LOG.info("No multiple testing correction will be done") # setup vcf_writer # if args.vcfout == '-': fh_out = sys.stdout else: if os.path.exists(args.vcfout): LOG.fatal( "Cowardly refusing to overwrite already existing file %s" % (args.vcfout)) sys.exit(1) if args.vcfout[-3:] == '.gz': fh_out = gzip.open(args.vcfout, 'w') else: fh_out = open(args.vcfout, 'w') # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf clone didn't vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep) #vcf_writer = vcf.VCFWriter(fh_out) #vcf_writer.meta_from_reader(vcf_reader) pvalues = [] for (var_no, var) in enumerate(variants): if var_no % 500 == 1: LOG.info("Computing bias for var %d of %d" % (var_no, len(variants))) if var.INFO.has_key('INDEL'): LOG.warn("Skipping unsupported indel variant %s:%d" % (var.CHROM, var.POS)) continue reads = list( samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS)) LOG.debug("%s %d: %d (unfiltered) reads covering position" % (var.CHROM, var.POS, len(reads))) ref_mquals = [] alt_mquals = [] ref_bquals = [] alt_bquals = [] # only for PE #ref_isize = [] #alt_isize = [] # following two meant to test #alt_vpos = [] #rlens = [] for r in reads: if skip_read(r): continue orphan = (r.flag & 0x1) and not (r.flag & 0x2) if orphan and not args.use_orphan: continue if r.mapq < args.min_mq: continue vpos_on_read = [ vpos_on_read for (vpos_on_read, vpos_on_ref) in r.aligned_pairs if vpos_on_ref == var.POS - 1 ] assert len(vpos_on_read) == 1 vpos_on_read = vpos_on_read[0] if vpos_on_read == None: # skip deletions continue #alt_vpos.append(vpos_on_read) #rlens.append(r.rlen) b = r.query[vpos_on_read] bq = ord(r.qqual[vpos_on_read]) - 33 mq = r.mapq if bq < args.min_bq: continue assert len(var.REF) == 1 and len(var.ALT) == 1 if b.upper() == var.REF[0].upper(): ref_mquals.append(mq) ref_bquals.append(bq) #if not args.use_orphan: # ref_isize.append(abs(r.tlen)) elif b.upper() == str(var.ALT[0]).upper(): alt_mquals.append(mq) alt_bquals.append(bq) #if not args.use_orphan: # alt_isize.append(abs(r.tlen)) else: LOG.debug("Skipping non-ref-alt base %s at %s:%d" % (b, var.CHROM, var.POS)) continue LOG.debug("After filtering at %s:%d: %d ref mquals and %d alt mquals" % (var.CHROM, var.POS, len(ref_mquals), len(alt_mquals))) # mannwhitneyu fails if all values the same if len(set(ref_mquals).union(alt_mquals)) == 1: m_pv = 1.0 elif len(ref_mquals) == 0 or len(alt_mquals) == 0: m_pv = 1.0 else: # compute only if alternate quals are smaller on average if mean(alt_mquals) < mean(ref_mquals): ustat = mannwhitneyu(ref_mquals, alt_mquals) m_pv = ustat[1] else: m_pv = 1.0 # same for bqs if len(set(ref_bquals).union(alt_bquals)) == 1: b_pv = 1.0 elif len(ref_bquals) == 0 or len(alt_bquals) == 0: b_pv = 1.0 else: if mean(alt_bquals) < mean(ref_bquals): ustat = mannwhitneyu(ref_bquals, alt_bquals) b_pv = ustat[1] else: b_pv = 1.0 # same for isize-qs #if 
len(ref_isize) and len(alt_isize): # if len(set(ref_isize).union(alt_isize))==1: # i_pv = 1 # else: # ustat = mannwhitneyu(ref_isize, alt_isize) # i_pv = ustat[1] #else: # i_pv = 1 c_pv = fisher_comb(m_pv, b_pv) #import pdb; pdb.set_trace() LOG.debug("%s %d: mb %f bb %f cb %f" % (var.CHROM, var.POS, m_pv, b_pv, c_pv)) var.INFO['MB'] = prob_to_phredqual(m_pv) var.INFO['BB'] = prob_to_phredqual(b_pv) #var.INFO['IB'] = prob_to_phredqual(i_pv) var.INFO['CB'] = prob_to_phredqual(c_pv) if args.mtc.lower() != 'none': pvalues.append(phredqual_to_prob(int(var.INFO[args.mtc_tag]))) if args.mtc.lower() != 'none': ftag = "%s<%f" % (args.mtc, args.mtc_alpha) rej_idxs = [] if args.mtc == 'bonf': rej_idxs = [ i for (i, p) in enumerate( multiple_testing.Bonferroni(pvalues).corrected_pvals) if p < args.mtc_alpha ] elif args.mtc == 'holmbonf': rej_idxs = [ i for (i, p) in enumerate( multiple_testing.Bonferroni(pvalues).corrected_pvals) if p < args.mtc_alpha ] elif args.mtc == 'fdr': rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha) else: raise ValueError("unknown MTC method %s" % args.mtc) for i in rej_idxs: # pyvcf filter is empty if not set. lofreq's vcf clone was . or PASS #if not variants[i].FILTER or variants[i].FILTER in [".", "PASS"]: # new_f = [ftag] #else: # new_f = "%s;%s" % (variants[i].FILTER, ftag) #variants[i] = variants[i]._replace(FILTER=new_f) variants[i].FILTER.append(ftag) LOG.info("%d of %d variants didn't pass filter" % (len(rej_idxs), len(variants))) # pyvcf doesn't need write_metainfo or write_header #vcf_writer.write_metainfo() #vcf_writer.write_header() for var in variants: filtered = len(var.FILTER) > 0 and var.FILTER not in [".", "PASS"] if args.pass_only and filtered: continue # LoFreq's vcf clone called this write_rec() vcf_writer.write_record(var) if fh_out != sys.stdout: fh_out.close()
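# ---------------------------------------------------------------------------
# The conversion and combination helpers used above (phredqual_to_prob,
# prob_to_phredqual, fisher_comb) are defined elsewhere in LoFreq and are not
# part of this section. The definitions below are only a minimal sketch of
# the assumed behaviour -- the usual Phred <-> probability conversion and
# Fisher's method for combining the two bias p-values -- and carry a _sketch
# suffix so they do not clobber the real helpers. The bundled implementations
# may differ in detail (e.g. in how a zero probability is capped).
# ---------------------------------------------------------------------------

import math

from scipy.stats import chi2


def phredqual_to_prob_sketch(phredqual):
    """Assumed helper: convert a Phred-scaled quality to a probability."""
    return 10 ** (-float(phredqual) / 10.0)


def prob_to_phredqual_sketch(prob):
    """Assumed helper: convert a probability to an integer Phred quality."""
    if prob <= 0.0:
        # avoid log10(0); the cap value here is an assumption
        return 2147483647
    return int(round(-10.0 * math.log10(prob)))


def fisher_comb_sketch(pv1, pv2):
    """Assumed helper: combine two p-values with Fisher's method.

    Under the null, -2*(ln(pv1) + ln(pv2)) follows a chi-square
    distribution with 4 degrees of freedom.
    """
    if pv1 == 0.0 or pv2 == 0.0:
        return 0.0
    stat = -2.0 * (math.log(pv1) + math.log(pv2))
    return chi2.sf(stat, 4)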
def main(): """main function """ tmp_vcf_markup = [] parser = cmdline_parser() # WARNING: undocumented arg to remove all defaults (and the reason # why we have to use OptParse) if '--no-defaults' in sys.argv: for (k, v) in parser.defaults.items(): parser.defaults[k] = None sys.argv = [x for x in sys.argv if x != "--no-defaults"] (opts, args) = parser.parse_args() if len(args): parser.error("Unrecognized arguments found: %s." % ( ' '.join(args))) sys.exit(1) if opts.verbose: LOG.setLevel(logging.INFO) if opts.debug: LOG.setLevel(logging.DEBUG) for (in_file, descr) in [(opts.vcf_in, "VCF")]: if not in_file: parser.error("%s input file argument missing." % descr) sys.exit(1) if not os.path.exists(in_file) and in_file != "-": sys.stderr.write( "file '%s' does not exist.\n" % in_file) sys.exit(1) for (out_file, descr) in [(opts.vcf_out, "VCF output file")]: if not out_file: parser.error("%s output file argument missing." % descr) sys.exit(1) if os.path.exists(out_file) and out_file!="-": sys.stderr.write("Cowardly refusing to overwrite existing" " output file '%s'.\n" % out_file) sys.exit(1) if opts.vcf_in == '-': vcf_reader = vcf.VCFReader(sys.stdin) else: if opts.vcf_in[-3:] == '.gz': vcf_reader = vcf.VCFReader(gzip.open(opts.vcf_in,'r')) else: vcf_reader = vcf.VCFReader(open(opts.vcf_in,'r')) snvs = [r for r in vcf_reader] LOG.info("Parsed %d SNVs from %s" % (len(snvs), opts.vcf_in)) # list of tuples: first element is a filter func, which takes a # snv and a filter-id as input. second is the filter id. variant # will be marked as filtered if func returns True filters = [] if opts.min_af != None: vcf_filter = vcf._Filter( id=("minaf%f" % opts.min_af).rstrip('0'), desc="Minimum allele frequency") vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer filters.append(( lambda s, f_id: f_id if s.INFO['AF'] < opts.min_af else None, vcf_filter.id )) if opts.max_cov != None: if not all([s.INFO.has_key('DP') for s in snvs]): LOG.error("At least one SNV was not annotated with depth info (DP)" " (was this file produced with LoFreq?).") sys.exit(1) vcf_filter = vcf._Filter( id="maxcov%d" % opts.max_cov, desc="Maximum coverage") vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer filters.append(( lambda s, f_id: f_id if s.INFO['DP'] > opts.max_cov else None, vcf_filter.id )) if opts.min_cov != None: if not all([s.INFO.has_key('DP') for s in snvs]): LOG.error("At least one SNV was not annotated with depth info (DP)" " (was this file produced with LoFreq?).") sys.exit(1) vcf_filter = vcf._Filter( id="mincov%d" % opts.min_cov, desc="Minimum coverage") vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer filters.append(( lambda s, f_id: f_id if s.INFO['DP'] < opts.min_cov else None, vcf_filter.id )) # structured as opts.snv_qual filtering, but keeps corrected # values. 
if opts.strandbias != None: if opts.strandbias in ['bonf', 'holm-bonf']: if not opts.strandbias_alpha: LOG.fatal("Need alpha/significance threshold for strandbias" " multiple testing correction") sys.exit(1) vcf_filter = vcf._Filter( id="strandbias%s" % opts.strandbias.replace("-", ""), desc="Strand-bias filter (%s corrected < %g)" % ( opts.strandbias, opts.strandbias_alpha)) vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer if opts.strandbias == 'bonf': vcf_info_id = "SBBC" elif opts.strandbias == 'holm-bonf': vcf_info_id = "SBHBC" else: raise ValueError vcf_info = vcf._Info( id=vcf_info_id, num=1, type='Integer', desc="Strand-bias %s corrected" % opts.strandbias) vcf_reader.infos[vcf_info.id] = vcf_info try: pvals = (phredqual_to_prob(s.INFO['SB']) for s in snvs) except (KeyError, AssertionError) as e: LOG.error("At least one SNV was not annotated properly with" " strandbias info (SB)" " (was this file produced with LoFreq?)" " You will need to switch strandbias filtering off") sys.exit(1) if opts.strandbias == 'bonf': corr_pvals = multiple_testing.Bonferroni( pvals).corrected_pvals elif opts.strandbias == 'holm-bonf': corr_pvals = multiple_testing.HolmBonferroni( pvals).corrected_pvals else: raise ValueError for (cp, s) in zip(corr_pvals, snvs): s.INFO[vcf_info.id] = prob_to_phredqual(cp) if s.INFO[vcf_info.id] > MAX_INT: s.INFO[vcf_info.id] = MAX_INT filters.append(( lambda s, f_id: f_id if s.INFO[vcf_info.id] > prob_to_phredqual(opts.strandbias_alpha) else None, vcf_filter.id )) # int elif opts.strandbias != 'off': try: max_strandbias_phred = int(opts.strandbias) assert max_strandbias_phred >= 0 except (ValueError, AssertionError) as e: LOG.fatal("Invalid strandbias argument: %s" % (opts.strandbias)) sys.exit(1) vcf_filter = vcf._Filter( max_strandbias_phred = int( id="sbp%d" % opts.max_strandbias_phred, desc="Phred-based strand-bias filter (max)")) vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer filters.append(( lambda s, f_id: f_id if float(s.INFO['SB']) > opts.max_strandbias_phred else None, vcf_filter.id )) # structured as opts.strandbias filtering, but doesn't keep # corrected values. if opts.snv_qual != None: if opts.snv_qual in ['bonf', 'holm-bonf', 'fdr']: if not opts.snv_qual_alpha: LOG.fatal("Need alpha/significance threshold for snv quality" " multiple testing correction") sys.exit(1) vcf_filter = vcf._Filter( id="snvqual%s" % opts.snv_qual.replace("-", ""), desc="SNV quality filter (%s corrected < %g)" % ( opts.snv_qual, opts.snv_qual_alpha)) vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer vcf_info_id = "SNVQUALPASS" # tmp markup tmp_vcf_markup.append(vcf_info_id) pvals = [] pidx = [] for (i, s) in enumerate(snvs): # if qual is not NA, convert to pvalue, else don't # use filter (set filter to NA) if s.QUAL != '.': pvals.append(phredqual_to_prob(s.QUAL)) pidx.append(i) s.INFO[vcf_info_id] = 0 else: s.INFO[vcf_info_id] = '.' 
if opts.snv_qual == 'bonf': for (i, p) in enumerate( multiple_testing.Bonferroni( pvals, n=opts.snv_qual_numtests).corrected_pvals): if p <= opts.snv_qual_alpha: snvs[pidx[i]].INFO[vcf_info_id] = 1 elif opts.snv_qual == 'holm-bonf': for (i, p) in enumerate( multiple_testing.HolmBonferroni( pvals, n=opts.snv_qual_numtests).corrected_pvals): if p <= opts.snv_qual_alpha: snvs[pidx[i]].INFO[vcf_info_id] = 1 elif opts.snv_qual == 'fdr': for i in fdr.fdr(pvals, a=opts.snv_qual_alpha, n=opts.snv_qual_numtests): snvs[pidx[i]].INFO[vcf_info_id] = 1 else: raise ValueError filters.append(( lambda s, f_id: f_id if s.INFO[vcf_info_id] != '.' and s.INFO[vcf_info_id] == 0 else None, vcf_filter.id )) elif opts.snv_qual != 'off': try: min_qual = int(opts.snv_qual) assert min_qual >= 0 except (ValueError, AssertionError) as e: LOG.fatal("Invalid snv quality argument: %s" % (opts.snv_qual)) sys.exit(1) vcf_filter = vcf._Filter( id="minqual%d" % min_qual, desc="Minimum SNV quality") vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer filters.append(( lambda s, f_id: f_id if s.QUAL != '.' and s.QUAL < min_qual else None, vcf_filter.id )) if opts.window_size != None: vcf_filter = vcf._Filter( id="snvwin%d" % opts.window_size, desc="SNV window filter (SNVs within %d bp distance)" % ( opts.window_size)) vcf_reader.filters[vcf_filter.id] = vcf_filter# reader serves as template for writer vcf_info_id = "SNVWINPASS" # tmp markup tmp_vcf_markup.append(vcf_info_id) snvs_on_cur_chrom = [] last_chrom = None seen_chroms = [] for (i, cur_snv) in enumerate(snvs): # assumes snvs are sorted by chrom if i == 0: last_chrom = cur_snv.CHROM if cur_snv.CHROM != last_chrom: assert cur_snv.CHROM not in seen_chroms, ( "SNV input not ordered by chromosome." " Sure this file was procuced by LoFreq?") win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id) seen_chroms.append(last_chrom) last_chrom = cur_snv.CHROM snvs_on_cur_chrom = [cur_snv] else: snvs_on_cur_chrom.append(cur_snv) # don't forget last chrom win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id) filters.append(( lambda s, f_id: f_id if s.INFO[vcf_info_id] != '.' and s.INFO[vcf_info_id] == 0 else None, vcf_filter.id )) # The actual filtering: if filter function returns 1 the # corresponding snv has to be filtered # # FIXME can't this be done easier with map()? # if len(filters) == 0: LOG.error("No filters activated.") sys.exit(1) #import pdb; pdb.set_trace() for (filter_func, filter_id) in filters: for (i, s) in enumerate(snvs): f = filter_func(s, filter_id) if f: # just s = s.__replace() can't work if s.FILTER == '.' or s.FILTER == 'PASS': snvs[i] = s._replace(FILTER=f) else: snvs[i] = s._replace(FILTER="%s;%s" % (s.FILTER, f)) # should all also work if we get already PASSed input n_passed = 0 for (i, s) in enumerate(snvs): if s.FILTER == '.': snvs[i] = s._replace(FILTER="PASS") n_passed += 1 LOG.info("%d SNVs passed all filters." % n_passed) # remove temporary markup for tmpkey in tmp_vcf_markup: for s in snvs: if s.INFO.has_key(tmpkey): del s.INFO[tmpkey] if opts.pass_only: snvs = (s for s in snvs if s.FILTER == 'PASS') if opts.vcf_out == '-': fh_out = sys.stdout else: if opts.vcf_out[-3:] == '.gz': fh_out = gzip.open(opts.vcf_out, 'w') else: fh_out = open(opts.vcf_out, 'w') vcf_writer = vcf.VCFWriter(fh_out) vcf_writer.meta_from_reader(vcf_reader) vcf_writer.write(snvs) if fh_out != sys.stdout: fh_out.close()
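# ---------------------------------------------------------------------------
# Both main() functions rely on LoFreq's bundled multiple_testing and fdr
# modules, which are not shown in this section. The classes and the function
# below are only a rough sketch of the assumed contract: each correction
# class takes a list of p-values (plus an optional total test count n) and
# exposes corrected_pvals in input order, while fdr.fdr() is assumed to
# return the indices of rejected hypotheses (Benjamini-Hochberg). Names carry
# a Sketch/_sketch suffix so they do not shadow the real modules; the bundled
# implementations may differ in detail.
# ---------------------------------------------------------------------------


class BonferroniSketch(object):
    """Assumed sketch: Bonferroni correction, p_corr = min(p * n, 1)."""

    def __init__(self, pvals, n=None):
        pvals = list(pvals)
        n = n if n else len(pvals)
        self.corrected_pvals = [min(p * n, 1.0) for p in pvals]


class HolmBonferroniSketch(object):
    """Assumed sketch: Holm-Bonferroni step-down correction."""

    def __init__(self, pvals, n=None):
        pvals = list(pvals)
        n = n if n else len(pvals)
        order = sorted(range(len(pvals)), key=lambda i: pvals[i])
        self.corrected_pvals = [None] * len(pvals)
        running_max = 0.0
        for rank, idx in enumerate(order):
            # multiplier shrinks from n down to n - len(pvals) + 1
            corr = min(pvals[idx] * (n - rank), 1.0)
            running_max = max(running_max, corr)  # keep adjusted p-values monotone
            self.corrected_pvals[idx] = running_max


def fdr_bh_sketch(pvals, a=0.05, n=None):
    """Assumed sketch: Benjamini-Hochberg; return indices of rejected tests."""
    pvals = list(pvals)
    n = n if n else len(pvals)
    order = sorted(range(len(pvals)), key=lambda i: pvals[i])
    last_rejected_rank = 0
    for rank, idx in enumerate(order, start=1):
        if pvals[idx] <= rank * a / float(n):
            last_rejected_rank = rank
    return [idx for rank, idx in enumerate(order, start=1)
            if rank <= last_rejected_rank]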
def main(): """The main function """ parser = cmdline_parser() args = parser.parse_args() if args.verbose: LOG.setLevel(logging.INFO) if args.debug: LOG.setLevel(logging.DEBUG) assert os.path.exists(args.bam), ( "BAM file %s does not exist" % args.bam) samfh = pysam.Samfile(args.bam) # setup vcf_reader # if args.vcfin == '-': vcf_reader = vcf.VCFReader(sys.stdin) else: vcf_reader = vcf.VCFReader(filename=args.vcfin) variants = [r for r in vcf_reader] LOG.info("Loaded %d variants" % len(variants)) if args.mtc.lower() != 'None': LOG.info("Will use %s for MTC on %s with alpha %f" % ( args.mtc, args.mtc_tag, args.mtc_alpha)) else: LOG.info("No multiple testing correction will be done") # setup vcf_writer # if args.vcfout == '-': fh_out = sys.stdout else: if os.path.exists(args.vcfout): LOG.fatal("Cowardly refusing to overwrite already existing file %s" % (args.vcfout)) sys.exit(1) if args.vcfout[-3:] == '.gz': fh_out = gzip.open(args.vcfout, 'w') else: fh_out = open(args.vcfout, 'w') # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf clone didn't vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep) #vcf_writer = vcf.VCFWriter(fh_out) #vcf_writer.meta_from_reader(vcf_reader) pvalues = [] for (var_no, var) in enumerate(variants): if var_no%500==1: LOG.info("Computing bias for var %d of %d" % (var_no, len(variants))) if var.INFO.has_key('INDEL'): LOG.warn("Skipping unsupported indel variant %s:%d" % (var.CHROM, var.POS)) continue reads = list(samfh.fetch(reference=var.CHROM, start=var.POS-1, end=var.POS)) LOG.debug("%s %d: %d (unfiltered) reads covering position" % ( var.CHROM, var.POS, len(reads))) ref_mquals = [] alt_mquals = [] ref_bquals = [] alt_bquals = [] # only for PE #ref_isize = [] #alt_isize = [] # following two meant to test #alt_vpos = [] #rlens = [] for r in reads: if skip_read(r): continue orphan = (r.flag & 0x1) and not (r.flag & 0x2) if orphan and not args.use_orphan: continue if r.mapq < args.min_mq: continue vpos_on_read = [vpos_on_read for (vpos_on_read, vpos_on_ref) in r.aligned_pairs if vpos_on_ref==var.POS-1] assert len(vpos_on_read)==1 vpos_on_read = vpos_on_read[0] if vpos_on_read == None:# skip deletions continue #alt_vpos.append(vpos_on_read) #rlens.append(r.rlen) b = r.query[vpos_on_read] bq = ord(r.qqual[vpos_on_read])-33 mq = r.mapq if bq < args.min_bq: continue assert len(var.REF)==1 and len(var.ALT)==1 if b.upper() == var.REF[0].upper(): ref_mquals.append(mq) ref_bquals.append(bq) #if not args.use_orphan: # ref_isize.append(abs(r.tlen)) elif b.upper() == str(var.ALT[0]).upper(): alt_mquals.append(mq) alt_bquals.append(bq) #if not args.use_orphan: # alt_isize.append(abs(r.tlen)) else: LOG.debug("Skipping non-ref-alt base %s at %s:%d" % (b, var.CHROM, var.POS)) continue LOG.debug("After filtering at %s:%d: %d ref mquals and %d alt mquals" % ( var.CHROM, var.POS, len(ref_mquals), len(alt_mquals))) # mannwhitneyu fails if all values the same if len(set(ref_mquals).union(alt_mquals))==1: m_pv = 1.0 elif len(ref_mquals)==0 or len(alt_mquals)==0: m_pv = 1.0 else: # compute only if alternate quals are smaller on average if mean(alt_mquals) < mean(ref_mquals): ustat = mannwhitneyu(ref_mquals, alt_mquals) m_pv = ustat[1] else: m_pv = 1.0 # same for bqs if len(set(ref_bquals).union(alt_bquals))==1: b_pv = 1.0 elif len(ref_bquals)==0 or len(alt_bquals)==0: b_pv = 1.0 else: if mean(alt_bquals) < mean(ref_bquals): ustat = mannwhitneyu(ref_bquals, alt_bquals) b_pv = ustat[1] else: b_pv = 1.0 # same for isize-qs #if len(ref_isize) and 
len(alt_isize): # if len(set(ref_isize).union(alt_isize))==1: # i_pv = 1 # else: # ustat = mannwhitneyu(ref_isize, alt_isize) # i_pv = ustat[1] #else: # i_pv = 1 c_pv = fisher_comb(m_pv, b_pv) #import pdb; pdb.set_trace() LOG.debug("%s %d: mb %f bb %f cb %f" % (var.CHROM, var.POS, m_pv, b_pv, c_pv)) var.INFO['MB'] = prob_to_phredqual(m_pv) var.INFO['BB'] = prob_to_phredqual(b_pv) #var.INFO['IB'] = prob_to_phredqual(i_pv) var.INFO['CB'] = prob_to_phredqual(c_pv) if args.mtc.lower() != 'none': pvalues.append(phredqual_to_prob(int(var.INFO[args.mtc_tag]))) if args.mtc.lower() != 'none': ftag = "%s<%f" % (args.mtc, args.mtc_alpha) rej_idxs = [] if args.mtc == 'bonf': rej_idxs = [i for (i, p) in enumerate(multiple_testing.Bonferroni(pvalues).corrected_pvals) if p<args.mtc_alpha] elif args.mtc == 'holmbonf': rej_idxs = [i for (i, p) in enumerate(multiple_testing.Bonferroni(pvalues).corrected_pvals) if p<args.mtc_alpha] elif args.mtc == 'fdr': rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha) else: raise ValueError("unknown MTC method %s" % args.mtc) for i in rej_idxs: # pyvcf filter is empty if not set. lofreq's vcf clone was . or PASS #if not variants[i].FILTER or variants[i].FILTER in [".", "PASS"]: # new_f = [ftag] #else: # new_f = "%s;%s" % (variants[i].FILTER, ftag) #variants[i] = variants[i]._replace(FILTER=new_f) variants[i].FILTER.append(ftag) LOG.info("%d of %d variants didn't pass filter" % ( len(rej_idxs), len(variants))) # pyvcf doesn't need write_metainfo or write_header #vcf_writer.write_metainfo() #vcf_writer.write_header() for var in variants: filtered = len(var.FILTER)>0 and var.FILTER not in [".", "PASS"] if args.pass_only and filtered: continue # LoFreq's vcf clone called this write_rec() vcf_writer.write_record(var) if fh_out != sys.stdout: fh_out.close()
def main(): """main function """ tmp_vcf_markup = [] parser = cmdline_parser() # WARNING: undocumented arg to remove all defaults (and the reason # why we have to use OptParse) if '--no-defaults' in sys.argv: for (k, v) in parser.defaults.items(): parser.defaults[k] = None sys.argv = [x for x in sys.argv if x != "--no-defaults"] (opts, args) = parser.parse_args() if len(args): parser.error("Unrecognized arguments found: %s." % (' '.join(args))) sys.exit(1) if opts.verbose: LOG.setLevel(logging.INFO) if opts.debug: LOG.setLevel(logging.DEBUG) for (in_file, descr) in [(opts.vcf_in, "VCF")]: if not in_file: parser.error("%s input file argument missing." % descr) sys.exit(1) if not os.path.exists(in_file) and in_file != "-": sys.stderr.write("file '%s' does not exist.\n" % in_file) sys.exit(1) for (out_file, descr) in [(opts.vcf_out, "VCF output file")]: if not out_file: parser.error("%s output file argument missing." % descr) sys.exit(1) if os.path.exists(out_file) and out_file != "-": sys.stderr.write("Cowardly refusing to overwrite existing" " output file '%s'.\n" % out_file) sys.exit(1) if opts.vcf_in == '-': vcf_reader = vcf.VCFReader(sys.stdin) else: if opts.vcf_in[-3:] == '.gz': vcf_reader = vcf.VCFReader(gzip.open(opts.vcf_in, 'r')) else: vcf_reader = vcf.VCFReader(open(opts.vcf_in, 'r')) snvs = [r for r in vcf_reader] LOG.info("Parsed %d SNVs from %s" % (len(snvs), opts.vcf_in)) # list of tuples: first element is a filter func, which takes a # snv and a filter-id as input. second is the filter id. variant # will be marked as filtered if func returns True filters = [] if opts.min_af != None: vcf_filter = vcf._Filter(id=("minaf%f" % opts.min_af).rstrip('0'), desc="Minimum allele frequency") vcf_reader.filters[ vcf_filter.id] = vcf_filter # reader serves as template for writer filters.append( (lambda s, f_id: f_id if s.INFO['AF'] < opts.min_af else None, vcf_filter.id)) if opts.max_cov != None: if not all([s.INFO.has_key('DP') for s in snvs]): LOG.error("At least one SNV was not annotated with depth info (DP)" " (was this file produced with LoFreq?).") sys.exit(1) vcf_filter = vcf._Filter(id="maxcov%d" % opts.max_cov, desc="Maximum coverage") vcf_reader.filters[ vcf_filter.id] = vcf_filter # reader serves as template for writer filters.append( (lambda s, f_id: f_id if s.INFO['DP'] > opts.max_cov else None, vcf_filter.id)) if opts.min_cov != None: if not all([s.INFO.has_key('DP') for s in snvs]): LOG.error("At least one SNV was not annotated with depth info (DP)" " (was this file produced with LoFreq?).") sys.exit(1) vcf_filter = vcf._Filter(id="mincov%d" % opts.min_cov, desc="Minimum coverage") vcf_reader.filters[ vcf_filter.id] = vcf_filter # reader serves as template for writer filters.append( (lambda s, f_id: f_id if s.INFO['DP'] < opts.min_cov else None, vcf_filter.id)) # structured as opts.snv_qual filtering, but keeps corrected # values. if opts.strandbias != None: if opts.strandbias in ['bonf', 'holm-bonf']: if not opts.strandbias_alpha: LOG.fatal("Need alpha/significance threshold for strandbias" " multiple testing correction") sys.exit(1) vcf_filter = vcf._Filter( id="strandbias%s" % opts.strandbias.replace("-", ""), desc="Strand-bias filter (%s corrected < %g)" % (opts.strandbias, opts.strandbias_alpha)) vcf_reader.filters[ vcf_filter. 
id] = vcf_filter # reader serves as template for writer if opts.strandbias == 'bonf': vcf_info_id = "SBBC" elif opts.strandbias == 'holm-bonf': vcf_info_id = "SBHBC" else: raise ValueError vcf_info = vcf._Info(id=vcf_info_id, num=1, type='Integer', desc="Strand-bias %s corrected" % opts.strandbias) vcf_reader.infos[vcf_info.id] = vcf_info try: pvals = (phredqual_to_prob(s.INFO['SB']) for s in snvs) except (KeyError, AssertionError) as e: LOG.error("At least one SNV was not annotated properly with" " strandbias info (SB)" " (was this file produced with LoFreq?)" " You will need to switch strandbias filtering off") sys.exit(1) if opts.strandbias == 'bonf': corr_pvals = multiple_testing.Bonferroni(pvals).corrected_pvals elif opts.strandbias == 'holm-bonf': corr_pvals = multiple_testing.HolmBonferroni( pvals).corrected_pvals else: raise ValueError for (cp, s) in zip(corr_pvals, snvs): s.INFO[vcf_info.id] = prob_to_phredqual(cp) if s.INFO[vcf_info.id] > MAX_INT: s.INFO[vcf_info.id] = MAX_INT filters.append( (lambda s, f_id: f_id if s.INFO[vcf_info.id] > prob_to_phredqual( opts.strandbias_alpha) else None, vcf_filter.id)) # int elif opts.strandbias != 'off': try: max_strandbias_phred = int(opts.strandbias) assert max_strandbias_phred >= 0 except (ValueError, AssertionError) as e: LOG.fatal("Invalid strandbias argument: %s" % (opts.strandbias)) sys.exit(1) vcf_filter = vcf._Filter(max_strandbias_phred=int( id="sbp%d" % opts.max_strandbias_phred, desc="Phred-based strand-bias filter (max)")) vcf_reader.filters[ vcf_filter. id] = vcf_filter # reader serves as template for writer filters.append( (lambda s, f_id: f_id if float(s.INFO['SB']) > opts.max_strandbias_phred else None, vcf_filter.id)) # structured as opts.strandbias filtering, but doesn't keep # corrected values. if opts.snv_qual != None: if opts.snv_qual in ['bonf', 'holm-bonf', 'fdr']: if not opts.snv_qual_alpha: LOG.fatal("Need alpha/significance threshold for snv quality" " multiple testing correction") sys.exit(1) vcf_filter = vcf._Filter( id="snvqual%s" % opts.snv_qual.replace("-", ""), desc="SNV quality filter (%s corrected < %g)" % (opts.snv_qual, opts.snv_qual_alpha)) vcf_reader.filters[ vcf_filter. id] = vcf_filter # reader serves as template for writer vcf_info_id = "SNVQUALPASS" # tmp markup tmp_vcf_markup.append(vcf_info_id) pvals = [] pidx = [] for (i, s) in enumerate(snvs): # if qual is not NA, convert to pvalue, else don't # use filter (set filter to NA) if s.QUAL != '.': pvals.append(phredqual_to_prob(s.QUAL)) pidx.append(i) s.INFO[vcf_info_id] = 0 else: s.INFO[vcf_info_id] = '.' if opts.snv_qual == 'bonf': for (i, p) in enumerate( multiple_testing.Bonferroni( pvals, n=opts.snv_qual_numtests).corrected_pvals): if p <= opts.snv_qual_alpha: snvs[pidx[i]].INFO[vcf_info_id] = 1 elif opts.snv_qual == 'holm-bonf': for (i, p) in enumerate( multiple_testing.HolmBonferroni( pvals, n=opts.snv_qual_numtests).corrected_pvals): if p <= opts.snv_qual_alpha: snvs[pidx[i]].INFO[vcf_info_id] = 1 elif opts.snv_qual == 'fdr': for i in fdr.fdr(pvals, a=opts.snv_qual_alpha, n=opts.snv_qual_numtests): snvs[pidx[i]].INFO[vcf_info_id] = 1 else: raise ValueError filters.append((lambda s, f_id: f_id if s.INFO[vcf_info_id] != '.' 
and s.INFO[ vcf_info_id] == 0 else None, vcf_filter.id)) elif opts.snv_qual != 'off': try: min_qual = int(opts.snv_qual) assert min_qual >= 0 except (ValueError, AssertionError) as e: LOG.fatal("Invalid snv quality argument: %s" % (opts.snv_qual)) sys.exit(1) vcf_filter = vcf._Filter(id="minqual%d" % min_qual, desc="Minimum SNV quality") vcf_reader.filters[ vcf_filter. id] = vcf_filter # reader serves as template for writer filters.append((lambda s, f_id: f_id if s.QUAL != '.' and s.QUAL < min_qual else None, vcf_filter.id)) if opts.window_size != None: vcf_filter = vcf._Filter( id="snvwin%d" % opts.window_size, desc="SNV window filter (SNVs within %d bp distance)" % (opts.window_size)) vcf_reader.filters[ vcf_filter.id] = vcf_filter # reader serves as template for writer vcf_info_id = "SNVWINPASS" # tmp markup tmp_vcf_markup.append(vcf_info_id) snvs_on_cur_chrom = [] last_chrom = None seen_chroms = [] for (i, cur_snv) in enumerate(snvs): # assumes snvs are sorted by chrom if i == 0: last_chrom = cur_snv.CHROM if cur_snv.CHROM != last_chrom: assert cur_snv.CHROM not in seen_chroms, ( "SNV input not ordered by chromosome." " Sure this file was procuced by LoFreq?") win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id) seen_chroms.append(last_chrom) last_chrom = cur_snv.CHROM snvs_on_cur_chrom = [cur_snv] else: snvs_on_cur_chrom.append(cur_snv) # don't forget last chrom win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id) filters.append((lambda s, f_id: f_id if s.INFO[vcf_info_id] != '.' and s.INFO[vcf_info_id] == 0 else None, vcf_filter.id)) # The actual filtering: if filter function returns 1 the # corresponding snv has to be filtered # # FIXME can't this be done easier with map()? # if len(filters) == 0: LOG.error("No filters activated.") sys.exit(1) #import pdb; pdb.set_trace() for (filter_func, filter_id) in filters: for (i, s) in enumerate(snvs): f = filter_func(s, filter_id) if f: # just s = s.__replace() can't work if s.FILTER == '.' or s.FILTER == 'PASS': snvs[i] = s._replace(FILTER=f) else: snvs[i] = s._replace(FILTER="%s;%s" % (s.FILTER, f)) # should all also work if we get already PASSed input n_passed = 0 for (i, s) in enumerate(snvs): if s.FILTER == '.': snvs[i] = s._replace(FILTER="PASS") n_passed += 1 LOG.info("%d SNVs passed all filters." % n_passed) # remove temporary markup for tmpkey in tmp_vcf_markup: for s in snvs: if s.INFO.has_key(tmpkey): del s.INFO[tmpkey] if opts.pass_only: snvs = (s for s in snvs if s.FILTER == 'PASS') if opts.vcf_out == '-': fh_out = sys.stdout else: if opts.vcf_out[-3:] == '.gz': fh_out = gzip.open(opts.vcf_out, 'w') else: fh_out = open(opts.vcf_out, 'w') vcf_writer = vcf.VCFWriter(fh_out) vcf_writer.meta_from_reader(vcf_reader) vcf_writer.write(snvs) if fh_out != sys.stdout: fh_out.close()