示例#1
0
def main():
    """Annotate variants with quality-bias values and write them as VCF.

    Reads variants from ``args.vcfin`` ('-' for stdin). For every non-indel
    variant the mapping and base qualities of the reads in ``args.bam`` that
    show the reference or the alternate base are collected and compared.
    The resulting p-values are stored phred-scaled in the INFO fields MB
    (mapping quality), BB (base quality) and CB (both combined through
    fisher_comb()).

    Unless ``args.mtc`` is 'none', the values stored under ``args.mtc_tag``
    are corrected for multiple testing and every variant whose corrected
    p-value is below ``args.mtc_alpha`` gets a FILTER tag. Records are
    written to ``args.vcfout`` ('-' for stdout); an already existing output
    file is never overwritten. With ``args.pass_only`` filtered records are
    not written.

    Raises:
        ValueError: if ``args.mtc`` names an unknown correction method.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    # Explicit check rather than assert: asserts are stripped under -O.
    if not os.path.exists(args.bam):
        LOG.fatal("BAM file %s does not exist" % args.bam)
        sys.exit(1)

    # Compare case-insensitively once; the original compared the lowered
    # value against 'None', which could never match.
    mtc = args.mtc.lower()
    do_mtc = mtc != 'none'

    samfh = pysam.Samfile(args.bam)
    fh_out = None

    def bias_pvalue(ref_quals, alt_quals):
        """Return the p-value for alt qualities being lower than ref ones.

        1.0 is returned whenever the test is not run: one group is empty,
        all values are identical (mannwhitneyu fails in that case) or the
        alternate qualities are not smaller on average.
        """
        if len(set(ref_quals).union(alt_quals)) == 1:
            return 1.0
        if len(ref_quals) == 0 or len(alt_quals) == 0:
            return 1.0
        if mean(alt_quals) < mean(ref_quals):
            # second element of the result is the p-value
            return mannwhitneyu(ref_quals, alt_quals)[1]
        return 1.0

    def collect_quals(var):
        """Return (ref_mquals, alt_mquals, ref_bquals, alt_bquals) for var.

        Only reads passing skip_read(), the orphan setting and the minimum
        mapping/base quality thresholds contribute.
        """
        ref_mquals = []
        alt_mquals = []
        ref_bquals = []
        alt_bquals = []

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        for r in reads:
            if skip_read(r):
                continue

            # paired (0x1) but not flagged as properly paired (0x2)
            orphan = (r.flag & 0x1) and not (r.flag & 0x2)
            if orphan and not args.use_orphan:
                continue

            if r.mapq < args.min_mq:
                continue

            # var.POS is 1-based, aligned_pairs reference positions 0-based
            vpos_on_read = [
                read_pos for (read_pos, ref_pos) in r.aligned_pairs
                if ref_pos == var.POS - 1
            ]
            assert len(vpos_on_read) == 1
            vpos_on_read = vpos_on_read[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            # qualities are ASCII characters with an offset of 33
            bq = ord(r.qqual[vpos_on_read]) - 33
            mq = r.mapq

            if bq < args.min_bq:
                continue

            assert len(var.REF) == 1 and len(var.ALT) == 1
            if b.upper() == var.REF[0].upper():
                ref_mquals.append(mq)
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_mquals.append(mq)
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" %
                          (b, var.CHROM, var.POS))

        return (ref_mquals, alt_mquals, ref_bquals, alt_bquals)

    try:
        # setup vcf_reader
        #
        if args.vcfin == '-':
            vcf_reader = vcf.VCFReader(sys.stdin)
        else:
            vcf_reader = vcf.VCFReader(filename=args.vcfin)

        variants = [r for r in vcf_reader]
        LOG.info("Loaded %d variants" % len(variants))

        if do_mtc:
            LOG.info("Will use %s for MTC on %s with alpha %f" %
                     (args.mtc, args.mtc_tag, args.mtc_alpha))
        else:
            LOG.info("No multiple testing correction will be done")

        # setup vcf_writer
        #
        if args.vcfout == '-':
            fh_out = sys.stdout
        else:
            if os.path.exists(args.vcfout):
                LOG.fatal(
                    "Cowardly refusing to overwrite already existing file %s" %
                    (args.vcfout))
                sys.exit(1)

            if args.vcfout[-3:] == '.gz':
                fh_out = gzip.open(args.vcfout, 'w')
            else:
                fh_out = open(args.vcfout, 'w')
        # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf
        # clone didn't
        vcf_writer = vcf.VCFWriter(
            fh_out, vcf_reader, lineterminator=os.linesep)

        pvalues = []
        # pvalue_var_idxs[k] is the index into variants that pvalues[k]
        # belongs to. Skipped indels contribute no p-value, so positions in
        # pvalues and variants differ as soon as one indel was seen.
        pvalue_var_idxs = []
        for (var_no, var) in enumerate(variants):
            if var_no % 500 == 1:
                LOG.info("Computing bias for var %d of %d" %
                         (var_no, len(variants)))

            if 'INDEL' in var.INFO:
                LOG.warning("Skipping unsupported indel variant %s:%d" %
                            (var.CHROM, var.POS))
                continue

            (ref_mquals, alt_mquals,
             ref_bquals, alt_bquals) = collect_quals(var)
            LOG.debug(
                "After filtering at %s:%d: %d ref mquals and %d alt mquals" %
                (var.CHROM, var.POS, len(ref_mquals), len(alt_mquals)))

            m_pv = bias_pvalue(ref_mquals, alt_mquals)
            b_pv = bias_pvalue(ref_bquals, alt_bquals)
            c_pv = fisher_comb(m_pv, b_pv)

            LOG.debug("%s %d: mb %f bb %f cb %f" %
                      (var.CHROM, var.POS, m_pv, b_pv, c_pv))

            var.INFO['MB'] = prob_to_phredqual(m_pv)
            var.INFO['BB'] = prob_to_phredqual(b_pv)
            var.INFO['CB'] = prob_to_phredqual(c_pv)

            if do_mtc:
                pvalues.append(
                    phredqual_to_prob(int(var.INFO[args.mtc_tag])))
                pvalue_var_idxs.append(var_no)

        if do_mtc:
            ftag = "%s<%f" % (args.mtc, args.mtc_alpha)
            if mtc == 'bonf':
                corrected = multiple_testing.Bonferroni(
                    pvalues).corrected_pvals
                rej_idxs = [i for (i, p) in enumerate(corrected)
                            if p < args.mtc_alpha]

            elif mtc == 'holmbonf':
                # the original called Bonferroni here as well
                corrected = multiple_testing.HolmBonferroni(
                    pvalues).corrected_pvals
                rej_idxs = [i for (i, p) in enumerate(corrected)
                            if p < args.mtc_alpha]

            elif mtc == 'fdr':
                rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha)

            else:
                raise ValueError("unknown MTC method %s" % args.mtc)

            for i in rej_idxs:
                # translate the p-value index back to the variant index
                rejected = variants[pvalue_var_idxs[i]]
                # NOTE(review): newer PyVCF releases appear to parse a '.'
                # FILTER column as None instead of an empty list - confirm
                if rejected.FILTER is None:
                    rejected.FILTER = []
                rejected.FILTER.append(ftag)

            LOG.info("%d of %d variants didn't pass filter" %
                     (len(rej_idxs), len(variants)))

        # pyvcf doesn't need write_metainfo or write_header
        for var in variants:
            # bool() instead of len() so that a None FILTER counts as unset
            filtered = bool(var.FILTER) and var.FILTER not in [".", "PASS"]
            if args.pass_only and filtered:
                continue
            # LoFreq's vcf clone called this write_rec()
            vcf_writer.write_record(var)
    finally:
        # release both handles on every exit path, including sys.exit()
        samfh.close()
        if fh_out is not None and fh_out is not sys.stdout:
            fh_out.close()
示例#2
0
def main():
    """Mark or remove SNVs in a VCF file according to the requested filters.

    All records are read from ``opts.vcf_in`` ('-' for stdin, '.gz' files
    are opened with gzip). Each option that is set adds one filter: minimum
    allele frequency (AF), maximum/minimum coverage (DP), strand bias (SB;
    either a phred threshold or Bonferroni / Holm-Bonferroni corrected),
    SNV quality (QUAL; either a phred threshold or Bonferroni /
    Holm-Bonferroni / FDR corrected) and an SNV window filter. The ids of
    all filters a record fails are joined with ';' in its FILTER column;
    records with an untouched '.' column become 'PASS'. The result is
    written to ``opts.vcf_out`` ('-' for stdout); with ``opts.pass_only``
    only 'PASS' records are written.

    The undocumented ``--no-defaults`` command line flag resets every parser
    default to None before parsing.

    The process exits with status 1 on invalid arguments, missing input,
    existing output, missing annotation or when no filter is active.
    """

    tmp_vcf_markup = []

    parser = cmdline_parser()

    # WARNING: undocumented arg to remove all defaults (and the reason
    # why we have to use OptParse)
    if '--no-defaults' in sys.argv:
        for k in list(parser.defaults):
            parser.defaults[k] = None
        sys.argv = [x for x in sys.argv if x != "--no-defaults"]

    (opts, args) = parser.parse_args()

    if len(args):
        parser.error("Unrecognized arguments found: %s." % (
            ' '.join(args)))
        sys.exit(1)

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    for (in_file, descr) in [(opts.vcf_in, "VCF")]:
        if not in_file:
            parser.error("%s input file argument missing." % descr)
            sys.exit(1)
        if not os.path.exists(in_file) and in_file != "-":
            sys.stderr.write(
                "file '%s' does not exist.\n" % in_file)
            sys.exit(1)

    for (out_file, descr) in [(opts.vcf_out, "VCF output file")]:
        if not out_file:
            parser.error("%s output file argument missing." % descr)
            sys.exit(1)
        if os.path.exists(out_file) and out_file != "-":
            sys.stderr.write("Cowardly refusing to overwrite existing"
                             " output file '%s'.\n" % out_file)
            sys.exit(1)

    if opts.vcf_in == '-':
        vcf_reader = vcf.VCFReader(sys.stdin)
    else:
        if opts.vcf_in[-3:] == '.gz':
            vcf_reader = vcf.VCFReader(gzip.open(opts.vcf_in, 'r'))
        else:
            vcf_reader = vcf.VCFReader(open(opts.vcf_in, 'r'))
    snvs = [r for r in vcf_reader]
    LOG.info("Parsed %d SNVs from %s" % (len(snvs), opts.vcf_in))

    # list of tuples: first element is a filter func, which takes a
    # snv and a filter-id as input. second is the filter id. variant
    # will be marked as filtered if func returns a true value (the id).
    #
    # Values the funcs depend on are bound as default arguments where the
    # enclosing name is (or could be) rebound further down; a plain closure
    # would only see the last value assigned.
    filters = []

    if opts.min_af is not None:
        vcf_filter = vcf._Filter(
            id=("minaf%f" % opts.min_af).rstrip('0'),
            desc="Minimum allele frequency")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append((
            lambda s, f_id: f_id if s.INFO['AF'] < opts.min_af else None,
            vcf_filter.id
            ))

    if opts.max_cov is not None:
        if not all(['DP' in s.INFO for s in snvs]):
            LOG.error("At least one SNV was not annotated with depth info (DP)"
                      " (was this file produced with LoFreq?).")
            sys.exit(1)

        vcf_filter = vcf._Filter(
            id="maxcov%d" % opts.max_cov,
            desc="Maximum coverage")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append((
            lambda s, f_id: f_id if s.INFO['DP'] > opts.max_cov else None,
            vcf_filter.id
            ))

    if opts.min_cov is not None:
        if not all(['DP' in s.INFO for s in snvs]):
            LOG.error("At least one SNV was not annotated with depth info (DP)"
                      " (was this file produced with LoFreq?).")
            sys.exit(1)

        vcf_filter = vcf._Filter(
            id="mincov%d" % opts.min_cov,
            desc="Minimum coverage")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append((
            lambda s, f_id: f_id if s.INFO['DP'] < opts.min_cov else None,
            vcf_filter.id
            ))

    # structured as opts.snv_qual filtering, but keeps corrected
    # values.
    if opts.strandbias is not None:

        if opts.strandbias in ['bonf', 'holm-bonf']:
            if not opts.strandbias_alpha:
                LOG.fatal("Need alpha/significance threshold for strandbias"
                          " multiple testing correction")
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="strandbias%s" % opts.strandbias.replace("-", ""),
                desc="Strand-bias filter (%s corrected < %g)" % (
                    opts.strandbias, opts.strandbias_alpha))
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            if opts.strandbias == 'bonf':
                vcf_info_id = "SBBC"
            elif opts.strandbias == 'holm-bonf':
                vcf_info_id = "SBHBC"
            else:
                raise ValueError
            vcf_info = vcf._Info(
                id=vcf_info_id, num=1, type='Integer',
                desc="Strand-bias %s corrected" % opts.strandbias)
            vcf_reader.infos[vcf_info.id] = vcf_info

            try:
                # Build the list right here: a generator would be evaluated
                # lazily, i.e. outside this try, and a missing SB annotation
                # would never reach the handler below.
                pvals = [phredqual_to_prob(s.INFO['SB']) for s in snvs]
            except (KeyError, AssertionError):
                LOG.error("At least one SNV was not annotated properly with"
                          " strandbias info (SB)"
                          " (was this file produced with LoFreq?)"
                          " You will need to switch strandbias filtering off")
                sys.exit(1)

            if opts.strandbias == 'bonf':
                corr_pvals = multiple_testing.Bonferroni(
                    pvals).corrected_pvals
            elif opts.strandbias == 'holm-bonf':
                corr_pvals = multiple_testing.HolmBonferroni(
                    pvals).corrected_pvals
            else:
                raise ValueError
            for (cp, s) in zip(corr_pvals, snvs):
                # keep the corrected value, capped at MAX_INT
                s.INFO[vcf_info.id] = prob_to_phredqual(cp)
                if s.INFO[vcf_info.id] > MAX_INT:
                    s.INFO[vcf_info.id] = MAX_INT

            filters.append((
                lambda s, f_id, info_id=vcf_info.id: (
                    f_id if s.INFO[info_id] > prob_to_phredqual(
                        opts.strandbias_alpha) else None),
                vcf_filter.id
                ))

        # int
        elif opts.strandbias != 'off':
            try:
                max_strandbias_phred = int(opts.strandbias)
                assert max_strandbias_phred >= 0
            except (ValueError, AssertionError):
                LOG.fatal("Invalid strandbias argument: %s" % (opts.strandbias))
                sys.exit(1)

            # The original wrapped id/desc in a bogus int(...) call and read
            # a non-existent opts attribute; use the parsed local threshold.
            vcf_filter = vcf._Filter(
                id="sbp%d" % max_strandbias_phred,
                desc="Phred-based strand-bias filter (max)")
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            filters.append((
                lambda s, f_id, max_sb=max_strandbias_phred: (
                    f_id if float(s.INFO['SB']) > max_sb else None),
                vcf_filter.id
                ))

    # structured as opts.strandbias filtering, but doesn't keep
    # corrected values.
    if opts.snv_qual is not None:

        if opts.snv_qual in ['bonf', 'holm-bonf', 'fdr']:
            if not opts.snv_qual_alpha:
                LOG.fatal("Need alpha/significance threshold for snv quality"
                          " multiple testing correction")
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="snvqual%s" % opts.snv_qual.replace("-", ""),
                desc="SNV quality filter (%s corrected < %g)" % (
                    opts.snv_qual, opts.snv_qual_alpha))
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            vcf_info_id = "SNVQUALPASS"  # tmp markup
            tmp_vcf_markup.append(vcf_info_id)

            pvals = []
            pidx = []  # pidx[k] is the index into snvs that pvals[k] belongs to
            for (i, s) in enumerate(snvs):
                # if qual is not NA, convert to pvalue, else don't
                # use filter (set filter to NA)
                if s.QUAL != '.':
                    pvals.append(phredqual_to_prob(s.QUAL))
                    pidx.append(i)
                    s.INFO[vcf_info_id] = 0
                else:
                    s.INFO[vcf_info_id] = '.'

            if opts.snv_qual == 'bonf':
                for (i, p) in enumerate(
                        multiple_testing.Bonferroni(
                            pvals, n=opts.snv_qual_numtests).corrected_pvals):
                    if p <= opts.snv_qual_alpha:
                        snvs[pidx[i]].INFO[vcf_info_id] = 1

            elif opts.snv_qual == 'holm-bonf':
                for (i, p) in enumerate(
                        multiple_testing.HolmBonferroni(
                            pvals, n=opts.snv_qual_numtests).corrected_pvals):
                    if p <= opts.snv_qual_alpha:
                        snvs[pidx[i]].INFO[vcf_info_id] = 1

            elif opts.snv_qual == 'fdr':
                for i in fdr.fdr(pvals, a=opts.snv_qual_alpha,
                                 n=opts.snv_qual_numtests):
                    snvs[pidx[i]].INFO[vcf_info_id] = 1

            else:
                raise ValueError

            # info_id must be bound now: vcf_info_id is reassigned by the
            # window filter below and a closure would then test the wrong key
            filters.append((
                lambda s, f_id, info_id=vcf_info_id: (
                    f_id if s.INFO[info_id] != '.' and s.INFO[info_id] == 0
                    else None),
                vcf_filter.id
                ))

        elif opts.snv_qual != 'off':
            try:
                min_qual = int(opts.snv_qual)
                assert min_qual >= 0
            except (ValueError, AssertionError):
                LOG.fatal("Invalid snv quality argument: %s" % (opts.snv_qual))
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="minqual%d" % min_qual,
                desc="Minimum SNV quality")
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            filters.append((
                lambda s, f_id, min_q=min_qual: (
                    f_id if s.QUAL != '.' and s.QUAL < min_q else None),
                vcf_filter.id
                ))

    if opts.window_size is not None:
        vcf_filter = vcf._Filter(
            id="snvwin%d" % opts.window_size,
            desc="SNV window filter (SNVs within %d bp distance)" % (
                opts.window_size))
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        vcf_info_id = "SNVWINPASS"  # tmp markup
        tmp_vcf_markup.append(vcf_info_id)

        # hand the SNVs to win_filter() one chromosome at a time
        snvs_on_cur_chrom = []
        last_chrom = None
        seen_chroms = []
        for (i, cur_snv) in enumerate(snvs):  # assumes snvs are sorted by chrom
            if i == 0:
                last_chrom = cur_snv.CHROM

            if cur_snv.CHROM != last_chrom:
                assert cur_snv.CHROM not in seen_chroms, (
                    "SNV input not ordered by chromosome."
                    " Sure this file was procuced by LoFreq?")
                win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id)
                seen_chroms.append(last_chrom)
                last_chrom = cur_snv.CHROM
                snvs_on_cur_chrom = [cur_snv]

            else:
                snvs_on_cur_chrom.append(cur_snv)

        # don't forget last chrom
        win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id)

        filters.append((
            lambda s, f_id, info_id=vcf_info_id: (
                f_id if s.INFO[info_id] != '.' and s.INFO[info_id] == 0
                else None),
            vcf_filter.id
            ))

    # The actual filtering: if the filter function returns the filter id
    # the corresponding snv has to be filtered
    #
    if len(filters) == 0:
        LOG.error("No filters activated.")
        sys.exit(1)

    for (filter_func, filter_id) in filters:
        for (i, s) in enumerate(snvs):
            f = filter_func(s, filter_id)
            if f:
                # records are replaced in the list: _replace() returns a
                # new record, so rebinding s alone would have no effect
                if s.FILTER == '.' or s.FILTER == 'PASS':
                    snvs[i] = s._replace(FILTER=f)
                else:
                    snvs[i] = s._replace(FILTER="%s;%s" % (s.FILTER, f))

    # should all also work if we get already PASSed input

    n_passed = 0
    for (i, s) in enumerate(snvs):
        if s.FILTER == '.':
            snvs[i] = s._replace(FILTER="PASS")
            n_passed += 1
    LOG.info("%d SNVs passed all filters." % n_passed)

    # remove temporary markup
    for tmpkey in tmp_vcf_markup:
        for s in snvs:
            if tmpkey in s.INFO:
                del s.INFO[tmpkey]

    if opts.pass_only:
        snvs = (s for s in snvs if s.FILTER == 'PASS')

    if opts.vcf_out == '-':
        fh_out = sys.stdout
    elif opts.vcf_out[-3:] == '.gz':
        fh_out = gzip.open(opts.vcf_out, 'w')
    else:
        fh_out = open(opts.vcf_out, 'w')

    try:
        vcf_writer = vcf.VCFWriter(fh_out)
        vcf_writer.meta_from_reader(vcf_reader)
        vcf_writer.write(snvs)
    finally:
        # close the output even if writing fails; never close stdout
        if fh_out is not sys.stdout:
            fh_out.close()
示例#3
0
def main():
    """Annotate variants with quality-bias values and write them as VCF.

    Reads variants from ``args.vcfin`` ('-' for stdin). For every non-indel
    variant the mapping and base qualities of the reads in ``args.bam`` that
    show the reference or the alternate base are collected and compared.
    The resulting p-values are stored phred-scaled in the INFO fields MB
    (mapping quality), BB (base quality) and CB (both combined through
    fisher_comb()).

    Unless ``args.mtc`` is 'none', the values stored under ``args.mtc_tag``
    are corrected for multiple testing and every variant whose corrected
    p-value is below ``args.mtc_alpha`` gets a FILTER tag. Records are
    written to ``args.vcfout`` ('-' for stdout); an already existing output
    file is never overwritten. With ``args.pass_only`` filtered records are
    not written.

    Raises:
        ValueError: if ``args.mtc`` names an unknown correction method.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    # Explicit check rather than assert: asserts are stripped under -O.
    if not os.path.exists(args.bam):
        LOG.fatal("BAM file %s does not exist" % args.bam)
        sys.exit(1)

    # Compare case-insensitively once; the original compared the lowered
    # value against 'None', which could never match.
    mtc = args.mtc.lower()
    do_mtc = mtc != 'none'

    samfh = pysam.Samfile(args.bam)
    fh_out = None

    def bias_pvalue(ref_quals, alt_quals):
        """Return the p-value for alt qualities being lower than ref ones.

        1.0 is returned whenever the test is not run: one group is empty,
        all values are identical (mannwhitneyu fails in that case) or the
        alternate qualities are not smaller on average.
        """
        if len(set(ref_quals).union(alt_quals)) == 1:
            return 1.0
        if len(ref_quals) == 0 or len(alt_quals) == 0:
            return 1.0
        if mean(alt_quals) < mean(ref_quals):
            # second element of the result is the p-value
            return mannwhitneyu(ref_quals, alt_quals)[1]
        return 1.0

    def collect_quals(var):
        """Return (ref_mquals, alt_mquals, ref_bquals, alt_bquals) for var.

        Only reads passing skip_read(), the orphan setting and the minimum
        mapping/base quality thresholds contribute.
        """
        ref_mquals = []
        alt_mquals = []
        ref_bquals = []
        alt_bquals = []

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        for r in reads:
            if skip_read(r):
                continue

            # paired (0x1) but not flagged as properly paired (0x2)
            orphan = (r.flag & 0x1) and not (r.flag & 0x2)
            if orphan and not args.use_orphan:
                continue

            if r.mapq < args.min_mq:
                continue

            # var.POS is 1-based, aligned_pairs reference positions 0-based
            vpos_on_read = [
                read_pos for (read_pos, ref_pos) in r.aligned_pairs
                if ref_pos == var.POS - 1
            ]
            assert len(vpos_on_read) == 1
            vpos_on_read = vpos_on_read[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            # qualities are ASCII characters with an offset of 33
            bq = ord(r.qqual[vpos_on_read]) - 33
            mq = r.mapq

            if bq < args.min_bq:
                continue

            assert len(var.REF) == 1 and len(var.ALT) == 1
            if b.upper() == var.REF[0].upper():
                ref_mquals.append(mq)
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_mquals.append(mq)
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" % (b, var.CHROM, var.POS))

        return (ref_mquals, alt_mquals, ref_bquals, alt_bquals)

    try:
        # setup vcf_reader
        #
        if args.vcfin == '-':
            vcf_reader = vcf.VCFReader(sys.stdin)
        else:
            vcf_reader = vcf.VCFReader(filename=args.vcfin)

        variants = [r for r in vcf_reader]
        LOG.info("Loaded %d variants" % len(variants))

        if do_mtc:
            LOG.info("Will use %s for MTC on %s with alpha %f" % (
                args.mtc, args.mtc_tag, args.mtc_alpha))
        else:
            LOG.info("No multiple testing correction will be done")

        # setup vcf_writer
        #
        if args.vcfout == '-':
            fh_out = sys.stdout
        else:
            if os.path.exists(args.vcfout):
                LOG.fatal("Cowardly refusing to overwrite already existing file %s" % (args.vcfout))
                sys.exit(1)

            if args.vcfout[-3:] == '.gz':
                fh_out = gzip.open(args.vcfout, 'w')
            else:
                fh_out = open(args.vcfout, 'w')
        # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf
        # clone didn't
        vcf_writer = vcf.VCFWriter(
            fh_out, vcf_reader, lineterminator=os.linesep)

        pvalues = []
        # pvalue_var_idxs[k] is the index into variants that pvalues[k]
        # belongs to. Skipped indels contribute no p-value, so positions in
        # pvalues and variants differ as soon as one indel was seen.
        pvalue_var_idxs = []
        for (var_no, var) in enumerate(variants):
            if var_no % 500 == 1:
                LOG.info("Computing bias for var %d of %d" % (var_no, len(variants)))

            if 'INDEL' in var.INFO:
                LOG.warning("Skipping unsupported indel variant %s:%d" % (var.CHROM, var.POS))
                continue

            (ref_mquals, alt_mquals,
             ref_bquals, alt_bquals) = collect_quals(var)
            LOG.debug("After filtering at %s:%d: %d ref mquals and %d alt mquals" % (
                var.CHROM, var.POS, len(ref_mquals), len(alt_mquals)))

            m_pv = bias_pvalue(ref_mquals, alt_mquals)
            b_pv = bias_pvalue(ref_bquals, alt_bquals)
            c_pv = fisher_comb(m_pv, b_pv)

            LOG.debug("%s %d: mb %f bb %f cb %f" % (var.CHROM, var.POS, m_pv, b_pv, c_pv))

            var.INFO['MB'] = prob_to_phredqual(m_pv)
            var.INFO['BB'] = prob_to_phredqual(b_pv)
            var.INFO['CB'] = prob_to_phredqual(c_pv)

            if do_mtc:
                pvalues.append(
                    phredqual_to_prob(int(var.INFO[args.mtc_tag])))
                pvalue_var_idxs.append(var_no)

        if do_mtc:
            ftag = "%s<%f" % (args.mtc, args.mtc_alpha)
            if mtc == 'bonf':
                corrected = multiple_testing.Bonferroni(
                    pvalues).corrected_pvals
                rej_idxs = [i for (i, p) in enumerate(corrected)
                            if p < args.mtc_alpha]

            elif mtc == 'holmbonf':
                # the original called Bonferroni here as well
                corrected = multiple_testing.HolmBonferroni(
                    pvalues).corrected_pvals
                rej_idxs = [i for (i, p) in enumerate(corrected)
                            if p < args.mtc_alpha]

            elif mtc == 'fdr':
                rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha)

            else:
                raise ValueError("unknown MTC method %s" % args.mtc)

            for i in rej_idxs:
                # translate the p-value index back to the variant index
                rejected = variants[pvalue_var_idxs[i]]
                # NOTE(review): newer PyVCF releases appear to parse a '.'
                # FILTER column as None instead of an empty list - confirm
                if rejected.FILTER is None:
                    rejected.FILTER = []
                rejected.FILTER.append(ftag)

            LOG.info("%d of %d variants didn't pass filter" % (
                len(rej_idxs), len(variants)))

        # pyvcf doesn't need write_metainfo or write_header
        for var in variants:
            # bool() instead of len() so that a None FILTER counts as unset
            filtered = bool(var.FILTER) and var.FILTER not in [".", "PASS"]
            if args.pass_only and filtered:
                continue
            # LoFreq's vcf clone called this write_rec()
            vcf_writer.write_record(var)
    finally:
        # release both handles on every exit path, including sys.exit()
        samfh.close()
        if fh_out is not None and fh_out is not sys.stdout:
            fh_out.close()
示例#4
0
def main():
    """Filter the SNVs of a VCF file and write the marked-up result.

    Reads all records from ``opts.vcf_in`` ('-' means stdin, a '.gz'
    suffix is opened through gzip), sets up one filter per activated
    command-line option (min_af, max_cov, min_cov, strandbias,
    snv_qual, window_size) and appends the id of every triggered
    filter to the record's FILTER field. Records whose FILTER is still
    '.' afterwards are marked PASS. The records (only the PASS ones if
    ``opts.pass_only`` is set) are then written to ``opts.vcf_out``
    ('-' means stdout, a '.gz' suffix is written through gzip).

    Exits through ``parser.error()`` or ``sys.exit(1)`` on unrecognized
    or missing arguments, a non-existing input file, an already
    existing output file, missing DP/SB annotation, invalid
    strandbias/snv_qual values, unordered chromosomes (window filter)
    or if no filter was activated at all.
    """

    # INFO keys only used internally to flag records; they are removed
    # again before writing
    tmp_vcf_markup = []

    parser = cmdline_parser()

    # WARNING: undocumented arg to remove all defaults (and the reason
    # why we have to use OptParse)
    if '--no-defaults' in sys.argv:
        for k in list(parser.defaults):
            parser.defaults[k] = None
        sys.argv = [x for x in sys.argv if x != "--no-defaults"]

    (opts, args) = parser.parse_args()

    # NOTE: parser.error() prints the message and exits by itself, so
    # no explicit sys.exit() is needed after it
    if args:
        parser.error("Unrecognized arguments found: %s." % (' '.join(args)))

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if not opts.vcf_in:
        parser.error("VCF input file argument missing.")
    if not os.path.exists(opts.vcf_in) and opts.vcf_in != "-":
        sys.stderr.write("file '%s' does not exist.\n" % opts.vcf_in)
        sys.exit(1)

    if not opts.vcf_out:
        parser.error("VCF output file argument missing.")
    if os.path.exists(opts.vcf_out) and opts.vcf_out != "-":
        sys.stderr.write("Cowardly refusing to overwrite existing"
                         " output file '%s'.\n" % opts.vcf_out)
        sys.exit(1)

    if opts.vcf_in == '-':
        vcf_reader = vcf.VCFReader(sys.stdin)
    elif opts.vcf_in[-3:] == '.gz':
        vcf_reader = vcf.VCFReader(gzip.open(opts.vcf_in, 'r'))
    else:
        vcf_reader = vcf.VCFReader(open(opts.vcf_in, 'r'))
    snvs = list(vcf_reader)
    LOG.info("Parsed %d SNVs from %s" % (len(snvs), opts.vcf_in))

    # list of tuples: first element is a filter func, which takes a
    # snv and a filter-id as input. second is the filter id. the func
    # returns the filter id if the variant is to be marked as
    # filtered and None otherwise.
    #
    # Values a filter func depends on are bound as lambda default
    # arguments wherever they come from local variables: the funcs are
    # only called at the very end and a plain closure would see
    # whatever value the variable has by then.
    filters = []

    if opts.min_af is not None:
        vcf_filter = vcf._Filter(id=("minaf%f" % opts.min_af).rstrip('0'),
                                 desc="Minimum allele frequency")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append(
            (lambda s, f_id: f_id
             if s.INFO['AF'] < opts.min_af else None, vcf_filter.id))

    if opts.max_cov is not None:
        if not all('DP' in s.INFO for s in snvs):
            LOG.error("At least one SNV was not annotated with depth info (DP)"
                      " (was this file produced with LoFreq?).")
            sys.exit(1)

        vcf_filter = vcf._Filter(id="maxcov%d" % opts.max_cov,
                                 desc="Maximum coverage")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append(
            (lambda s, f_id: f_id
             if s.INFO['DP'] > opts.max_cov else None, vcf_filter.id))

    if opts.min_cov is not None:
        if not all('DP' in s.INFO for s in snvs):
            LOG.error("At least one SNV was not annotated with depth info (DP)"
                      " (was this file produced with LoFreq?).")
            sys.exit(1)

        vcf_filter = vcf._Filter(id="mincov%d" % opts.min_cov,
                                 desc="Minimum coverage")
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        filters.append(
            (lambda s, f_id: f_id
             if s.INFO['DP'] < opts.min_cov else None, vcf_filter.id))

    # structured as opts.snv_qual filtering, but keeps corrected
    # values.
    if opts.strandbias is not None:

        if opts.strandbias in ['bonf', 'holm-bonf']:
            if not opts.strandbias_alpha:
                LOG.fatal("Need alpha/significance threshold for strandbias"
                          " multiple testing correction")
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="strandbias%s" % opts.strandbias.replace("-", ""),
                desc="Strand-bias filter (%s corrected < %g)" %
                (opts.strandbias, opts.strandbias_alpha))
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            if opts.strandbias == 'bonf':
                vcf_info_id = "SBBC"
                mtc_class = multiple_testing.Bonferroni
            else:
                vcf_info_id = "SBHBC"
                mtc_class = multiple_testing.HolmBonferroni
            vcf_info = vcf._Info(id=vcf_info_id,
                                 num=1,
                                 type='Integer',
                                 desc="Strand-bias %s corrected" %
                                 opts.strandbias)
            vcf_reader.infos[vcf_info.id] = vcf_info

            # has to be a list, not a generator: otherwise nothing is
            # evaluated inside the try block and the missing/invalid
            # SB annotation would go uncaught
            try:
                pvals = [phredqual_to_prob(s.INFO['SB']) for s in snvs]
            except (KeyError, AssertionError):
                LOG.error("At least one SNV was not annotated properly with"
                          " strandbias info (SB)"
                          " (was this file produced with LoFreq?)"
                          " You will need to switch strandbias filtering off")
                sys.exit(1)

            corr_pvals = mtc_class(pvals).corrected_pvals
            for (cp, s) in zip(corr_pvals, snvs):
                # corrected value is kept in the output, capped at MAX_INT
                s.INFO[vcf_info.id] = min(prob_to_phredqual(cp), MAX_INT)

            # alpha on phred scale; computed once instead of per SNV
            sb_threshold = prob_to_phredqual(opts.strandbias_alpha)
            filters.append(
                (lambda s, f_id, key=vcf_info.id, threshold=sb_threshold:
                 f_id if s.INFO[key] > threshold else None,
                 vcf_filter.id))

        # int
        elif opts.strandbias != 'off':
            try:
                max_strandbias_phred = int(opts.strandbias)
            except ValueError:
                max_strandbias_phred = -1
            if max_strandbias_phred < 0:
                LOG.fatal("Invalid strandbias argument: %s" %
                          (opts.strandbias))
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="sbp%d" % max_strandbias_phred,
                desc="Phred-based strand-bias filter (max)")
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            filters.append(
                (lambda s, f_id, max_sb=max_strandbias_phred: f_id
                 if float(s.INFO['SB']) > max_sb else None,
                 vcf_filter.id))

    # structured as opts.strandbias filtering, but doesn't keep
    # corrected values.
    if opts.snv_qual is not None:

        if opts.snv_qual in ['bonf', 'holm-bonf', 'fdr']:
            if not opts.snv_qual_alpha:
                LOG.fatal("Need alpha/significance threshold for snv quality"
                          " multiple testing correction")
                sys.exit(1)

            vcf_filter = vcf._Filter(
                id="snvqual%s" % opts.snv_qual.replace("-", ""),
                desc="SNV quality filter (%s corrected < %g)" %
                (opts.snv_qual, opts.snv_qual_alpha))
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            vcf_info_id = "SNVQUALPASS"  # tmp markup
            tmp_vcf_markup.append(vcf_info_id)

            pvals = []
            pidx = []  # pidx[k] is the index in snvs of pvals[k]
            for (i, s) in enumerate(snvs):
                # if qual is not NA, convert to pvalue, else don't
                # use filter (set filter to NA)
                if s.QUAL != '.':
                    pvals.append(phredqual_to_prob(s.QUAL))
                    pidx.append(i)
                    s.INFO[vcf_info_id] = 0
                else:
                    s.INFO[vcf_info_id] = '.'

            # indices (into pvals) of the SNVs that pass
            if opts.snv_qual == 'fdr':
                passed = fdr.fdr(pvals,
                                 a=opts.snv_qual_alpha,
                                 n=opts.snv_qual_numtests)
            else:
                if opts.snv_qual == 'bonf':
                    mtc_class = multiple_testing.Bonferroni
                else:
                    mtc_class = multiple_testing.HolmBonferroni
                corr_pvals = mtc_class(
                    pvals, n=opts.snv_qual_numtests).corrected_pvals
                passed = [i for (i, p) in enumerate(corr_pvals)
                          if p <= opts.snv_qual_alpha]
            for i in passed:
                snvs[pidx[i]].INFO[vcf_info_id] = 1

            # key is bound here since vcf_info_id gets reused by the
            # window filter below
            filters.append(
                (lambda s, f_id, key=vcf_info_id: f_id
                 if s.INFO[key] != '.' and s.INFO[key] == 0 else None,
                 vcf_filter.id))

        elif opts.snv_qual != 'off':
            try:
                min_qual = int(opts.snv_qual)
            except ValueError:
                min_qual = -1
            if min_qual < 0:
                LOG.fatal("Invalid snv quality argument: %s" % (opts.snv_qual))
                sys.exit(1)

            vcf_filter = vcf._Filter(id="minqual%d" % min_qual,
                                     desc="Minimum SNV quality")
            # reader serves as template for writer
            vcf_reader.filters[vcf_filter.id] = vcf_filter

            filters.append(
                (lambda s, f_id, min_qual=min_qual: f_id
                 if s.QUAL != '.' and s.QUAL < min_qual else None,
                 vcf_filter.id))

    if opts.window_size is not None:
        vcf_filter = vcf._Filter(
            id="snvwin%d" % opts.window_size,
            desc="SNV window filter (SNVs within %d bp distance)" %
            (opts.window_size))
        # reader serves as template for writer
        vcf_reader.filters[vcf_filter.id] = vcf_filter

        vcf_info_id = "SNVWINPASS"  # tmp markup
        tmp_vcf_markup.append(vcf_info_id)

        # win_filter() is run once per chromosome; assumes snvs are
        # sorted by chrom
        snvs_on_cur_chrom = []
        last_chrom = None
        seen_chroms = set()
        for (i, cur_snv) in enumerate(snvs):
            if i == 0:
                last_chrom = cur_snv.CHROM

            if cur_snv.CHROM != last_chrom:
                if cur_snv.CHROM in seen_chroms:
                    LOG.fatal("SNV input not ordered by chromosome."
                              " Sure this file was procuced by LoFreq?")
                    sys.exit(1)
                win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id)
                seen_chroms.add(last_chrom)
                last_chrom = cur_snv.CHROM
                snvs_on_cur_chrom = [cur_snv]

            else:
                snvs_on_cur_chrom.append(cur_snv)

        # don't forget last chrom
        win_filter(snvs_on_cur_chrom, opts.window_size, vcf_info_id)

        filters.append(
            (lambda s, f_id, key=vcf_info_id: f_id
             if s.INFO[key] != '.' and s.INFO[key] == 0 else None,
             vcf_filter.id))

    # The actual filtering: if a filter function returns the filter id
    # (i.e. not None) the corresponding snv has to be filtered
    #
    # FIXME can't this be done easier with map()?
    #
    if not filters:
        LOG.error("No filters activated.")
        sys.exit(1)

    for (filter_func, filter_id) in filters:
        for (i, s) in enumerate(snvs):
            f = filter_func(s, filter_id)
            if f:
                # just s = s._replace() can't work: the list entry
                # itself has to be replaced
                if s.FILTER == '.' or s.FILTER == 'PASS':
                    snvs[i] = s._replace(FILTER=f)
                else:
                    snvs[i] = s._replace(FILTER="%s;%s" % (s.FILTER, f))

    # should all also work if we get already PASSed input

    n_passed = 0
    for (i, s) in enumerate(snvs):
        if s.FILTER == '.':
            snvs[i] = s._replace(FILTER="PASS")
            n_passed += 1
    LOG.info("%d SNVs passed all filters." % n_passed)

    # remove temporary markup
    for tmpkey in tmp_vcf_markup:
        for s in snvs:
            if tmpkey in s.INFO:
                del s.INFO[tmpkey]

    if opts.pass_only:
        snvs = (s for s in snvs if s.FILTER == 'PASS')

    if opts.vcf_out == '-':
        fh_out = sys.stdout
    elif opts.vcf_out[-3:] == '.gz':
        fh_out = gzip.open(opts.vcf_out, 'w')
    else:
        fh_out = open(opts.vcf_out, 'w')

    # make sure the output file gets closed even if writing fails
    try:
        vcf_writer = vcf.VCFWriter(fh_out)
        vcf_writer.meta_from_reader(vcf_reader)
        vcf_writer.write(snvs)
    finally:
        if fh_out is not sys.stdout:
            fh_out.close()