Exemplo n.º 1
0
def testCounts(counts, snp):
    """Classify the base counts observed at one SNP.

    Note: logging uses the Python 2 ``print >>file`` form, as in the rest of
    this function's original code.

    Args:
        counts: mapping of per-base counts, read under the lowercase keys
            'a', 'c', 'g' and 't'.
        snp: triple ``(snpchr, snppos, snprec)``; ``snprec`` must unpack into
            eight fields (mat_genotype, pat_genotype, child_genotype,
            mat_allele, pat_allele, typ, ref, hetSNP).

    Returns:
        Tuple ``(status, pval, a1, a2, counts, winningParent)``:
          * status -- WEIRD when the genotype alleles make up less than
            THRESH1 of all counts (or there are no counts at all),
            HOMOZYGOUS when both genotype alleles are equal, ASYMMETRIC when
            pval < THRESH2, SYMMETRIC otherwise.
          * pval -- result of ``binom.binomtest(smaller, both, 0.5)``.
          * a1, a2 -- the two alleles returned by ``convert(child_genotype)``
            (presumably lowercase base keys, since they index ``counts``).
          * winningParent -- 'M' or 'P' when the SNP is phased and the
            maternal/paternal allele is the most frequent base, else '?'.
    """
    winningParent = '?'
    snpchr, snppos, snprec = snp
    # Only three of the eight record fields are used below; the others are
    # still unpacked so that a malformed record fails here, as before.
    (_mat_genotype, _pat_genotype, child_genotype,
     mat_allele, pat_allele, _typ, _ref, _hetSNP) = snprec

    # first, make sure that the expected alleles are the bulk of the counts
    total = counts['a'] + counts['c'] + counts['g'] + counts['t']
    a1, a2 = convert(child_genotype)

    both = counts[a1] + counts[a2]
    # For a homozygous genotype a1 == a2, so count that allele only once.
    allelecnts = counts[a1] if a1 == a2 else both

    # Most frequent base; ties are resolved towards the alphabetically later
    # base, exactly as the previous descending sort of (count, base) did.
    majorAllele = max((counts[base], base) for base in 'acgt')[1]

    smaller = min(counts[a1], counts[a2])
    # binom.binomtest replaced an earlier binomialDist.cdf-based computation
    # that had problems for large sample sizes.
    pval = binom.binomtest(smaller, both, 0.5)

    # A position with no counts at all cannot satisfy the threshold; treat it
    # as a threshold-1 failure instead of dividing by zero.
    if total == 0 or float(allelecnts) / total < THRESH1:
        print >>LOGFP,  "WARNING %s:%d failed thresh 1 %d %d" % (snpchr, snppos, allelecnts, total)
        return (WEIRD, pval, a1, a2, counts, winningParent)

    # if the snp was phased
    if mat_allele and pat_allele:
        # majorAllele is already lowercase (it comes from the 'acgt' keys).
        if mat_allele.lower() == majorAllele:
            winningParent = 'M'
        elif pat_allele.lower() == majorAllele:
            winningParent = 'P'

    if a1 == a2:
        return (HOMOZYGOUS, pval, a1, a2, counts, winningParent)

    # Heterozygous: we expect roughly 50/50.
    if pval < THRESH2:
        print >>LOGFP,  "NOTE %s:%d Looks interesting: failed thresh 2 %d %d %f" % (snpchr, snppos, both, smaller, pval)
        print >>LOGFP,  "SNPS %s/%s, COUNTS a:%d c:%d g:%d t:%d" % (a1, a2, counts['a'], counts['c'], counts['g'], counts['t'])
        print >>LOGFP,  "Phasing P:%s M:%s D:%s" % (pat_allele, mat_allele, snprec)
        print >>LOGFP,  "\n"
        return (ASYMMETRIC, pval, a1, a2, counts, winningParent)

    return (SYMMETRIC, pval, a1, a2, counts, winningParent)
Exemplo n.º 2
0
                outwrite(
                    line,
                    weaker + ':' + str(mm_basecnts[weaker]) + ';within_thresh')

        # Otherwise, adjust the counts in the most conservative way: add the mm counts
        # to the weaker allele -- all of them, or only as many as needed to balance it
        # with the stronger one -- to make sure the count imbalance is not caused by
        # the multimapping reads.
        else:
            adj = min((um_basecnts[weaker] + mm_basecnts[weaker]),
                      max(um_basecnts[hap1_allele], um_basecnts[hap2_allele]))
            diff = adj - um_basecnts[weaker]
            um_basecnts[weaker] = adj

            new_tot = int(um_basecnts[hap1_allele] + um_basecnts[hap2_allele])
            new_ratio = float(um_basecnts[ref_allele]) / float(new_tot)
            new_p_binom = binom.binomtest(um_basecnts[ref_allele], new_tot,
                                          0.5)

            sys.stdout.write('\t'.join([
                chrm, ref_coord, hap1_coord, hap2_coord, ref_allele,
                hap1_allele, hap2_allele,
                str(int(um_basecnts['A'])),
                str(int(um_basecnts['C'])),
                str(int(um_basecnts['G'])),
                str(int(um_basecnts['T'])),
                str(int(um_basecnts['N'])),
                str(new_ratio),
                str(new_tot),
                str(new_p_binom), cnv[:-1], weaker + ':+' + str(int(diff))
            ]) + '\n')

            logwrite(
 def __init__(self, n):
     """Precompute binom.binomtest(j, i, 0.5) for every i < n and 0 <= j <= i.

     After construction, self.cache[i][j] holds the result for j out of i,
     and self.n records the (exclusive) upper bound on cached totals.
     """
     self.n = n
     table = []
     for total in range(n):
         row = []
         for hits in range(total + 1):
             row.append(binom.binomtest(hits, total, 0.5))
         table.append(row)
     self.cache = table
 def binomtest(self, a, cnt):
     """Return the binom.binomtest(a, cnt, 0.5) result, using the cache if possible.

     Totals below self.n are answered from the precomputed table; anything
     else is delegated to binom.binomtest directly.
     """
     if not (cnt < self.n):
         # Outside the precomputed range: compute on demand.
         return binom.binomtest(a, cnt, 0.5)
     row = self.cache[cnt]
     return row[a]
Exemplo n.º 5
0
 def binomtest(self, a, cnt):
     """Look up (or compute) the binom.binomtest value for a out of cnt at p=0.5.

     The precomputed table only covers totals smaller than self.n; larger
     totals fall through to a direct binom.binomtest call.
     """
     if not (cnt < self.n):
         return binom.binomtest(a, cnt, 0.5)
     cached_row = self.cache[cnt]
     return cached_row[a]
Exemplo n.º 6
0
 def __init__(self, n):
     """Build a lookup table of binom.binomtest results for totals below n.

     self.cache[i][j] is binom.binomtest(j, i, 0.5) for 0 <= j <= i < n;
     self.n stores n so lookups can tell whether a total is covered.
     """
     self.n = n
     rows = []
     for total in range(n):
         current = []
         for hits in range(total + 1):
             current.append(binom.binomtest(hits, total, 0.5))
         rows.append(current)
     self.cache = rows
Exemplo n.º 7
0
                    + k + '\n')
                rmvdhets_file.write(
                    '\t'.join(k.split('_') + ['vcf_vs_mpileup_alleles']) +
                    '\n')
                continue

            ref_cnt = basecnts[hetSNV_dict[k]['r_a']]
            if ref_cnt > tot_cnt:  # maybe a miscalled multi-allelic variant?
                rmvdhets_file.write(
                    k.split('_')[0] + '\t' + k.split('_')[1] +
                    '\tref_allele:' + str(ref_cnt) + '__hap1:' +
                    str(basecnts[hetSNV_dict[k]['hap1_a']]) + '__hap2:' +
                    str(basecnts[hetSNV_dict[k]['hap2_a']]) + '\n')
                continue

            pbinom = binom.binomtest(ref_cnt, tot_cnt, 0.5)
            sys.stdout.write('\t'.join([
                k.split('_')[0],
                k.split('_')[1], hetSNV_dict[k].get('hap1_pos', 'not_lifted?'),
                hetSNV_dict[k].get('hap2_pos', 'not_lifted?'), hetSNV_dict[k]
                ['r_a'], basecnts_hap1['hap1_a'], basecnts_hap2['hap2_a'],
                str(basecnts['A']),
                str(basecnts['C']),
                str(basecnts['G']),
                str(basecnts['T']),
                str(basecnts['N']),
                str(float(ref_cnt) / float(tot_cnt)),
                str(tot_cnt),
                str(pbinom), basecnts_hap1['warning'], basecnts_hap2['warning']
            ]) + '\n')
Exemplo n.º 8
0
# Count SNVs per region: the last whitespace-separated field of each line in
# the file given as the second command-line argument is looked up in e_dict.
with open(sys.argv[2], 'r') as in1:
    for line in in1:
        fields = line.split()
        # A blank line yields no fields; skip it instead of raising IndexError.
        if fields and fields[-1] in e_dict:
            e_dict[fields[-1]]['snv_count'] += 1

if sys.argv[1] == 'uniq':
    # Header of the tab-separated report written to stdout.
    sys.stdout.write('\t'.join([
        '#region', 'hap1_count', 'hap2_count', 'hap1_allele_ratio', 'p_binom',
        'snv_count', 'snv_hap1_hap2_coords'
    ]) + '\n')

    for e in e_list:
        hap1_count = len(e_dict[e]['hap1'])
        hap2_count = len(e_dict[e]['hap2'])

        pbinom = binom.binomtest(hap1_count, hap1_count + hap2_count, 0.5)

        # NOTE(review): this raises ZeroDivisionError when both the 'hap1' and
        # 'hap2' collections of a region are empty -- confirm that upstream
        # code never produces such a region.
        hap1_allele_ratio = float(hap1_count) / (float(hap1_count) +
                                                 float(hap2_count))

        sys.stdout.write('\t'.join([
            e,
            str(hap1_count),
            str(hap2_count),
            str(hap1_allele_ratio),
            str(pbinom),
            str(e_dict[e]['snv_count']), e_dict[e]['snvs']
        ]) + '\n')

elif sys.argv[1] == 'mmap':
    # Header of the shorter three-column report for the 'mmap' mode.
    sys.stdout.write('\t'.join(['#region', 'hap1_count', 'hap2_count']) + '\n')
                    region, weaker_hap + ':' + str(weaker_hap_mm_count) +
                    ';within_thresh')

        # Otherwise, adjust the counts in the most conservative way: add the mm counts
        # to the weaker haplotype -- all of them, or only as many as needed to balance
        # it with the stronger one -- to make sure the count imbalance is not caused by
        # the multimapping reads.
        else:
            adj = min((weaker_hap_un_count + weaker_hap_mm_count),
                      max(int(hap1_count), int(hap2_count)))
            diff = adj - weaker_hap_un_count
            if weaker_hap == 'hap1': hap1_count = adj
            else: hap2_count = adj

            new_tot = int(hap1_count) + int(hap2_count)
            new_hap1_allele_ratio = float(hap1_count) / float(new_tot)
            new_p_binom = binom.binomtest(int(hap1_count), new_tot, 0.5)

            sys.stdout.write('\t'.join([
                region,
                str(hap1_count),
                str(hap2_count),
                str(new_hap1_allele_ratio),
                str(new_p_binom), snv_count, snv_hap1_hap2_coords, weaker_hap +
                ':+' + str(int(diff))
            ]) + '\n')

            #logwrite(line, mm_counts_dict[region]['hap1_count'], mm_counts_dict[region]['hap2_count'], weaker_hap+':+'+str(int(diff)))
            logwrite(region, mm_counts_dict[region]['hap1_count'],
                     mm_counts_dict[region]['hap2_count'],
                     weaker_hap + ':+' + str(int(diff)))