def testCounts(counts, snp):
    """Classify the allele read counts observed at a single SNP.

    Parameters:
        counts -- mapping indexed by 'a', 'c', 'g' and 't' giving the number
                  of reads supporting each base.
        snp    -- 3-tuple (snpchr, snppos, snprec); snprec must unpack into
                  eight fields: mat_genotype, pat_genotype, child_genotype,
                  mat_allele, pat_allele, typ, ref, hetSNP.

    Returns a 6-tuple (category, pval, a1, a2, counts, winningParent):
        category      -- WEIRD when the two expected alleles make up less
                         than THRESH1 of all reads; otherwise HOMOZYGOUS when
                         both child alleles are the same; otherwise
                         ASYMMETRIC when pval < THRESH2, else SYMMETRIC.
        pval          -- binom.binomtest(smaller allele count,
                         counts[a1] + counts[a2], 0.5).
        a1, a2        -- the pair returned by convert(child_genotype).
        winningParent -- 'M' or 'P' when both parental alleles are given and
                         one of them equals the most frequent base (compared
                         case-insensitively, maternal checked first),
                         otherwise '?'. Always '?' for the WEIRD result.

    Diagnostics are written to LOGFP for WEIRD and ASYMMETRIC sites.
    """
    snpchr, snppos, snprec = snp
    # The full unpack doubles as a check that snprec has exactly eight fields.
    (mat_genotype, pat_genotype, child_genotype,
     mat_allele, pat_allele, typ, ref, hetSNP) = snprec

    parent = '?'
    bases = ('a', 'c', 'g', 't')

    # First, make sure that the expected alleles are the bulk of the counts.
    total = sum(counts[base] for base in bases)
    first, second = convert(child_genotype)
    if first == second:
        expected = counts[first]
    else:
        expected = counts[first] + counts[second]
    # Unlike `expected`, this counts the allele twice for a homozygous site.
    both = counts[first] + counts[second]

    # Most frequent base; ties are broken by the larger base letter because
    # the (count, base) tuples are ordered as a whole.
    ranked = sorted([(counts[base], base) for base in bases], reverse=True)
    major = ranked[0][1]

    minor_count = min(counts[first], counts[second])
    pval = binom.binomtest(minor_count, both, 0.5)

    # NOTE(review): total == 0 raises ZeroDivisionError here -- confirm that
    # callers never pass a site without any reads.
    if float(expected) / total < THRESH1:
        print >>LOGFP, "WARNING %s:%d failed thresh 1 %d %d" % (snpchr, snppos, expected, total)
        return (WEIRD, pval, first, second, counts, parent)

    # If the SNP was phased, report which parent contributed the major allele.
    if mat_allele and pat_allele:
        major_lc = major.lower()
        if mat_allele.lower() == major_lc:
            parent = 'M'
        elif pat_allele.lower() == major_lc:
            parent = 'P'

    if first == second:
        return (HOMOZYGOUS, pval, first, second, counts, parent)

    # Heterozygous site: we expect roughly 50/50.
    if pval < THRESH2:
        print >>LOGFP, "NOTE %s:%d Looks interesting: failed thresh 2 %d %d %f" % (snpchr, snppos, both, minor_count, pval)
        print >>LOGFP, "SNPS %s/%s, COUNTS a:%d c:%d g:%d t:%d" % (first, second, counts['a'], counts['c'], counts['g'], counts['t'])
        print >>LOGFP, "Phasing P:%s M:%s D:%s" % (pat_allele, mat_allele, snprec)
        print >>LOGFP, "\n"
        return (ASYMMETRIC, pval, first, second, counts, parent)
    return (SYMMETRIC, pval, first, second, counts, parent)
outwrite( line, weaker + ':' + str(mm_basecnts[weaker]) + ';within_thresh') # or adjust counts in the most conservative way: add the mm counts to the weaker allele: # all of them or until balanced with the stonger # to make sure the count imbalance is not caused by the multimapping reads else: adj = min((um_basecnts[weaker] + mm_basecnts[weaker]), max(um_basecnts[hap1_allele], um_basecnts[hap2_allele])) diff = adj - um_basecnts[weaker] um_basecnts[weaker] = adj new_tot = int(um_basecnts[hap1_allele] + um_basecnts[hap2_allele]) new_ratio = float(um_basecnts[ref_allele]) / float(new_tot) new_p_binom = binom.binomtest(um_basecnts[ref_allele], new_tot, 0.5) sys.stdout.write('\t'.join([ chrm, ref_coord, hap1_coord, hap2_coord, ref_allele, hap1_allele, hap2_allele, str(int(um_basecnts['A'])), str(int(um_basecnts['C'])), str(int(um_basecnts['G'])), str(int(um_basecnts['T'])), str(int(um_basecnts['N'])), str(new_ratio), str(new_tot), str(new_p_binom), cnv[:-1], weaker + ':+' + str(int(diff)) ]) + '\n') logwrite(
def __init__(self, n):
    """Precompute ``binom.binomtest(a, cnt, 0.5)`` for all ``0 <= a <= cnt < n``.

    ``n`` is stored as ``self.n`` and the table as ``self.cache``, laid out
    so that ``self.cache[cnt][a]`` holds the result for ``a`` out of ``cnt``.
    """
    self.n = n
    table = []
    for cnt in range(n):
        row = []
        for a in range(cnt + 1):
            row.append(binom.binomtest(a, cnt, 0.5))
        table.append(row)
    self.cache = table
def binomtest(self, a, cnt):
    """Return the binomial test result for ``a`` out of ``cnt`` at p = 0.5.

    Values with ``cnt < self.n`` are read from ``self.cache[cnt][a]``;
    anything else is computed by calling ``binom.binomtest(a, cnt, 0.5)``.
    """
    return self.cache[cnt][a] if cnt < self.n else binom.binomtest(a, cnt, 0.5)
def binomtest(self, a, cnt):
    """Return the binomial test result for ``a`` out of ``cnt`` at p = 0.5.

    Values with ``cnt < self.n`` are read from ``self.cache[cnt][a]``;
    anything else is computed by calling ``binom.binomtest(a, cnt, 0.5)``.
    """
    return self.cache[cnt][a] if cnt < self.n else binom.binomtest(a, cnt, 0.5)
def __init__(self, n):
    """Precompute ``binom.binomtest(a, cnt, 0.5)`` for all ``0 <= a <= cnt < n``.

    ``n`` is stored as ``self.n`` and the table as ``self.cache``, laid out
    so that ``self.cache[cnt][a]`` holds the result for ``a`` out of ``cnt``.
    """
    self.n = n
    table = []
    for cnt in range(n):
        row = []
        for a in range(cnt + 1):
            row.append(binom.binomtest(a, cnt, 0.5))
        table.append(row)
    self.cache = table
+ k + '\n') rmvdhets_file.write( '\t'.join(k.split('_') + ['vcf_vs_mpileup_alleles']) + '\n') continue ref_cnt = basecnts[hetSNV_dict[k]['r_a']] if ref_cnt > tot_cnt: # maybe a miscalled multi-allelic variant? rmvdhets_file.write( k.split('_')[0] + '\t' + k.split('_')[1] + '\tref_allele:' + str(ref_cnt) + '__hap1:' + str(basecnts[hetSNV_dict[k]['hap1_a']]) + '__hap2:' + str(basecnts[hetSNV_dict[k]['hap2_a']]) + '\n') continue pbinom = binom.binomtest(ref_cnt, tot_cnt, 0.5) sys.stdout.write('\t'.join([ k.split('_')[0], k.split('_')[1], hetSNV_dict[k].get('hap1_pos', 'not_lifted?'), hetSNV_dict[k].get('hap2_pos', 'not_lifted?'), hetSNV_dict[k] ['r_a'], basecnts_hap1['hap1_a'], basecnts_hap2['hap2_a'], str(basecnts['A']), str(basecnts['C']), str(basecnts['G']), str(basecnts['T']), str(basecnts['N']), str(float(ref_cnt) / float(tot_cnt)), str(tot_cnt), str(pbinom), basecnts_hap1['warning'], basecnts_hap2['warning'] ]) + '\n')
with open(sys.argv[2], 'r') as in1: for line in in1: if line.split()[-1] in e_dict: e_dict[line.split()[-1]]['snv_count'] += 1 if sys.argv[1] == 'uniq': sys.stdout.write('\t'.join([ '#region', 'hap1_count', 'hap2_count', 'hap1_allele_ratio', 'p_binom', 'snv_count', 'snv_hap1_hap2_coords' ]) + '\n') for e in e_list: hap1_count = len(e_dict[e]['hap1']) hap2_count = len(e_dict[e]['hap2']) pbinom = binom.binomtest(hap1_count, hap1_count + hap2_count, 0.5) hap1_allele_ratio = float(hap1_count) / (float(hap1_count) + float(hap2_count)) sys.stdout.write('\t'.join([ e, str(hap1_count), str(hap2_count), str(hap1_allele_ratio), str(pbinom), str(e_dict[e]['snv_count']), e_dict[e]['snvs'] ]) + '\n') elif sys.argv[1] == 'mmap': sys.stdout.write('\t'.join(['#region', 'hap1_count', 'hap2_count']) + '\n')
region, weaker_hap + ':' + str(weaker_hap_mm_count) + ';within_thresh') # or adjust counts in the most conservative way: add the mm counts to the weaker allele: # all of them or until balanced with the stonger # to make sure the count imbalance is not caused by the multimapping reads else: adj = min((weaker_hap_un_count + weaker_hap_mm_count), max(int(hap1_count), int(hap2_count))) diff = adj - weaker_hap_un_count if weaker_hap == 'hap1': hap1_count = adj else: hap2_count = adj new_tot = int(hap1_count) + int(hap2_count) new_hap1_allele_ratio = float(hap1_count) / float(new_tot) new_p_binom = binom.binomtest(int(hap1_count), new_tot, 0.5) sys.stdout.write('\t'.join([ region, str(hap1_count), str(hap2_count), str(new_hap1_allele_ratio), str(new_p_binom), snv_count, snv_hap1_hap2_coords, weaker_hap + ':+' + str(int(diff)) ]) + '\n') #logwrite(line, mm_counts_dict[region]['hap1_count'], mm_counts_dict[region]['hap2_count'], weaker_hap+':+'+str(int(diff))) logwrite(region, mm_counts_dict[region]['hap1_count'], mm_counts_dict[region]['hap2_count'], weaker_hap + ':+' + str(int(diff)))