def estimate_frequency(motif, k, samples=100000, thresh=0.7):
    """Estimate how often `motif` matches random sequences of length `k`.

    Each round builds `samples` strings of length `k` from a shuffled pool
    holding equal numbers of A, C, G and T, wraps them in a ProbeSet and adds
    the result of ``count_matching_probes(motif, thresh=thresh)`` to a running
    tally.  The estimate is the cumulative count divided by the cumulative
    number of sequences drawn.

    Sampling runs for at most 40 rounds and stops earlier when either

    * the estimate changed by less than 1e-4 (relative to the previous
      round's estimate), or
    * at least four rounds have run and more than 100 matches were counted.

    Args:
        motif: Object handed unchanged to ``ProbeSet.count_matching_probes``.
        k: Length of each random sequence.
        samples: Number of random sequences generated per round.
        thresh: Threshold forwarded to ``count_matching_probes``
            (presumably a fraction of the motif's maximum score -- confirm
            against ProbeSet).

    Returns:
        The estimated match frequency as a float.

    Raises:
        ZeroDivisionError: If `samples` is 0.
    """
    # Sentinel far away from any real frequency so the first round can never
    # look converged.
    estimate = -30
    total = 0.0
    totalcount = 0.0

    # Round up so the pool always holds at least samples*k letters; rounding
    # down would leave the final sequences shorter than k whenever samples*k
    # is not a multiple of four.
    copies = int(math.ceil(float(samples) * k / 4))

    for round_idx in range(40):
        # Build sequences
        pool = list('ACGT' * copies)
        for _ in range(3):
            random.shuffle(pool)
        shuffled = ''.join(pool)

        # A separate index name keeps round_idx intact for the stopping rule
        # below; reusing one name made the "enough rounds" test always true.
        seqD = {}
        for seq_idx in range(samples):
            offset = k * seq_idx
            seqD[seq_idx] = shuffled[offset:offset + k]

        P = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif, thresh=thresh)

        total += float(samples)
        totalcount += float(count)
        f = totalcount / total

        # Divide by the magnitude of the previous estimate.  The sentinel is
        # negative, and a signed denominator turned the first relative change
        # negative, which satisfied the convergence test after one round.
        rel_change = math.fabs(f - estimate) / (math.fabs(estimate) + 0.00000001)
        estimate = f

        if rel_change < 1e-4:
            break
        if round_idx > 2 and totalcount > 100:
            break

    return estimate
def estimate_frequency(motif, k, samples=100000, thresh=0.7):
    """Estimate how often `motif` matches random sequences of length `k`.

    Each round builds `samples` strings of length `k` from a shuffled pool
    holding equal numbers of A, C, G and T, wraps them in a ProbeSet and adds
    the result of ``count_matching_probes(motif, thresh=thresh)`` to a running
    tally.  The estimate is the cumulative count divided by the cumulative
    number of sequences drawn.

    Sampling runs for at most 40 rounds and stops earlier when either

    * the estimate changed by less than 1e-4 (relative to the previous
      round's estimate), or
    * at least four rounds have run and more than 100 matches were counted.

    Args:
        motif: Object handed unchanged to ``ProbeSet.count_matching_probes``.
        k: Length of each random sequence.
        samples: Number of random sequences generated per round.
        thresh: Threshold forwarded to ``count_matching_probes``
            (presumably a fraction of the motif's maximum score -- confirm
            against ProbeSet).

    Returns:
        The estimated match frequency as a float.

    Raises:
        ZeroDivisionError: If `samples` is 0.
    """
    # Sentinel far away from any real frequency so the first round can never
    # look converged.
    estimate = -30
    total = 0.0
    totalcount = 0.0

    # Round up so the pool always holds at least samples*k letters; rounding
    # down would leave the final sequences shorter than k whenever samples*k
    # is not a multiple of four.
    copies = int(math.ceil(float(samples) * k / 4))

    for round_idx in range(40):
        # Build sequences
        pool = list('ACGT' * copies)
        for _ in range(3):
            random.shuffle(pool)
        shuffled = ''.join(pool)

        # A separate index name keeps round_idx intact for the stopping rule
        # below; reusing one name made the "enough rounds" test always true.
        seqD = {}
        for seq_idx in range(samples):
            offset = k * seq_idx
            seqD[seq_idx] = shuffled[offset:offset + k]

        P = ProbeSet(genome=seqD)
        count = P.count_matching_probes(motif, thresh=thresh)

        total += float(samples)
        totalcount += float(count)
        f = totalcount / total

        # Divide by the magnitude of the previous estimate.  The sentinel is
        # negative, and a signed denominator turned the first relative change
        # negative, which satisfied the convergence test after one round.
        rel_change = math.fabs(f - estimate) / (math.fabs(estimate) + 0.00000001)
        estimate = f

        if rel_change < 1e-4:
            break
        if round_idx > 2 and totalcount > 100:
            break

    return estimate
def probOvlpBinomial(A, B, thresh=0.7, verbose=None):
    """Binomial tail probability for the overlap between motifs A and B.

    The motif with the larger ``width`` (A on ties) is treated as the wide
    one.  Its high-scoring sequences, taken from ``bestseqs`` on a re-sliced
    copy together with their reverse complements, form a ProbeSet; the number
    of those sequences matched by the narrow motif is then compared with the
    narrow motif's background frequency via ``Arith.binomialsumtail``.

    Two results are memoised as attributes on the motif objects:
    ``bestWide`` (the sequence list) on the wide motif and ``probNarrow``
    (a dict keyed by the re-sliced wide motif's width) on the narrow motif.

    Args:
        A, B: Motif objects exposing ``width`` and ``family``.
        thresh: Multiplied by ``maxscore`` to obtain the ``bestseqs`` cutoff
            and forwarded to the matching and frequency-estimation calls.
        verbose: Accepted but never read in this function.

    Returns:
        1.0 when the narrow motif matches none of the wide motif's sequences,
        otherwise the value returned by ``Arith.binomialsumtail``.  In the
        latter case one diagnostic line is printed to stdout.
    """
    # NOTE(review): neither cache is keyed on `thresh`, so a second call with
    # a different threshold reuses values computed for the first one --
    # confirm that callers always pass the same threshold.
    if A.width < B.width:
        wide_src, Narrow = B, A
    else:
        wide_src, Narrow = A, B

    revcomp = MotifTools.revcomplement
    # Presumably extends the wide motif by one position on each side;
    # verify against the motif class's __getitem__.
    padded = wide_src[-1, wide_src.width + 1]

    if 'bestWide' in vars(wide_src):
        bestWide = wide_src.bestWide
    else:
        seen = {}
        for hit in padded.bestseqs(thresh * padded.maxscore):
            seen[hit[1]] = 1
        # Iterate over a snapshot because the dict grows inside the loop.
        for seq in list(seen):
            seen[revcomp(seq)] = 1
        bestWide = list(seen)
        wide_src.bestWide = bestWide

    probes = ProbeSet(genome=dict(enumerate(bestWide)))
    matchNarrow = probes.count_matching_probes(Narrow, thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in vars(Narrow):
        Narrow.probNarrow = {}
    freq_by_width = Narrow.probNarrow
    width = padded.width
    if width not in freq_by_width:
        freq_by_width[width] = estimate_frequency(Narrow, width, thresh=thresh)
    probNarrow = freq_by_width[width]

    p = Arith.binomialsumtail(probNarrow, len(bestWide), matchNarrow)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s' % (
        Arith.nlog10(p), probNarrow, len(bestWide), matchNarrow,
        A.family, A, B.family, B))
    return p
def probOvlpBinomial(A, B, thresh=0.7, verbose=None):
    """Binomial tail probability for the overlap between motifs A and B.

    The motif with the larger ``width`` (A on ties) is treated as the wide
    one.  Its high-scoring sequences, taken from ``bestseqs`` on a re-sliced
    copy together with their reverse complements, form a ProbeSet; the number
    of those sequences matched by the narrow motif is then compared with the
    narrow motif's background frequency via ``Arith.binomialsumtail``.

    Two results are memoised as attributes on the motif objects:
    ``bestWide`` (the sequence list) on the wide motif and ``probNarrow``
    (a dict keyed by the re-sliced wide motif's width) on the narrow motif.

    Args:
        A, B: Motif objects exposing ``width`` and ``family``.
        thresh: Multiplied by ``maxscore`` to obtain the ``bestseqs`` cutoff
            and forwarded to the matching and frequency-estimation calls.
        verbose: Accepted but never read in this function.

    Returns:
        1.0 when the narrow motif matches none of the wide motif's sequences,
        otherwise the value returned by ``Arith.binomialsumtail``.  In the
        latter case one diagnostic line is printed to stdout.
    """
    # NOTE(review): neither cache is keyed on `thresh`, so a second call with
    # a different threshold reuses values computed for the first one --
    # confirm that callers always pass the same threshold.
    if A.width < B.width:
        wide_src, Narrow = B, A
    else:
        wide_src, Narrow = A, B

    revcomp = MotifTools.revcomplement
    # Presumably extends the wide motif by one position on each side;
    # verify against the motif class's __getitem__.
    padded = wide_src[-1, wide_src.width + 1]

    if 'bestWide' in vars(wide_src):
        bestWide = wide_src.bestWide
    else:
        seen = {}
        for hit in padded.bestseqs(thresh * padded.maxscore):
            seen[hit[1]] = 1
        # Iterate over a snapshot because the dict grows inside the loop.
        for seq in list(seen):
            seen[revcomp(seq)] = 1
        bestWide = list(seen)
        wide_src.bestWide = bestWide

    probes = ProbeSet(genome=dict(enumerate(bestWide)))
    matchNarrow = probes.count_matching_probes(Narrow, thresh=thresh)
    if matchNarrow == 0:
        return 1.0

    if 'probNarrow' not in vars(Narrow):
        Narrow.probNarrow = {}
    freq_by_width = Narrow.probNarrow
    width = padded.width
    if width not in freq_by_width:
        freq_by_width[width] = estimate_frequency(Narrow, width, thresh=thresh)
    probNarrow = freq_by_width[width]

    p = Arith.binomialsumtail(probNarrow, len(bestWide), matchNarrow)
    print('\nD= %7.3f %9.4e %8d %7d %-14s %-20s %-14s %-20s' % (
        Arith.nlog10(p), probNarrow, len(bestWide), matchNarrow,
        A.family, A, B.family, B))
    return p