def rejection_sample_site((matrix, mu, Ne)):
    """Compute rejection-sampling acceptance ratios for sites drawn from a PSFM.

    Takes one (matrix, mu, Ne) tuple (Python 2 tuple-parameter syntax).

    NOTE(review): the visible definition stops after `ars` is built -- there
    is no accept/reject step and no return value. It looks truncated; confirm
    against the full source before relying on it.
    """
    # Proposal distribution: candidate sites are sampled from this PSFM below.
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    # Scores a site against the element-wise log of the PSFM
    # (presumably the site's log-probability under the proposal).
    log_psfm_prob = lambda site:score_seq(log_psfm, site)
    # NOTE(review): this sums the raw per-row maxima of `psfm`, not their
    # logs, although the result is used as a log-scale constant -- confirm.
    log_M = -sum(map(max,psfm))
    # NOTE(review): `trials` is neither a parameter nor a local here;
    # presumably a module-level name -- verify.
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    # Acceptance ratio per site: f / (M * q), evaluated in log space.
    ars = [exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)]
def rejection_sample_site((matrix, mu, Ne)):
    """Compute rejection-sampling acceptance ratios for sites drawn from a PSFM.

    Takes one (matrix, mu, Ne) tuple (Python 2 tuple-parameter syntax).

    NOTE(review): the visible definition stops after `ars` is built -- there
    is no accept/reject step and no return value. It looks truncated; confirm
    against the full source before relying on it.
    """
    # Proposal distribution: candidate sites are sampled from this PSFM below.
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    # Scores a site against the element-wise log of the PSFM
    # (presumably the site's log-probability under the proposal).
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    # NOTE(review): this sums the raw per-row maxima of `psfm`, not their
    # logs, although the result is used as a log-scale constant -- confirm.
    log_M = -sum(map(max, psfm))
    # NOTE(review): `trials` is neither a parameter nor a local here;
    # presumably a module-level name -- verify.
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    # Acceptance ratio per site: f / (M * q), evaluated in log space.
    ars = [
        exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)
    ]
def predict_ic_from_theta(theta, L):
    """Sample a matrix for `theta` and compute log-weights for 100 sampled sites.

    `theta` unpacks to (sigma, mu, Ne); `L` is passed to `sample_matrix`.

    NOTE(review): the visible definition ends after `log_qs` with no return
    value, and `lamb`, `log_ps` and `log_qs` are never used afterwards -- it
    looks truncated; confirm against the full source.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    # Sum of the per-row minima of the matrix.
    ep_min = sum(map(min, matrix))
    # Target value: ep_star, but never closer than 1 to ep_min.
    des_ep = max(ep_star, ep_min + 1)
    def f(lamb):
        # Expected matrix score under the PSFM built with `lamb`, minus the
        # target des_ep; zero when the expectation equals the target.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([sum(ep*p for ep,p in zip(eps, ps)) for eps, ps in zip(matrix, psfm)]) - des_ep
    # NOTE(review): `psfm` is only assigned inside `f`; unless a module-level
    # `psfm` exists this line raises NameError. Presumably
    # psfm_from_matrix(matrix, lamb) was meant to run after the bisection
    # below -- confirm.
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    # Presumably finds a root of f on [-20, 20] -- verify bisect_interval.
    lamb = bisect_interval(f,-20,20)
    sites = ([sample_from_psfm(psfm) for i in range(100)])
    # Per-site value -nu * log(1 + exp(score - mu)).
    log_ps = [-nu*log(1+exp(score_seq(matrix, site) - mu)) for site in sites]
    # Per-site score against the log-PSFM.
    log_qs = [score_seq(log_psfm, site) for site in sites]
def predict_ic_from_theta(theta, L):
    """Sample a matrix for `theta` and compute log-weights for 100 sampled sites.

    `theta` unpacks to (sigma, mu, Ne); `L` is passed to `sample_matrix`.

    NOTE(review): the visible definition ends after `log_qs` with no return
    value, and `lamb`, `log_ps` and `log_qs` are never used afterwards -- it
    looks truncated; confirm against the full source.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    # Sum of the per-row minima of the matrix.
    ep_min = sum(map(min, matrix))
    # Target value: ep_star, but never closer than 1 to ep_min.
    des_ep = max(ep_star, ep_min + 1)
    def f(lamb):
        # Expected matrix score under the PSFM built with `lamb`, minus the
        # target des_ep; zero when the expectation equals the target.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, psfm)
        ]) - des_ep
    # NOTE(review): `psfm` is only assigned inside `f`; unless a module-level
    # `psfm` exists this line raises NameError. Presumably
    # psfm_from_matrix(matrix, lamb) was meant to run after the bisection
    # below -- confirm.
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    # Presumably finds a root of f on [-20, 20] -- verify bisect_interval.
    lamb = bisect_interval(f, -20, 20)
    sites = ([sample_from_psfm(psfm) for i in range(100)])
    # Per-site value -nu * log(1 + exp(score - mu)).
    log_ps = [
        -nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites
    ]
    # Per-site score against the log-PSFM.
    log_qs = [score_seq(log_psfm, site) for site in sites]
def _generate_spoof_sets(motifs, iterations):
    """Sample posterior chains and four families of spoof motifs for `motifs`.

    Returns (chains, spoofs) where `spoofs` maps "bayes", "psfm", "maxent"
    and "apw" to one list of spoof motifs per input motif.
    """
    replicates = 10
    chains = [posterior_chain2(motif, iterations=iterations)
              for motif in tqdm(motifs)]
    # Keep every 500th sample starting at index iterations // 2. Floor
    # division keeps the slice index an int on Python 2 and Python 3.
    bayes = [[motif_from_theta(theta, len(motif))
              for theta in tqdm(chain[iterations // 2::500])]
             for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(replicates)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent = [spoof_maxent_motifs(motif, replicates) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(replicates)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    spoofs = {"bayes": bayes, "psfm": psfm_spoofs,
              "maxent": maxent, "apw": apw_spoofs}
    return chains, spoofs


def _dump_pickle(name, obj):
    """Write `obj` to '<name>.pkl' in the working directory."""
    # Text mode is kept on purpose so caches written by earlier runs
    # (cPickle's default protocol 0) stay interchangeable.
    with open(name + ".pkl", 'w') as f:
        cPickle.dump(obj, f)


def _load_pickle(name):
    """Read the object stored in '<name>.pkl'."""
    # NOTE: only load caches this script wrote itself; unpickling untrusted
    # files can execute arbitrary code.
    with open(name + ".pkl") as f:
        return cPickle.load(f)


def _mean_stat(stat, spoof_sets):
    """Return, per motif, the mean of `stat` over that motif's spoofs."""
    return [mean([stat(spoof) for spoof in spoofs])
            for spoofs in tqdm(spoof_sets)]


def _plot_stat_panel(index, real, spoof, lims, text_xy, xticks, yticks,
                     xlabel=None, ylabel=None):
    """Draw one real-vs-spoof scatter panel of the 4x4 grid.

    `xticks` / `yticks`: a non-empty list sets ticks and labels to it, an
    empty list hides the ticks, and None leaves matplotlib's defaults.
    The panel is annotated with r^2 and the RMSD between `real` and `spoof`.
    """
    plt.subplot(4, 4, index)
    for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
        if ticks is None:
            continue
        if ticks:
            set_ticks(ticks, ticks)
        else:
            set_ticks([])
    plt.xlim(*lims)
    plt.ylim(*lims)
    # Spoof statistic is passed first, real statistic second.
    r, p = scatter(spoof, real)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, real, spoof)))
    x_text, y_text = text_xy
    plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
    plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if xlabel:
        plt.xlabel(xlabel, fontsize='large')
    if ylabel:
        plt.ylabel(ylabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot real-vs-spoof IC and MI statistics for prok and euk motifs.

    With generate_data=True the posterior chains and spoof motifs are
    sampled from scratch and cached as '*.pkl' files in the working
    directory; otherwise the cached spoofs are loaded. The result is a 4x4
    grid (rows: MaxEnt, PSFM, APW, Bayes; columns: Prok IC, Euk IC, Prok MI,
    Euk MI) handed to maybesave("spoof-statistics-rmsd.pdf").

    The Euk per-column IC is computed from `euk_submotifs`, the motifs the
    Euk spoofs were generated from. It was previously computed from
    `euk_motifs`, which compared spoofs against a different motif set.
    """
    rows = (("maxent", "MaxEnt"), ("psfm", "PSFM"),
            ("apw", "APW"), ("bayes", "Bayes"))
    if generate_data:
        iterations = 10000
        prok_chains, prok_spoofs = _generate_spoof_sets(prok_motifs,
                                                        iterations)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk_chains, euk_spoofs = _generate_spoof_sets(euk_submotifs,
                                                      iterations)
        _dump_pickle("prok_chains", prok_chains)
        for kind, _ in rows:
            _dump_pickle("prok_%s_spoofs" % kind, prok_spoofs[kind])
        _dump_pickle("euk_submotifs", euk_submotifs)
        _dump_pickle("euk_chains", euk_chains)
        for kind, _ in rows:
            _dump_pickle("euk_%s_spoofs" % kind, euk_spoofs[kind])
    else:
        # The chain caches are not needed for plotting, so they are not read.
        prok_spoofs = {kind: _load_pickle("prok_%s_spoofs" % kind)
                       for kind, _ in rows}
        euk_submotifs = _load_pickle("euk_submotifs")
        euk_spoofs = {kind: _load_pickle("euk_%s_spoofs" % kind)
                      for kind, _ in rows}

    # Compute every statistic before drawing, so a failure here leaves no
    # half-built figure.
    organisms = []
    for name, motifs, spoofs in (("Prok", prok_motifs, prok_spoofs),
                                 ("Euk", euk_submotifs, euk_spoofs)):
        real_ic = [motif_ic_per_col(motif) for motif in motifs]
        real_mi = [mi_per_col(motif) for motif in tqdm(motifs)]
        spoof_ic = {kind: _mean_stat(motif_ic_per_col, spoofs[kind])
                    for kind, _ in rows}
        spoof_mi = {kind: _mean_stat(mi_per_col, spoofs[kind])
                    for kind, _ in rows}
        organisms.append((name, real_ic, real_mi, spoof_ic, spoof_mi))

    ic_lims, mi_lims = (-.1, 2.6), (-0.05, 1)
    ic_text, mi_text = (-0.05, 2.2), (-0.05, 0.85)
    ic_ticks = [0, 0.5, 1, 1.5, 2]
    mi_ticks = [0, 0.25, 0.5, 0.75, 1]
    sns.set_style('dark')
    last_row = len(rows) - 1
    for col, (name, real_ic, real_mi, spoof_ic, spoof_mi) in enumerate(
            organisms, 1):
        is_prok = col == 1
        for row, (kind, label) in enumerate(rows):
            bottom = row == last_row
            if is_prok:
                # Prok panels carry the y tick labels for the whole grid.
                ic_x = ic_ticks if bottom else []
                ic_y = ic_ticks
                mi_y = mi_ticks
            else:
                # The bottom Euk IC panel keeps matplotlib's default ticks,
                # as in the original layout.
                ic_x = ic_y = None if bottom else []
                mi_y = []
            mi_x = mi_ticks if bottom else []
            _plot_stat_panel(4 * row + col, real_ic, spoof_ic[kind],
                             ic_lims, ic_text, ic_x, ic_y,
                             xlabel="%s IC" % name if bottom else None,
                             ylabel=label if is_prok else None)
            _plot_stat_panel(4 * row + col + 2, real_mi, spoof_mi[kind],
                             mi_lims, mi_text, mi_x, mi_y,
                             xlabel="%s MI" % name if bottom else None)
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def prop(_):
    """Return a fresh site sampled from `tilted_psfm`; the argument is ignored."""
    proposal = sample_from_psfm(tilted_psfm)
    return proposal
mat_mu = sum(map(mean, matrix)) mat_sigma = sqrt(sum(map(lambda xs: variance(xs, correct=False), matrix))) log_perc_below_threshold = norm.logcdf(mu - log((Ne - 1)), mat_mu, mat_sigma) log_Zs = L * log(4) + log_perc_below_threshold ans_ref = ((N * L * log(4)) + log_perc_below_threshold) ans = N * log_Zs return ans def log_ZS_importance((matrix, mu, Ne), trials=1000): L = len(matrix) psfm = psfm_from_matrix(matrix) log_psfm = [[log(p) for p in row] for row in psfm] log_psfm_prob = lambda site: score_seq(log_psfm, site) sites = [sample_from_psfm(psfm) for _ in xrange(trials)] mean_ZS = mean( exp( log_fhat((matrix, mu, Ne), [site]) + log(1.0 / 4**L) - log_psfm_prob(site)) for site in sites) ZS = 4**L * mean_ZS return log(ZS) def log_ZS_importance_ref((matrix, mu, Ne), trials=1000): L = len(matrix) psfm = [[0.25] * 4 for _ in range(L)] log_psfm = [[log(p) for p in row] for row in psfm] log_psfm_prob = lambda site: score_seq(log_psfm, site) sites = [sample_from_psfm(psfm) for _ in xrange(trials)] mean_ZS = mean(
def _generate_spoof_sets(motifs, iterations):
    """Sample posterior chains and four families of spoof motifs for `motifs`.

    Returns (chains, spoofs) where `spoofs` maps "bayes", "psfm", "maxent"
    and "apw" to one list of spoof motifs per input motif.
    """
    replicates = 10
    chains = [posterior_chain2(motif, iterations=iterations)
              for motif in tqdm(motifs)]
    # Keep every 500th sample starting at index iterations // 2. Floor
    # division keeps the slice index an int on Python 2 and Python 3.
    bayes = [[motif_from_theta(theta, len(motif))
              for theta in tqdm(chain[iterations // 2::500])]
             for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(replicates)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent = [spoof_maxent_motifs(motif, replicates) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(replicates)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    spoofs = {"bayes": bayes, "psfm": psfm_spoofs,
              "maxent": maxent, "apw": apw_spoofs}
    return chains, spoofs


def _dump_pickle(name, obj):
    """Write `obj` to '<name>.pkl' in the working directory."""
    # Text mode is kept on purpose so caches written by earlier runs
    # (cPickle's default protocol 0) stay interchangeable.
    with open(name + ".pkl", 'w') as f:
        cPickle.dump(obj, f)


def _load_pickle(name):
    """Read the object stored in '<name>.pkl'."""
    # NOTE: only load caches this script wrote itself; unpickling untrusted
    # files can execute arbitrary code.
    with open(name + ".pkl") as f:
        return cPickle.load(f)


def _mean_stat(stat, spoof_sets):
    """Return, per motif, the mean of `stat` over that motif's spoofs."""
    return [mean([stat(spoof) for spoof in spoofs])
            for spoofs in tqdm(spoof_sets)]


def _plot_stat_panel(index, real, spoof, lims, text_xy, xticks, yticks,
                     xlabel=None, ylabel=None):
    """Draw one real-vs-spoof scatter panel of the 4x4 grid.

    `xticks` / `yticks`: a non-empty list sets ticks and labels to it, an
    empty list hides the ticks, and None leaves matplotlib's defaults.
    The panel is annotated with r^2 and the RMSD between `real` and `spoof`.
    """
    plt.subplot(4, 4, index)
    for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
        if ticks is None:
            continue
        if ticks:
            set_ticks(ticks, ticks)
        else:
            set_ticks([])
    plt.xlim(*lims)
    plt.ylim(*lims)
    # Spoof statistic is passed first, real statistic second.
    r, p = scatter(spoof, real)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, real, spoof)))
    x_text, y_text = text_xy
    plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
    plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if xlabel:
        plt.xlabel(xlabel, fontsize='large')
    if ylabel:
        plt.ylabel(ylabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot real-vs-spoof IC and MI statistics for prok and euk motifs.

    With generate_data=True the posterior chains and spoof motifs are
    sampled from scratch and cached as '*.pkl' files in the working
    directory; otherwise the cached spoofs are loaded. The result is a 4x4
    grid (rows: MaxEnt, PSFM, APW, Bayes; columns: Prok IC, Euk IC, Prok MI,
    Euk MI) handed to maybesave("spoof-statistics-rmsd.pdf").

    The Euk per-column IC is computed from `euk_submotifs`, the motifs the
    Euk spoofs were generated from. It was previously computed from
    `euk_motifs`, which compared spoofs against a different motif set.
    """
    rows = (("maxent", "MaxEnt"), ("psfm", "PSFM"),
            ("apw", "APW"), ("bayes", "Bayes"))
    if generate_data:
        iterations = 10000
        prok_chains, prok_spoofs = _generate_spoof_sets(prok_motifs,
                                                        iterations)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk_chains, euk_spoofs = _generate_spoof_sets(euk_submotifs,
                                                      iterations)
        _dump_pickle("prok_chains", prok_chains)
        for kind, _ in rows:
            _dump_pickle("prok_%s_spoofs" % kind, prok_spoofs[kind])
        _dump_pickle("euk_submotifs", euk_submotifs)
        _dump_pickle("euk_chains", euk_chains)
        for kind, _ in rows:
            _dump_pickle("euk_%s_spoofs" % kind, euk_spoofs[kind])
    else:
        # The chain caches are not needed for plotting, so they are not read.
        prok_spoofs = {kind: _load_pickle("prok_%s_spoofs" % kind)
                       for kind, _ in rows}
        euk_submotifs = _load_pickle("euk_submotifs")
        euk_spoofs = {kind: _load_pickle("euk_%s_spoofs" % kind)
                      for kind, _ in rows}

    # Compute every statistic before drawing, so a failure here leaves no
    # half-built figure.
    organisms = []
    for name, motifs, spoofs in (("Prok", prok_motifs, prok_spoofs),
                                 ("Euk", euk_submotifs, euk_spoofs)):
        real_ic = [motif_ic_per_col(motif) for motif in motifs]
        real_mi = [mi_per_col(motif) for motif in tqdm(motifs)]
        spoof_ic = {kind: _mean_stat(motif_ic_per_col, spoofs[kind])
                    for kind, _ in rows}
        spoof_mi = {kind: _mean_stat(mi_per_col, spoofs[kind])
                    for kind, _ in rows}
        organisms.append((name, real_ic, real_mi, spoof_ic, spoof_mi))

    ic_lims, mi_lims = (-.1, 2.6), (-0.05, 1)
    ic_text, mi_text = (-0.05, 2.2), (-0.05, 0.85)
    ic_ticks = [0, 0.5, 1, 1.5, 2]
    mi_ticks = [0, 0.25, 0.5, 0.75, 1]
    sns.set_style('dark')
    last_row = len(rows) - 1
    for col, (name, real_ic, real_mi, spoof_ic, spoof_mi) in enumerate(
            organisms, 1):
        is_prok = col == 1
        for row, (kind, label) in enumerate(rows):
            bottom = row == last_row
            if is_prok:
                # Prok panels carry the y tick labels for the whole grid.
                ic_x = ic_ticks if bottom else []
                ic_y = ic_ticks
                mi_y = mi_ticks
            else:
                # The bottom Euk IC panel keeps matplotlib's default ticks,
                # as in the original layout.
                ic_x = ic_y = None if bottom else []
                mi_y = []
            mi_x = mi_ticks if bottom else []
            _plot_stat_panel(4 * row + col, real_ic, spoof_ic[kind],
                             ic_lims, ic_text, ic_x, ic_y,
                             xlabel="%s IC" % name if bottom else None,
                             ylabel=label if is_prok else None)
            _plot_stat_panel(4 * row + col + 2, real_mi, spoof_mi[kind],
                             mi_lims, mi_text, mi_x, mi_y,
                             xlabel="%s MI" % name if bottom else None)
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def log_Z_hack(theta, N):
    """Approximate log Z for N sites using a normal fit to the matrix scores.

    Args:
        theta: (matrix, mu, Ne) triple, passed positionally as before.
        N: multiplier applied to the per-site log partition value.

    Returns:
        N * (L * log(4) + log_cdf), where L = len(matrix) and log_cdf is the
        normal log-CDF at mu - log(Ne - 1), with mean equal to the sum of row
        means and variance equal to the sum of (uncorrected) row variances.
    """
    matrix, mu, Ne = theta
    L = len(matrix)
    mat_mu = sum([mean(row) for row in matrix])
    mat_sigma = sqrt(sum([variance(row, correct=False) for row in matrix]))
    log_perc_below_threshold = norm.logcdf(mu - log(Ne - 1), mat_mu,
                                           mat_sigma)
    log_Zs = L * log(4) + log_perc_below_threshold
    return N * log_Zs


def _log_importance_estimate(theta, psfm, trials):
    """Return log(mean_i f(s_i) / q(s_i)) for `trials` sites s_i drawn from `psfm`.

    f is exp(log_fhat(theta, [site])) and q is the probability of the site
    under `psfm`. The mean is taken with log-sum-exp so that very negative
    log-weights do not underflow to log(0). `trials` must be positive.
    """
    matrix, mu, Ne = theta
    log_psfm = [[log(p) for p in row] for row in psfm]
    # Draw every site first, then score, so the sampling order is unchanged.
    sites = [sample_from_psfm(psfm) for _ in range(trials)]
    log_weights = [log_fhat((matrix, mu, Ne), [site])
                   - score_seq(log_psfm, site)
                   for site in sites]
    peak = max(log_weights)
    if peak == float("-inf"):
        # Every weight is zero, so the estimate of Z_S is zero.
        return peak
    scaled_sum = sum([exp(w - peak) for w in log_weights])
    return peak + log(scaled_sum / float(len(log_weights)))


def log_ZS_importance(theta, trials=1000):
    """Importance-sampling estimate of log Z_S, proposing from the matrix PSFM.

    Args:
        theta: (matrix, mu, Ne) triple, passed positionally as before.
        trials: number of sites to sample.
    """
    matrix, mu, Ne = theta
    psfm = psfm_from_matrix(matrix)
    return _log_importance_estimate((matrix, mu, Ne), psfm, trials)


def log_ZS_importance_ref(theta, trials=1000):
    """Reference estimate of log Z_S, proposing from the uniform PSFM.

    Args:
        theta: (matrix, mu, Ne) triple, passed positionally as before.
        trials: number of sites to sample.
    """
    matrix, mu, Ne = theta
    L = len(matrix)
    psfm = [[0.25] * 4 for _ in range(L)]
    return _log_importance_estimate((matrix, mu, Ne), psfm, trials)