def _show2d(self, bmin=None, bmax=None, margin=0.2, N_grid=20, levels=None, scatter_rule=None):
    """Draw the model's prediction over a 2-D region and overlay the stored data.

    bmin, bmax: bounds forwarded to ``utils.show2d``. If either one is None,
        both are replaced by ``self.get_boundary(margin=margin)``.
    margin: forwarded to ``self.get_boundary`` when the bounds are derived.
    N_grid: forwarded to ``utils.show2d`` as ``N``.
    levels: forwarded to ``utils.show2d``.
    scatter_rule: Y -> [(idxes_1, "blue"), ..., (idxes_n, "red")].
        When None, a rule that returns every index with the colour "red"
        is used.
    """
    def predicted_mean(x):
        # NOTE(review): the second value returned by predict() is discarded
        # and the first one is returned twice -- confirm this is intended.
        mean, _second = self.predict(x)
        return mean, mean

    def all_red(Y):
        # Y -> [(idxes_1, c1), ..., (idxes_n, cn)]
        return [(range(len(Y)), "red")]

    if bmin is None or bmax is None:
        bmin, bmax = self.get_boundary(margin=margin)

    fig, ax = plt.subplots()
    fax = (fig, ax)
    utils.show2d(predicted_mean, bmin, bmax, fax=fax, levels=levels, N=N_grid)

    if scatter_rule is None:
        scatter_rule = all_red
    utils.scatter(scatter_rule, self.X, self.Y, fax)
def show_best_x2(self, img1, img2, x1, x2, l):
    """Show two images side by side with points scattered on each.

    The left panel shows ``img1`` with ``x1`` scattered on top. The right
    panel is drawn by ``self.plot_epipolar_line(img2, l, show=False)`` and
    then gets ``x2`` scattered on top. The figure is displayed with
    ``plt.show()``.
    """
    figure = plt.figure()

    # Left panel.
    figure.add_subplot(1, 2, 1)
    plt.imshow(img1)
    scatter(x1)

    # Right panel; show=False so that both panels are displayed together.
    figure.add_subplot(1, 2, 2)
    self.plot_epipolar_line(img2, l, show=False)
    scatter(x2)

    plt.show()
def prok_model_comparison():
    """Scatter linear against pairwise BIC for every motif in ``tfdf``.

    ``model_comparison`` is run on each motif; each result is unpacked as a
    (pairwise BIC, linear BIC) pair and the two series are plotted on
    log-log axes.
    """
    # NOTE(review): machine-specific path, appended to sys.path on every call.
    sys.path.append("/home/pat/motifs")
    from parse_tfbs_data import tfdf

    prok_motifs = [getattr(tfdf, tf_name) for tf_name in tfdf.tfs]
    comparisons = []
    for motif in tqdm(prok_motifs):
        comparisons.append(model_comparison(motif))

    pairwise_bics, linear_bics = transpose(comparisons)
    scatter(linear_bics, pairwise_bics)
    plt.xlabel("Linear BIC")
    plt.ylabel("Pairwise BIC")
    plt.loglog()
def misclassification_polytope(a, c, ls):
    """creates misclassification constraints

    a: 2-D array with exactly one row (only batch size 1 is supported).
    c: column index (or indices) subtracted in ``u``.
    ls: column index (or indices) compared against ``c``.

    Returns ``(Av, vA, u)`` where ``u = a[:, ls] - a[:, c]``,
    ``Av(Vv) = Vv[:, c] - Vv[:, ls]`` and ``vA(v)`` is the sum of two
    ``scatter`` calls into ``a.shape[1]`` slots.
    """
    assert a.ndim == 2
    assert a.shape[0] == 1  # only batch size 1 is supported
    num_classes = a.shape[1]

    # u uses c and ls exactly as passed in, before they are turned into
    # 1-D index arrays below.
    u = a[:, ls] - a[:, c]

    c = np.atleast_1d(np.asarray([c]).squeeze())
    ls = np.atleast_1d(np.asarray([ls]).squeeze())

    def Av(Vv):
        return Vv[:, c] - Vv[:, ls]

    def vA(v):
        rows = np.atleast_2d(v)
        into_c = scatter(c, np.sum(rows, axis=-1, keepdims=True), num_classes)
        into_ls = scatter(ls, -rows, num_classes)
        return into_c + into_ls

    return Av, vA, u
def test_predict_ic(trials=100):
    """Scatter predicted against observed IC for randomly drawn parameters.

    For each trial a matrix is sampled from a random width ``L`` and
    ``sigma``; ``predict_ic`` is then compared with the IC of a motif of
    100 sites drawn by ``sample_motif_cftp`` for random ``mu`` and ``Ne``.
    The two values returned by ``scatter`` are printed.
    """
    pred_ics = []
    obs_ics = []
    for _ in trange(trials):
        sigma = random.random() * 5 + 0.1
        L = random.randrange(5, 15)
        matrix = sample_matrix(L, sigma)
        mu = random.random() * (-20)
        Ne = random.random() * 5 + 1
        pred_ics.append(predict_ic(matrix, mu, Ne))
        obs_ics.append(motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100)))
    r, p = scatter(pred_ics, obs_ics)
    # A single pre-formatted string prints the same text under Python 2 and 3.
    print("%s %s" % (r, p))
def test_predict_ic_from_theta(trials=100, num_matrices=10):
    """Scatter predicted against observed IC computed from random thetas.

    Each trial draws ``theta = (sigma, mu, Ne)`` and a width ``L``, then
    records ``predict_ic_from_theta`` and ``observe_ic_from_theta`` for the
    same inputs. The list lengths and the two values returned by
    ``scatter`` are printed.
    """
    pred_ics = []
    obs_ics = []
    for _ in trange(trials):
        sigma = random.random() * 5 + 0.1
        L = random.randrange(5, 15)
        mu = random.random() * (-20)
        Ne = random.random() * 5 + 1
        theta = sigma, mu, Ne
        pred_ics.append(
            predict_ic_from_theta(theta, L, num_matrices=num_matrices))
        obs_ics.append(
            observe_ic_from_theta(theta, L, num_matrices=num_matrices))
    # Pre-formatted strings print the same text under Python 2 and 3.
    print("%s %s" % (len(pred_ics), len(obs_ics)))
    r, p = scatter(pred_ics, obs_ics)
    print("%s %s" % (r, p))
def experiment2(trials=100):
    """APW models win, presumably because they have larger sigmas

    Runs ``experiment2_()`` ``trials`` times, reduces every sequence it
    returns to the mean of its logs, and scatters the transposed results.
    """
    def mean_log(xs):
        return mean([log(x) for x in xs])

    per_trial = []
    for _ in trange(trials):
        per_trial.append([mean_log(xs) for xs in experiment2_()])
    scatter(*transpose(per_trial))
# Pickle file stems (".pkl" is appended), in the order they are written.
_SPOOF_DATASETS = (
    "prok_chains", "prok_bayes_spoofs", "prok_maxent_spoofs",
    "prok_psfm_spoofs", "prok_apw_spoofs",
    "euk_submotifs", "euk_chains", "euk_bayes_spoofs",
    "euk_maxent_spoofs", "euk_psfm_spoofs", "euk_apw_spoofs",
)

# (dataset key, row label) for the four rows of the figure, top to bottom.
_SPOOF_MODELS = (
    ("maxent", "MaxEnt"), ("psfm", "PSFM"), ("apw", "APW"), ("bayes", "Bayes"),
)


def _generate_spoof_sets(motifs, iterations=10000):
    """Sample posterior chains for `motifs` and build spoofs from four models.

    Returns a dict with the keys "chains", "bayes_spoofs", "psfm_spoofs",
    "maxent_spoofs" and "apw_spoofs"; each value has one entry per motif.
    """
    chains = [posterior_chain2(motif, iterations=iterations)
              for motif in tqdm(motifs)]
    # Skip the first half of each chain, then keep every 500th sample.
    bayes_spoofs = [[motif_from_theta(theta, len(motif))
                     for theta in tqdm(chain[iterations // 2::500])]
                    for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    # Ten spoofs per motif, each with as many sites as the motif.
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(10)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent_spoofs = [spoof_maxent_motifs(motif, 10) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(10)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    return {
        "chains": chains,
        "bayes_spoofs": bayes_spoofs,
        "psfm_spoofs": psfm_spoofs,
        "maxent_spoofs": maxent_spoofs,
        "apw_spoofs": apw_spoofs,
    }


def _set_panel_ticks(set_ticks, ticks):
    """Apply `ticks` through plt.xticks / plt.yticks; None leaves the axis alone."""
    if ticks is None:
        return
    if ticks:
        set_ticks(ticks, ticks)
    else:
        set_ticks([])


def _draw_spoof_panel(index, observed, predicted, limits, text_pos,
                      xticks=None, yticks=None, xlabel=None, ylabel=None,
                      limits_after_scatter=False):
    """Draw one panel of the 4x4 grid and annotate it with r^2 and RMSD."""
    plt.subplot(4, 4, index)
    _set_panel_ticks(plt.xticks, xticks)
    _set_panel_ticks(plt.yticks, yticks)
    if not limits_after_scatter:
        plt.xlim(*limits)
        plt.ylim(*limits)
    # Predicted values go on the first axis of scatter(), observed on the second.
    r, _ = scatter(predicted, observed)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, observed, predicted)))
    if limits_after_scatter:
        plt.xlim(*limits)
        plt.ylim(*limits)
    x_text, y_text = text_pos
    plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
    plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize='large')
    if xlabel is not None:
        plt.xlabel(xlabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot observed against spoofed IC and MI for prok and euk motifs.

    generate_data: when true, posterior chains and MaxEnt / PSFM / APW /
        Bayes spoofs are generated for ``prok_motifs`` and for a subsample
        of ``euk_motifs`` and pickled to ``<name>.pkl`` in the working
        directory. When false, the same files are read back instead.

    The figure is a 4x4 grid with one row per spoofing model. The columns
    are prok IC, euk IC, prok MI and euk MI. Each panel is annotated with
    r^2 and RMSD, and the figure is handed to
    ``maybesave("spoof-statistics-rmsd.pdf")``.
    """
    data = {}
    if generate_data:
        for key, value in _generate_spoof_sets(prok_motifs).items():
            data["prok_" + key] = value
        data["euk_submotifs"] = [subsample(motif) for motif in euk_motifs]
        for key, value in _generate_spoof_sets(data["euk_submotifs"]).items():
            data["euk_" + key] = value
        for name in _SPOOF_DATASETS:
            # Binary mode: pickles are byte streams.
            with open(name + ".pkl", "wb") as f:
                cPickle.dump(data[name], f)
    else:
        for name in _SPOOF_DATASETS:
            # NOTE: only load pickles this function wrote; unpickling
            # untrusted files can execute arbitrary code.
            with open(name + ".pkl", "rb") as f:
                data[name] = cPickle.load(f)

    stat_funcs = {"ic": motif_ic_per_col, "mi": mi_per_col}
    observed = {
        ("prok", "ic"): [motif_ic_per_col(motif) for motif in prok_motifs],
        ("prok", "mi"): [mi_per_col(motif) for motif in prok_motifs],
        # NOTE(review): this is computed from euk_motifs, while the euk
        # spoofs and the observed euk MI use euk_submotifs -- confirm
        # which one is intended.
        ("euk", "ic"): [motif_ic_per_col(motif) for motif in euk_motifs],
        ("euk", "mi"): [mi_per_col(motif)
                        for motif in tqdm(data["euk_submotifs"])],
    }

    #ic_min, ic_max, mi_min, mi_max = 4.5, 25, -0.1, 0.7
    limits = {"ic": (-.1, 2.6), "mi": (-0.05, 1)}
    #ic_xtext, ic_ytext, mi_xtext, mi_ytext = 5, 20, -0.05, 0.5
    text_pos = {"ic": (-0.05, 2.2), "mi": (-0.05, 0.85)}
    ticks = {"ic": [0, 0.5, 1, 1.5, 2], "mi": [0, 0.25, 0.5, 0.75, 1]}

    sns.set_style('dark')
    last_row = len(_SPOOF_MODELS) - 1
    # Prok panels fill columns 1 and 3, euk panels columns 2 and 4.
    for domain, first_col in (("prok", 1), ("euk", 2)):
        for row, (model, model_label) in enumerate(_SPOOF_MODELS):
            spoof_sets = data["%s_%s_spoofs" % (domain, model)]
            bottom = row == last_row
            for stat, col in (("ic", first_col), ("mi", first_col + 2)):
                stat_func = stat_funcs[stat]
                predicted = [mean([stat_func(spoof) for spoof in spoofs])
                             for spoofs in tqdm(spoof_sets)]
                if domain == "prok":
                    # Prok panels always carry y ticks; x ticks only on
                    # the bottom row.
                    xticks = ticks[stat] if bottom else []
                    yticks = ticks[stat]
                elif not bottom:
                    xticks, yticks = [], []
                elif stat == "mi":
                    xticks, yticks = ticks[stat], []
                else:
                    # The bottom euk IC panel keeps matplotlib's default ticks.
                    xticks = yticks = None
                xlabel = None
                if bottom:
                    xlabel = "%s %s" % (domain.capitalize(), stat.upper())
                ylabel = None
                if domain == "prok" and stat == "ic":
                    ylabel = model_label
                _draw_spoof_panel(
                    4 * row + col,
                    observed[domain, stat],
                    predicted,
                    limits[stat],
                    text_pos[stat],
                    xticks=xticks,
                    yticks=yticks,
                    xlabel=xlabel,
                    ylabel=ylabel,
                    # The two top-row IC panels fix their limits after
                    # scattering; kept in case scatter() reads the current
                    # limits -- TODO confirm.
                    limits_after_scatter=(row == 0 and stat == "ic"),
                )

    #plt.axis('off')
    #plt.xlabel("MI (bits/column pair)",fontsize='large')
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def grand_spoofing_experiment(prok_motifs, euk_motifs):
    """Compare observed MI with the mean MI of spoofs from three models.

    For every motif, ten spoofs are drawn with ``spoof_maxent_motifs``,
    ``spoof_motif_cftp_occ`` and ``spoof_oo_motifs``. Euk motifs are passed
    through ``subsample`` first. Each spoof collection is pickled to
    ``<prok|euk>_<model>_spoofs`` in the working directory. The figure has
    one panel per model, with prok points in the default colour and euk
    points in green, and is handed to ``maybesave("mi-spoof-plot.eps")``.

    The plot uses the spoofs generated by this call. Nothing is read back
    from disk, so results cannot silently come from stale files written
    under other names.
    """
    # should we subsample once or each time??
    # As written, every spoofer and the observed euk MI below each draw
    # their own subsample of every euk motif.
    spoofers = (
        ("maxent", "MaxEnt", spoof_maxent_motifs),
        ("cftp", "Gaussian Linear Ensemble", spoof_motif_cftp_occ),
        ("oo", "Match-Mismatch", spoof_oo_motifs),
    )

    spoofs = {}
    for model, _, spoof in spoofers:
        spoofs["prok", model] = [spoof(motif, 10)
                                 for motif in tqdm(prok_motifs)]
        spoofs["euk", model] = [spoof(subsample(motif), 10)
                                for motif in tqdm(euk_motifs)]

    for model, _, _ in spoofers:
        for domain in ("prok", "euk"):
            # Binary mode: pickles are byte streams.
            with open("%s_%s_spoofs" % (domain, model), "wb") as f:
                cPickle.dump(spoofs[domain, model], f)

    prok_mis = [mi_per_col(motif) for motif in prok_motifs]
    euk_submotifs = [subsample(motif) for motif in euk_motifs]
    euk_mis = [mi_per_col(motif) for motif in euk_submotifs]

    predicted = {}
    for key, spoof_sets in spoofs.items():
        predicted[key] = [mean([mi_per_col(spoof) for spoof in motif_spoofs])
                          for motif_spoofs in tqdm(spoof_sets)]

    for position, (model, title, _) in enumerate(spoofers, 1):
        plt.subplot(1, 3, position)
        scatter(predicted["prok", model], prok_mis)
        scatter(predicted["euk", model], euk_mis, color='g')
        plt.xlabel("Predicted MI", fontsize='large')
        plt.ylabel("Observed MI", fontsize='large')
        plt.title(title, fontsize='large')
    plt.tight_layout()
    maybesave("mi-spoof-plot.eps")
def mi_sampling_experiment():
    """Scatter observed prok motif MI against the mean MI of spoofed motifs.

    Two scatters are drawn against the same observed values: one for 100
    MaxEnt spoofs per motif (``spoof_maxent_motifs``) and one for 10 spoofs
    per motif from ``spoof_motif_cftp_occ``.
    """
    prok_maxent_motifs = [spoof_maxent_motifs(motif, 100)
                          for motif in tqdm(prok_motifs)]
    prok_cftp_motifs = [spoof_motif_cftp_occ(motif, 10)
                        for motif in tqdm(prok_motifs)]

    # The observed MI is shared by both scatters, so compute it once.
    observed_mis = [motif_mi(motif) for motif in prok_motifs]
    for spoof_sets in (prok_maxent_motifs, prok_cftp_motifs):
        spoof_mis = [mean([motif_mi(spoof) for spoof in spoofs])
                     for spoofs in tqdm(spoof_sets)]
        scatter(observed_mis, spoof_mis)
# Sidebar step 2: choose one strain from the data selected earlier in the script.
st.sidebar.markdown("② ** Strain Selection **")
strains, strain_formats = strain_counts(strain_category_data)
# format_func looks each option up in strain_formats to get its display label.
strain_value = st.sidebar.selectbox(label="Strain (Test Count)", options=strains, format_func=strain_formats.get)
# "@strain_value" refers to the local variable above inside the query string.
strain_data = strain_category_data.query("test_strain == @strain_value")
st.sidebar.markdown("---")

# Optional sidebar details; both markdown files are read whether or not
# their checkbox is ticked.
st.sidebar.markdown("ℹ️ ** Details **")
desc_check = st.sidebar.checkbox("📃 Dataset Description")
desc_markdown = read_markdown_file("data_description.md")
dict_check = st.sidebar.checkbox("📕 Data Dictionary")
dict_markdown = read_markdown_file("data_dictionary.md")
if desc_check:
    st.sidebar.markdown(desc_markdown, unsafe_allow_html=True)
if dict_check:
    st.sidebar.markdown(dict_markdown, unsafe_allow_html=True)
    # NOTE(review): assumed to belong to the dict_check branch -- confirm.
    st.sidebar.code(pformat(colnames, indent=2))

# Main page: two charts and a table for the selected strain.
st.markdown("---")
st.markdown("## Strain Testing Data")
st.altair_chart(scatter(strain_data))
st.altair_chart(line(strain_data))
st.markdown("### Top 10 Highest THC Measurements")
styled_test_table = get_top_test_table(strain_data)
st.table(styled_test_table)
# Pickle file stems (".pkl" is appended), in the order they are written.
_SPOOF_DATASETS = (
    "prok_chains", "prok_bayes_spoofs", "prok_maxent_spoofs",
    "prok_psfm_spoofs", "prok_apw_spoofs",
    "euk_submotifs", "euk_chains", "euk_bayes_spoofs",
    "euk_maxent_spoofs", "euk_psfm_spoofs", "euk_apw_spoofs",
)

# (dataset key, row label) for the four rows of the figure, top to bottom.
_SPOOF_MODELS = (
    ("maxent", "MaxEnt"), ("psfm", "PSFM"), ("apw", "APW"), ("bayes", "Bayes"),
)


def _generate_spoof_sets(motifs, iterations=10000):
    """Sample posterior chains for `motifs` and build spoofs from four models.

    Returns a dict with the keys "chains", "bayes_spoofs", "psfm_spoofs",
    "maxent_spoofs" and "apw_spoofs"; each value has one entry per motif.
    """
    chains = [posterior_chain2(motif, iterations=iterations)
              for motif in tqdm(motifs)]
    # Skip the first half of each chain, then keep every 500th sample.
    bayes_spoofs = [[motif_from_theta(theta, len(motif))
                     for theta in tqdm(chain[iterations // 2::500])]
                    for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    # Ten spoofs per motif, each with as many sites as the motif.
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(10)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent_spoofs = [spoof_maxent_motifs(motif, 10) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(10)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    return {
        "chains": chains,
        "bayes_spoofs": bayes_spoofs,
        "psfm_spoofs": psfm_spoofs,
        "maxent_spoofs": maxent_spoofs,
        "apw_spoofs": apw_spoofs,
    }


def _set_panel_ticks(set_ticks, ticks):
    """Apply `ticks` through plt.xticks / plt.yticks; None leaves the axis alone."""
    if ticks is None:
        return
    if ticks:
        set_ticks(ticks, ticks)
    else:
        set_ticks([])


def _draw_spoof_panel(index, observed, predicted, limits, text_pos,
                      xticks=None, yticks=None, xlabel=None, ylabel=None,
                      limits_after_scatter=False):
    """Draw one panel of the 4x4 grid and annotate it with r^2 and RMSD."""
    plt.subplot(4, 4, index)
    _set_panel_ticks(plt.xticks, xticks)
    _set_panel_ticks(plt.yticks, yticks)
    if not limits_after_scatter:
        plt.xlim(*limits)
        plt.ylim(*limits)
    # Predicted values go on the first axis of scatter(), observed on the second.
    r, _ = scatter(predicted, observed)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, observed, predicted)))
    if limits_after_scatter:
        plt.xlim(*limits)
        plt.ylim(*limits)
    x_text, y_text = text_pos
    plt.text(x_text, y_text, s='$r^2$ = %1.3f' % (r**2))
    plt.text(x_text, y_text * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize='large')
    if xlabel is not None:
        plt.xlabel(xlabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot observed against spoofed IC and MI for prok and euk motifs.

    generate_data: when true, posterior chains and MaxEnt / PSFM / APW /
        Bayes spoofs are generated for ``prok_motifs`` and for a subsample
        of ``euk_motifs`` and pickled to ``<name>.pkl`` in the working
        directory. When false, the same files are read back instead.

    The figure is a 4x4 grid with one row per spoofing model. The columns
    are prok IC, euk IC, prok MI and euk MI. Each panel is annotated with
    r^2 and RMSD, and the figure is handed to
    ``maybesave("spoof-statistics-rmsd.pdf")``.
    """
    data = {}
    if generate_data:
        for key, value in _generate_spoof_sets(prok_motifs).items():
            data["prok_" + key] = value
        data["euk_submotifs"] = [subsample(motif) for motif in euk_motifs]
        for key, value in _generate_spoof_sets(data["euk_submotifs"]).items():
            data["euk_" + key] = value
        for name in _SPOOF_DATASETS:
            # Binary mode: pickles are byte streams.
            with open(name + ".pkl", "wb") as f:
                cPickle.dump(data[name], f)
    else:
        for name in _SPOOF_DATASETS:
            # NOTE: only load pickles this function wrote; unpickling
            # untrusted files can execute arbitrary code.
            with open(name + ".pkl", "rb") as f:
                data[name] = cPickle.load(f)

    stat_funcs = {"ic": motif_ic_per_col, "mi": mi_per_col}
    observed = {
        ("prok", "ic"): [motif_ic_per_col(motif) for motif in prok_motifs],
        ("prok", "mi"): [mi_per_col(motif) for motif in prok_motifs],
        # NOTE(review): this is computed from euk_motifs, while the euk
        # spoofs and the observed euk MI use euk_submotifs -- confirm
        # which one is intended.
        ("euk", "ic"): [motif_ic_per_col(motif) for motif in euk_motifs],
        ("euk", "mi"): [mi_per_col(motif)
                        for motif in tqdm(data["euk_submotifs"])],
    }

    #ic_min, ic_max, mi_min, mi_max = 4.5, 25, -0.1, 0.7
    limits = {"ic": (-.1, 2.6), "mi": (-0.05, 1)}
    #ic_xtext, ic_ytext, mi_xtext, mi_ytext = 5, 20, -0.05, 0.5
    text_pos = {"ic": (-0.05, 2.2), "mi": (-0.05, 0.85)}
    ticks = {"ic": [0, 0.5, 1, 1.5, 2], "mi": [0, 0.25, 0.5, 0.75, 1]}

    sns.set_style('dark')
    last_row = len(_SPOOF_MODELS) - 1
    # Prok panels fill columns 1 and 3, euk panels columns 2 and 4.
    for domain, first_col in (("prok", 1), ("euk", 2)):
        for row, (model, model_label) in enumerate(_SPOOF_MODELS):
            spoof_sets = data["%s_%s_spoofs" % (domain, model)]
            bottom = row == last_row
            for stat, col in (("ic", first_col), ("mi", first_col + 2)):
                stat_func = stat_funcs[stat]
                predicted = [mean([stat_func(spoof) for spoof in spoofs])
                             for spoofs in tqdm(spoof_sets)]
                if domain == "prok":
                    # Prok panels always carry y ticks; x ticks only on
                    # the bottom row.
                    xticks = ticks[stat] if bottom else []
                    yticks = ticks[stat]
                elif not bottom:
                    xticks, yticks = [], []
                elif stat == "mi":
                    xticks, yticks = ticks[stat], []
                else:
                    # The bottom euk IC panel keeps matplotlib's default ticks.
                    xticks = yticks = None
                xlabel = None
                if bottom:
                    xlabel = "%s %s" % (domain.capitalize(), stat.upper())
                ylabel = None
                if domain == "prok" and stat == "ic":
                    ylabel = model_label
                _draw_spoof_panel(
                    4 * row + col,
                    observed[domain, stat],
                    predicted,
                    limits[stat],
                    text_pos[stat],
                    xticks=xticks,
                    yticks=yticks,
                    xlabel=xlabel,
                    ylabel=ylabel,
                    # The two top-row IC panels fix their limits after
                    # scattering; kept in case scatter() reads the current
                    # limits -- TODO confirm.
                    limits_after_scatter=(row == 0 and stat == "ic"),
                )

    #plt.axis('off')
    #plt.xlabel("MI (bits/column pair)",fontsize='large')
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
def experiment2(trials=100):
    """APW models win, presumably because they have larger sigmas

    Runs ``experiment2_()`` ``trials`` times, reduces every sequence it
    returns to the mean of its logs, and scatters the transposed results.
    """
    def mean_log(xs):
        return mean([log(x) for x in xs])

    per_trial = []
    for _ in trange(trials):
        per_trial.append([mean_log(xs) for xs in experiment2_()])
    scatter(*transpose(per_trial))