Пример #1
0
def rejection_sample_site((matrix, mu, Ne)):
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site:score_seq(log_psfm, site)
    log_M = -sum(map(max,psfm))
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    ars = [exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)]
Пример #2
0
def rejection_sample_site((matrix, mu, Ne)):
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    log_M = -sum(map(max, psfm))
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    ars = [
        exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)
    ]
Пример #3
0
def predict_ic_from_theta(theta, L):
    """Sample sites from a tilted PSFM and compute their importance log-weights.

    Args:
        theta: a ``(sigma, mu, Ne)`` triple.
        L: passed to ``sample_matrix`` together with ``sigma``.

    A matrix is drawn with ``sample_matrix(L, sigma)``.  The tilt ``lamb`` is
    the root, found by ``bisect_interval`` on [-20, 20], of the difference
    between the PSFM-weighted mean matrix score and the target ``des_ep``.
    100 sites are then sampled from the PSFM at that tilt, and for each site
    the code computes ``log_ps`` (``-nu * log(1 + exp(score - mu))``) and
    ``log_qs`` (its log-probability under the PSFM).

    The code shown stops after ``log_qs``; it does not return a value.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Target mean score: never closer than 1 to the matrix minimum.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Mean score under the PSFM tilted by `lamb`, minus the target.
        tilted = psfm_from_matrix(matrix, lamb)
        mean_ep = sum(
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, tilted))
        return mean_ep - des_ep

    lamb = bisect_interval(f, -20, 20)
    # Bind the PSFM at the solved tilt in this scope.  Previously `psfm` was
    # assigned only inside `f`, so the lines below raised NameError (or
    # silently picked up an unrelated global), and `log_psfm` was built
    # before `lamb` had been computed.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for _ in range(100)]
    log_ps = [-nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
Пример #4
0
def predict_ic_from_theta(theta, L):
    """Sample sites from a tilted PSFM and compute their importance log-weights.

    Args:
        theta: a ``(sigma, mu, Ne)`` triple.
        L: passed to ``sample_matrix`` together with ``sigma``.

    A matrix is drawn with ``sample_matrix(L, sigma)``.  The tilt ``lamb`` is
    the root, found by ``bisect_interval`` on [-20, 20], of the difference
    between the PSFM-weighted mean matrix score and the target ``des_ep``.
    100 sites are then sampled from the PSFM at that tilt, and for each site
    the code computes ``log_ps`` (``-nu * log(1 + exp(score - mu))``) and
    ``log_qs`` (its log-probability under the PSFM).

    The code shown stops after ``log_qs``; it does not return a value.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Target mean score: never closer than 1 to the matrix minimum.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Mean score under the PSFM tilted by `lamb`, minus the target.
        tilted = psfm_from_matrix(matrix, lamb)
        mean_ep = sum(
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, tilted))
        return mean_ep - des_ep

    lamb = bisect_interval(f, -20, 20)
    # Bind the PSFM at the solved tilt in this scope.  Previously `psfm` was
    # assigned only inside `f`, so the lines below raised NameError (or
    # silently picked up an unrelated global), and `log_psfm` was built
    # before `lamb` had been computed.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for _ in range(100)]
    log_ps = [
        -nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites
    ]
    log_qs = [score_seq(log_psfm, site) for site in sites]
Пример #5
0
# Row label and pickle/dict key of each spoof family, in plot-row order.
_SPOOF_ROWS = (
    ("MaxEnt", "maxent_spoofs"),
    ("PSFM", "psfm_spoofs"),
    ("APW", "apw_spoofs"),
    ("Bayes", "bayes_spoofs"),
)
# Everything written to disk per organism, in the order it is dumped.
_PICKLED_KINDS = ("chains", "bayes_spoofs", "maxent_spoofs", "psfm_spoofs",
                  "apw_spoofs")


def _load_pickle(path):
    """Return the object unpickled from *path*."""
    # NOTE: unpickling executes arbitrary code; only load files this script
    # wrote itself.  Text mode is kept to match how the files are written.
    with open(path) as f:
        return cPickle.load(f)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*."""
    with open(path, 'w') as f:
        cPickle.dump(obj, f)


def _generate_spoofs(motifs, iterations):
    """Return a dict with the posterior chains and the four spoof families."""
    chains = [
        posterior_chain2(motif, iterations=iterations)
        for motif in tqdm(motifs)
    ]
    # Second half of each chain, thinned to every 500th sample.
    bayes_spoofs = [[
        motif_from_theta(theta, len(motif))
        for theta in tqdm(chain[iterations // 2::500])
    ] for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(10)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent_spoofs = [spoof_maxent_motifs(motif, 10) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(10)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    return {
        "chains": chains,
        "bayes_spoofs": bayes_spoofs,
        "maxent_spoofs": maxent_spoofs,
        "psfm_spoofs": psfm_spoofs,
        "apw_spoofs": apw_spoofs,
    }


def _load_spoofs(prefix):
    """Load the plotted spoof families stored as ``<prefix>_<kind>.pkl``."""
    return dict((kind, _load_pickle("%s_%s.pkl" % (prefix, kind)))
                for _, kind in _SPOOF_ROWS)


def _mean_stat(stat, spoof_sets):
    """Return, per motif, the mean of *stat* over that motif's spoofs."""
    return [mean([stat(spoof) for spoof in spoofs]) for spoofs in spoof_sets]


def _spoof_statistics(motifs, spoofs):
    """Return (ics, mis, spoof_ics, spoof_mis) for one set of motifs.

    ``ics``/``mis`` hold ``motif_ic_per_col``/``mi_per_col`` of each motif;
    the ``spoof_*`` dicts map a spoof kind to the per-motif mean of the same
    statistic over that motif's spoofs.
    """
    ics = [motif_ic_per_col(motif) for motif in motifs]
    mis = [mi_per_col(motif) for motif in tqdm(motifs)]
    spoof_ics = dict((kind, _mean_stat(motif_ic_per_col, spoofs[kind]))
                     for _, kind in _SPOOF_ROWS)
    spoof_mis = dict((kind, _mean_stat(mi_per_col, tqdm(spoofs[kind])))
                     for _, kind in _SPOOF_ROWS)
    return ics, mis, spoof_ics, spoof_mis


def _plot_spoof_panel(index, observed, spoofed, limits, text_xy,
                      xticks=None, yticks=None, xlabel=None, ylabel=None,
                      scatter_first=False):
    """Draw one panel of the 4x4 grid and annotate it with r^2 and RMSD.

    ``xticks``/``yticks``: ``None`` leaves matplotlib's default ticks, an
    empty list hides them, any other list is used as both positions and
    labels.  ``scatter_first`` draws the points before the axis limits are
    set instead of after.
    """
    plt.subplot(4, 4, index)
    for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
        if ticks is None:
            continue
        if ticks:
            set_ticks(ticks, ticks)
        else:
            set_ticks([])
    low, high = limits
    # Spoofed values go in as scatter's first argument, observed as second.
    if scatter_first:
        r, p = scatter(spoofed, observed)
    plt.xlim(low, high)
    plt.ylim(low, high)
    if not scatter_first:
        r, p = scatter(spoofed, observed)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, observed, spoofed)))
    xtext, ytext = text_xy
    plt.text(xtext, ytext, s='$r^2$ = %1.3f' % (r**2))
    plt.text(xtext, ytext * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if xlabel is not None:
        plt.xlabel(xlabel, fontsize='large')
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot observed versus spoofed motif statistics in a 4x4 grid.

    Args:
        generate_data: if true, build the posterior chains and the MaxEnt,
            PSFM, APW and Bayes spoofs for ``prok_motifs`` and for a
            subsample of ``euk_motifs``, and pickle them to ``prok_*.pkl`` /
            ``euk_*.pkl`` in the working directory.  Otherwise load the
            spoofs and ``euk_submotifs`` from those files.

    Each grid row is one spoof family; the columns are Prok IC, Euk IC,
    Prok MI and Euk MI.  Every panel compares the per-motif statistic
    (``motif_ic_per_col`` or ``mi_per_col``) with its mean over that motif's
    spoofs.  The figure is handed to ``maybesave`` as
    "spoof-statistics-rmsd.pdf".

    Observed eukaryotic statistics are computed on ``euk_submotifs``, the
    same motifs the eukaryotic spoofs were generated from.
    """
    if generate_data:
        iterations = 10000
        prok = _generate_spoofs(prok_motifs, iterations)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk = _generate_spoofs(euk_submotifs, iterations)
        for kind in _PICKLED_KINDS:
            _dump_pickle(prok[kind], "prok_%s.pkl" % kind)
        _dump_pickle(euk_submotifs, "euk_submotifs.pkl")
        for kind in _PICKLED_KINDS:
            _dump_pickle(euk[kind], "euk_%s.pkl" % kind)
    else:
        # The chains are only needed to generate spoofs, so they are not
        # loaded here.
        prok = _load_spoofs("prok")
        euk_submotifs = _load_pickle("euk_submotifs.pkl")
        euk = _load_spoofs("euk")

    stats = {
        "Prok": _spoof_statistics(prok_motifs, prok),
        "Euk": _spoof_statistics(euk_submotifs, euk),
    }

    ic_limits, mi_limits = (-.1, 2.6), (-0.05, 1)
    ic_text, mi_text = (-0.05, 2.2), (-0.05, 0.85)
    mi_ticks = [0, 0.25, 0.5, 0.75, 1]
    ic_ticks = [0, 0.5, 1, 1.5, 2]
    sns.set_style('dark')
    bottom_row = len(_SPOOF_ROWS) - 1
    for column, organism in enumerate(("Prok", "Euk")):
        ics, mis, spoof_ics, spoof_mis = stats[organism]
        # Only the Prok (left-hand) panels carry y ticks and row labels.
        is_prok = organism == "Prok"
        for row, (method, kind) in enumerate(_SPOOF_ROWS):
            is_bottom = row == bottom_row
            if not is_bottom:
                ic_xticks = []
            elif is_prok:
                ic_xticks = ic_ticks
            else:
                # The Euk Bayes IC panel keeps matplotlib's default x ticks.
                ic_xticks = None
            _plot_spoof_panel(
                4 * row + column + 1, ics, spoof_ics[kind],
                ic_limits, ic_text,
                xticks=ic_xticks,
                yticks=ic_ticks if is_prok else [],
                xlabel="%s IC" % organism if is_bottom else None,
                ylabel=method if is_prok else None,
                # The MaxEnt IC panels draw their points before the limits
                # are set; every other panel sets the limits first.
                scatter_first=(row == 0))
            _plot_spoof_panel(
                4 * row + column + 3, mis, spoof_mis[kind],
                mi_limits, mi_text,
                xticks=mi_ticks if is_bottom else [],
                yticks=mi_ticks if is_prok else [],
                xlabel="%s MI" % organism if is_bottom else None)
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
Пример #6
0
 def prop(_):
     """Return a fresh sample from ``tilted_psfm``; the argument is ignored."""
     # `tilted_psfm` is a free variable taken from the enclosing scope.
     proposal = sample_from_psfm(tilted_psfm)
     return proposal
Пример #7
0
    mat_mu = sum(map(mean, matrix))
    mat_sigma = sqrt(sum(map(lambda xs: variance(xs, correct=False), matrix)))
    log_perc_below_threshold = norm.logcdf(mu - log((Ne - 1)), mat_mu,
                                           mat_sigma)
    log_Zs = L * log(4) + log_perc_below_threshold
    ans_ref = ((N * L * log(4)) + log_perc_below_threshold)
    ans = N * log_Zs
    return ans


def log_ZS_importance((matrix, mu, Ne), trials=1000):
    L = len(matrix)
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    mean_ZS = mean(
        exp(
            log_fhat((matrix, mu, Ne), [site]) + log(1.0 / 4**L) -
            log_psfm_prob(site)) for site in sites)
    ZS = 4**L * mean_ZS
    return log(ZS)


def log_ZS_importance_ref((matrix, mu, Ne), trials=1000):
    L = len(matrix)
    psfm = [[0.25] * 4 for _ in range(L)]
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    mean_ZS = mean(
Пример #8
0
 def prop(_):
     """Return a fresh sample from ``tilted_psfm``; the argument is ignored."""
     # `tilted_psfm` is a free variable taken from the enclosing scope.
     proposal = sample_from_psfm(tilted_psfm)
     return proposal
Пример #9
0
# Row label and pickle/dict key of each spoof family, in plot-row order.
_SPOOF_ROWS = (
    ("MaxEnt", "maxent_spoofs"),
    ("PSFM", "psfm_spoofs"),
    ("APW", "apw_spoofs"),
    ("Bayes", "bayes_spoofs"),
)
# Everything written to disk per organism, in the order it is dumped.
_PICKLED_KINDS = ("chains", "bayes_spoofs", "maxent_spoofs", "psfm_spoofs",
                  "apw_spoofs")


def _load_pickle(path):
    """Return the object unpickled from *path*."""
    # NOTE: unpickling executes arbitrary code; only load files this script
    # wrote itself.  Text mode is kept to match how the files are written.
    with open(path) as f:
        return cPickle.load(f)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*."""
    with open(path, 'w') as f:
        cPickle.dump(obj, f)


def _generate_spoofs(motifs, iterations):
    """Return a dict with the posterior chains and the four spoof families."""
    chains = [
        posterior_chain2(motif, iterations=iterations)
        for motif in tqdm(motifs)
    ]
    # Second half of each chain, thinned to every 500th sample.
    bayes_spoofs = [[
        motif_from_theta(theta, len(motif))
        for theta in tqdm(chain[iterations // 2::500])
    ] for chain, motif in tqdm(zip(chains, motifs))]
    psfms = [psfm_from_motif(motif, pc=1 / 4.0) for motif in motifs]
    psfm_spoofs = [[[sample_from_psfm(psfm) for _ in range(len(motif))]
                    for _ in range(10)]
                   for psfm, motif in zip(psfms, motifs)]
    maxent_spoofs = [spoof_maxent_motifs(motif, 10) for motif in tqdm(motifs)]
    apws = [code_from_motif(motif, pc=1 / 16.0) for motif in tqdm(motifs)]
    apw_spoofs = [[[sample_site(apw) for _ in range(len(motif))]
                   for _ in range(10)]
                  for apw, motif in tqdm(zip(apws, motifs))]
    return {
        "chains": chains,
        "bayes_spoofs": bayes_spoofs,
        "maxent_spoofs": maxent_spoofs,
        "psfm_spoofs": psfm_spoofs,
        "apw_spoofs": apw_spoofs,
    }


def _load_spoofs(prefix):
    """Load the plotted spoof families stored as ``<prefix>_<kind>.pkl``."""
    return dict((kind, _load_pickle("%s_%s.pkl" % (prefix, kind)))
                for _, kind in _SPOOF_ROWS)


def _mean_stat(stat, spoof_sets):
    """Return, per motif, the mean of *stat* over that motif's spoofs."""
    return [mean([stat(spoof) for spoof in spoofs]) for spoofs in spoof_sets]


def _spoof_statistics(motifs, spoofs):
    """Return (ics, mis, spoof_ics, spoof_mis) for one set of motifs.

    ``ics``/``mis`` hold ``motif_ic_per_col``/``mi_per_col`` of each motif;
    the ``spoof_*`` dicts map a spoof kind to the per-motif mean of the same
    statistic over that motif's spoofs.
    """
    ics = [motif_ic_per_col(motif) for motif in motifs]
    mis = [mi_per_col(motif) for motif in tqdm(motifs)]
    spoof_ics = dict((kind, _mean_stat(motif_ic_per_col, spoofs[kind]))
                     for _, kind in _SPOOF_ROWS)
    spoof_mis = dict((kind, _mean_stat(mi_per_col, tqdm(spoofs[kind])))
                     for _, kind in _SPOOF_ROWS)
    return ics, mis, spoof_ics, spoof_mis


def _plot_spoof_panel(index, observed, spoofed, limits, text_xy,
                      xticks=None, yticks=None, xlabel=None, ylabel=None,
                      scatter_first=False):
    """Draw one panel of the 4x4 grid and annotate it with r^2 and RMSD.

    ``xticks``/``yticks``: ``None`` leaves matplotlib's default ticks, an
    empty list hides them, any other list is used as both positions and
    labels.  ``scatter_first`` draws the points before the axis limits are
    set instead of after.
    """
    plt.subplot(4, 4, index)
    for set_ticks, ticks in ((plt.xticks, xticks), (plt.yticks, yticks)):
        if ticks is None:
            continue
        if ticks:
            set_ticks(ticks, ticks)
        else:
            set_ticks([])
    low, high = limits
    # Spoofed values go in as scatter's first argument, observed as second.
    if scatter_first:
        r, p = scatter(spoofed, observed)
    plt.xlim(low, high)
    plt.ylim(low, high)
    if not scatter_first:
        r, p = scatter(spoofed, observed)
    rmsd = sqrt(mean(zipWith(lambda x, y: (x - y)**2, observed, spoofed)))
    xtext, ytext = text_xy
    plt.text(xtext, ytext, s='$r^2$ = %1.3f' % (r**2))
    plt.text(xtext, ytext * 0.8, s='$RMSD$ = %1.3f' % rmsd)
    if xlabel is not None:
        plt.xlabel(xlabel, fontsize='large')
    if ylabel is not None:
        plt.ylabel(ylabel, fontsize='large')


def main_experiment(generate_data=False):
    """Plot observed versus spoofed motif statistics in a 4x4 grid.

    Args:
        generate_data: if true, build the posterior chains and the MaxEnt,
            PSFM, APW and Bayes spoofs for ``prok_motifs`` and for a
            subsample of ``euk_motifs``, and pickle them to ``prok_*.pkl`` /
            ``euk_*.pkl`` in the working directory.  Otherwise load the
            spoofs and ``euk_submotifs`` from those files.

    Each grid row is one spoof family; the columns are Prok IC, Euk IC,
    Prok MI and Euk MI.  Every panel compares the per-motif statistic
    (``motif_ic_per_col`` or ``mi_per_col``) with its mean over that motif's
    spoofs.  The figure is handed to ``maybesave`` as
    "spoof-statistics-rmsd.pdf".

    Observed eukaryotic statistics are computed on ``euk_submotifs``, the
    same motifs the eukaryotic spoofs were generated from.
    """
    if generate_data:
        iterations = 10000
        prok = _generate_spoofs(prok_motifs, iterations)
        euk_submotifs = [subsample(motif) for motif in euk_motifs]
        euk = _generate_spoofs(euk_submotifs, iterations)
        for kind in _PICKLED_KINDS:
            _dump_pickle(prok[kind], "prok_%s.pkl" % kind)
        _dump_pickle(euk_submotifs, "euk_submotifs.pkl")
        for kind in _PICKLED_KINDS:
            _dump_pickle(euk[kind], "euk_%s.pkl" % kind)
    else:
        # The chains are only needed to generate spoofs, so they are not
        # loaded here.
        prok = _load_spoofs("prok")
        euk_submotifs = _load_pickle("euk_submotifs.pkl")
        euk = _load_spoofs("euk")

    stats = {
        "Prok": _spoof_statistics(prok_motifs, prok),
        "Euk": _spoof_statistics(euk_submotifs, euk),
    }

    ic_limits, mi_limits = (-.1, 2.6), (-0.05, 1)
    ic_text, mi_text = (-0.05, 2.2), (-0.05, 0.85)
    mi_ticks = [0, 0.25, 0.5, 0.75, 1]
    ic_ticks = [0, 0.5, 1, 1.5, 2]
    sns.set_style('dark')
    bottom_row = len(_SPOOF_ROWS) - 1
    for column, organism in enumerate(("Prok", "Euk")):
        ics, mis, spoof_ics, spoof_mis = stats[organism]
        # Only the Prok (left-hand) panels carry y ticks and row labels.
        is_prok = organism == "Prok"
        for row, (method, kind) in enumerate(_SPOOF_ROWS):
            is_bottom = row == bottom_row
            if not is_bottom:
                ic_xticks = []
            elif is_prok:
                ic_xticks = ic_ticks
            else:
                # The Euk Bayes IC panel keeps matplotlib's default x ticks.
                ic_xticks = None
            _plot_spoof_panel(
                4 * row + column + 1, ics, spoof_ics[kind],
                ic_limits, ic_text,
                xticks=ic_xticks,
                yticks=ic_ticks if is_prok else [],
                xlabel="%s IC" % organism if is_bottom else None,
                ylabel=method if is_prok else None,
                # The MaxEnt IC panels draw their points before the limits
                # are set; every other panel sets the limits first.
                scatter_first=(row == 0))
            _plot_spoof_panel(
                4 * row + column + 3, mis, spoof_mis[kind],
                mi_limits, mi_text,
                xticks=mi_ticks if is_bottom else [],
                yticks=mi_ticks if is_prok else [],
                xlabel="%s MI" % organism if is_bottom else None)
    plt.tight_layout()
    maybesave("spoof-statistics-rmsd.pdf")
Пример #10
0
def log_Z_hack((matrix, mu, Ne), N):
    L = len(matrix)
    mat_mu = sum(map(mean,matrix))
    mat_sigma = sqrt(sum(map(lambda xs:variance(xs,correct=False), matrix)))
    log_perc_below_threshold = norm.logcdf(mu - log((Ne-1)), mat_mu, mat_sigma)
    log_Zs = L * log(4) + log_perc_below_threshold
    ans_ref = ((N*L * log(4)) +  log_perc_below_threshold)
    ans = N * log_Zs
    return ans

def log_ZS_importance((matrix, mu, Ne), trials=1000):
    L = len(matrix)
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site:score_seq(log_psfm, site)
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    mean_ZS = mean(exp(log_fhat((matrix, mu, Ne), [site]) + log(1.0/4**L) - log_psfm_prob(site))
                  for site in sites)
    ZS = 4**L * mean_ZS
    return log(ZS)

def log_ZS_importance_ref((matrix, mu, Ne), trials=1000):
    L = len(matrix)
    psfm = [[0.25]*4 for _ in range(L)]
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site:score_seq(log_psfm, site)
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    mean_ZS = mean(exp(log_fhat((matrix, mu, Ne), [site]) + log(1.0/4**L) - log_psfm_prob(site))
                  for site in sites)
    ZS = 4**L * mean_ZS
    return log(ZS)