Пример #1
0
def restriction_of_range_half_site_experiment(motif):
    """is energy of first half-site negatively correlated with energy of second half-site?

    motif: sequence of equal-length sites; the width is read from motif[0].

    The matrix built by matrix_from_motif and every site are split at the
    midpoint, each half of each site is scored against the matching half of
    the matrix, and the two lists of half-site energies are handed to pearsonr.

    Returns whatever pearsonr returns for those two lists.
    """
    width = len(motif[0])
    # Floor division keeps the split point an integer even when true division
    # is in effect (Python 3 or `from __future__ import division`), where
    # width / 2 would be a float and could not be used as a slice bound.
    half = width // 2
    mat = matrix_from_motif(motif)
    left_mat, right_mat = mat[:half], mat[half:]
    eps1 = [score_seq(left_mat, site[:half]) for site in motif]
    eps2 = [score_seq(right_mat, site[half:]) for site in motif]
    return pearsonr(eps1, eps2)
Пример #2
0
def restriction_of_range_motif_spoof_experiment(motifs):
    """Score every motif and its spoofed counterpart against the motif's matrix.

    For each motif a matrix is built with matrix_from_motif and used to score
    both the motif's own sites and the sites returned by
    spoof_psfm(motif, pc=0).

    Returns (all_eps, all_spoof_eps): two lists holding one list of scores per
    motif, in the order the motifs were supplied.
    """
    real_scores = []
    spoof_scores = []
    for motif in tqdm(motifs):
        mat = matrix_from_motif(motif)
        real_scores.append([score_seq(mat, site) for site in motif])
        spoofs = spoof_psfm(motif, pc=0)
        spoof_scores.append([score_seq(mat, site) for site in spoofs])
    return real_scores, spoof_scores
Пример #3
0
def predict_ic_from_theta(theta, L):
    """Compute importance-sampling log weights for a random matrix under theta.

    theta: tuple (sigma, mu, Ne).
    L: width handed to sample_matrix.

    A matrix is sampled, a tilt parameter lamb is found by bisection so that
    the mean energy under psfm_from_matrix(matrix, lamb) equals the desired
    energy, 100 sites are drawn from that PSFM, and the log target weights
    (log_ps) and log proposal probabilities (log_qs) are computed.

    NOTE(review): log_ps and log_qs are computed but never combined or
    returned, so the function yields None; it looks unfinished.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Aim for ep_star, but never closer than 1 to the minimum attainable energy.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Mean energy under the PSFM for this lamb, minus the desired energy.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([sum(ep*p for ep, p in zip(eps, ps))
                    for eps, ps in zip(matrix, psfm)]) - des_ep

    # Solve for lamb first and build the PSFM from it.  The earlier code read
    # `psfm` here although that name was only ever bound inside f, which
    # raised NameError.
    lamb = bisect_interval(f, -20, 20)
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [-nu*log(1+exp(score_seq(matrix, site) - mu)) for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
Пример #4
0
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject sampling over random_site(L) draws.

    matrix: energy matrix; its length gives the site width L.
    mu, Ne: parameters of phat(ep) = (1 / (1 + exp(ep - mu)))**(Ne - 1).
    N: number of sites to collect.

    Returns the list of accepted sites.

    NOTE(review): the acceptance test divides by `pmode`, which is never
    assigned in this function (only `dmode` is).  Unless a module-level
    `pmode` exists, the sampling loop raises NameError.  The tilted PSFM
    (`lamb`, `tilted_psfm`, `log_tilted_psfm`) is computed but not used by
    the loop either, so the function looks unfinished.
    """
    nu = Ne - 1
    L = len(matrix)
    # smallest / largest total energy attainable by any site
    ep_min, ep_max, L = sum(map(min, matrix)), sum(map(max,
                                                       matrix)), len(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    # phat(ep) times a zero-mean normal density, zeroed outside [ep_min, ep_max]
    density = lambda ep: (1 / (1 + exp(ep - mu)))**(Ne - 1) * dnorm(
        ep, 0, site_sigma) * (ep_min <= ep <= ep_max)
    # function whose root (found by bisection below) is taken as the mode
    d_density = lambda ep: ep / site_sigma**2 + nu / (1 + exp(mu - ep))
    phat = lambda ep: (1 / (1 + exp(ep - mu)))**(Ne - 1)
    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        mode = ep_min + 1  # don't want mode right on the nose of ep_min for sampling purposes, so offset it a bit
    dmode = density(mode)
    # calculate mean epsilon via rejection sampling
    motif = []

    def mean_ep(lamb):
        # expected energy of a site drawn from the PSFM tilted by lamb
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([
            ep * p for (mat_row, psfm_row) in zip(matrix, psfm)
            for (ep, p) in zip(mat_row, psfm_row)
        ])

    # tilt chosen so that the mean energy under the PSFM equals the mode
    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log, row) for row in tilted_psfm]
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        # NOTE(review): `pmode` is undefined here (see docstring)
        if random.random() < phat(ep) / pmode:
            motif.append(site)
    return motif
def experiment3(trials=10):
    """Plot geometric-mean occupancy against site-energy spread for two models.

    `trials` codes (sample_code, plotted with label 'apw') and `trials`
    matrices (sample_matrix, plotted with label 'linear') are sampled.  For
    each model the standard deviation of its score over 10000 random sites is
    computed, and an mh chain targeting 1/(1+exp(ep-mu))**(Ne-1) is run while
    recording the occupancy 1/(1+exp(ep-mu)) of each state; the first recorded
    value is dropped.  The points are drawn on a scatter plot with a
    logarithmic y axis.

    trials: number of codes and of matrices to sample.
    Returns None; the result is left in the current matplotlib figure.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites]) for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites]) for pssm in pssms]

    def occupancy(ep):
        return 1/(1+exp(ep-mu))

    def phat(ep):
        return 1/(1+exp(ep-mu))**(Ne-1)

    def geometric_mean(xs):
        # The logs are base 10, so they must be undone with 10**(...).  The
        # previous exp(mean(log10(x))) mixed bases and therefore was not the
        # geometric mean of the recorded occupancies.
        return 10**mean(map(log10, xs))

    def mean_fit(score_fn, model):
        # One chain per model; the captured states are occupancies.
        chain = mh(lambda s: phat(score_fn(model, s)),
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: occupancy(score_fn(model, s)))
        return geometric_mean(chain[1:])

    apw_mean_fits = [mean_fit(score, code) for code in tqdm(codes)]
    linear_mean_fits = [mean_fit(score_seq, pssm) for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def fitness(matrix, motif, G):
    """multiplicative fitness of occupancy over all sites

    Each site contributes exp(-ep) / Z, where ep is its score under matrix and
    Z is the sum of all exp(-ep) terms plus Zb_from_matrix(matrix, G).
    Returns the product of those per-site factors.
    """
    boltzmann_weights = [exp(-score_seq(matrix, site)) for site in motif]
    Zb = Zb_from_matrix(matrix, G)
    partition = sum(boltzmann_weights) + Zb
    return prod(weight/partition for weight in boltzmann_weights)
Пример #7
0
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject sampling over random_site(L) draws.

    matrix: energy matrix; its length gives the site width L.
    mu, Ne: parameters of phat(ep) = (1/(1+exp(ep-mu)))**(Ne-1).
    N: number of sites to collect.

    Returns the list of accepted sites.

    NOTE(review): the acceptance test divides by `pmode`, which is never
    assigned in this function (only `dmode` is).  Unless a module-level
    `pmode` exists, the sampling loop raises NameError.  The tilted PSFM
    (`lamb`, `tilted_psfm`, `log_tilted_psfm`) is computed but not used by
    the loop either, so the function looks unfinished.
    """
    nu = Ne - 1
    L = len(matrix)
    # smallest / largest total energy attainable by any site
    ep_min, ep_max, L = sum(map(min,matrix)), sum(map(max,matrix)), len(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    # phat(ep) times a zero-mean normal density, zeroed outside [ep_min, ep_max]
    density = lambda ep:(1/(1+exp(ep-mu)))**(Ne-1) * dnorm(ep,0,site_sigma)*(ep_min <= ep <= ep_max)
    # function whose root (found by bisection below) is taken as the mode
    d_density = lambda ep:ep/site_sigma**2 + nu/(1+exp(mu-ep))
    phat = lambda ep:(1/(1+exp(ep-mu)))**(Ne-1)
    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        mode = ep_min + 1 # don't want mode right on the nose of ep_min for sampling purposes, so offset it a bit
    dmode = density(mode)
    # calculate mean epsilon via rejection sampling
    motif = []
    def mean_ep(lamb):
        # expected energy of a site drawn from the PSFM tilted by lamb
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])
    # tilt chosen so that the mean energy under the PSFM equals the mode
    lamb = bisect_interval(lambda l:mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [map(log,row) for row in tilted_psfm]
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        # NOTE(review): `pmode` is undefined here (see docstring)
        if random.random() < phat(ep)/pmode:
            motif.append(site)    
    return motif
Пример #8
0
def restriction_of_range_loo_experiment(motif):
    """can energy of a given position be predicted from energy of remaining bases?"""
    L = len(motif[0])
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat,site) for site in motif]
    mean_ep = mean(eps)
    results = []
    for j in range(L):
        print j
        loo_mat = mat[:j] + mat[j+1:]
        for site in motif:
            loo_ep = score_seq(loo_mat,site[:j] + site[j+1:])
            pred_ep = mean_ep - loo_ep
            obs_ep = score_seq([mat[j]],[site[j]])
            results.append((pred_ep, obs_ep))
    return results
Пример #9
0
def experiment3(trials=10):
    """Plot geometric-mean occupancy against site-energy spread for two models.

    `trials` codes (sample_code, plotted with label 'apw') and `trials`
    matrices (sample_matrix, plotted with label 'linear') are sampled.  For
    each model the standard deviation of its score over 10000 random sites is
    computed, and an mh chain targeting 1 / (1 + exp(ep - mu))**(Ne - 1) is
    run while recording the occupancy 1 / (1 + exp(ep - mu)) of each state;
    the first recorded value is dropped.  The points are drawn on a scatter
    plot with a logarithmic y axis.

    trials: number of codes and of matrices to sample.
    Returns None; the result is left in the current matplotlib figure.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for i in range(trials)]
    pssms = [sample_matrix(L, sigma) for i in range(trials)]
    sites = [random_site(L) for i in xrange(10000)]
    apw_site_sigmas = [
        sd([score(code, site) for site in sites]) for code in codes
    ]
    linear_site_sigmas = [
        sd([score_seq(pssm, site) for site in sites]) for pssm in pssms
    ]

    def occupancy(ep):
        return 1 / (1 + exp(ep - mu))

    def phat(ep):
        return 1 / (1 + exp(ep - mu))**(Ne - 1)

    def geometric_mean(xs):
        # The logs are base 10, so they must be undone with 10**(...).  The
        # previous exp(mean(log10(x))) mixed bases and therefore was not the
        # geometric mean of the recorded occupancies.
        return 10**mean(map(log10, xs))

    def mean_fit(score_fn, model):
        # One chain per model; the captured states are occupancies.
        chain = mh(lambda s: phat(score_fn(model, s)),
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: occupancy(score_fn(model, s)))
        return geometric_mean(chain[1:])

    apw_mean_fits = [mean_fit(score, code) for code in tqdm(codes)]
    linear_mean_fits = [mean_fit(score_seq, pssm) for pssm in tqdm(pssms)]
    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas,
                linear_mean_fits,
                color='g',
                label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def cv_experiment(motifs, target='uniform'):
    """see if js_psfm outperforms ml_psfm in 10x cv"""
    all_mls, all_js = [], []
    for motif in motifs:
        ml_lls = []
        js_lls = []
        for train, test in cv(motif):
            ml_mat = mmap(log, psfm_from_motif(train))
            js_mat = mmap(log, js_psfm(train, target=target))
            ml_ll = mean(score_seq(ml_mat, site) for site in test)
            js_ll = mean(score_seq(js_mat, site) for site in test)
            ml_lls.append(ml_ll)
            js_lls.append(js_ll)
        avg_ml_ll, avg_js_ll = mean(ml_lls), mean(js_lls)
        all_mls.append(avg_ml_ll)
        all_js.append(avg_js_ll)
        print avg_ml_ll, avg_js_ll, avg_ml_ll < avg_js_ll
    return all_mls, all_js
Пример #11
0
def rejection_sample_site((matrix, mu, Ne)):
    """Compute acceptance ratios for sites proposed from the matrix's PSFM.

    The single argument is the tuple (matrix, mu, Ne).

    NOTE(review): `trials` is not defined in this function, so it must come
    from module scope or this raises NameError.  The acceptance ratios `ars`
    are computed but never used or returned; the function looks unfinished
    and currently returns None.
    """
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    # log proposal probability of a site under the PSFM
    log_psfm_prob = lambda site:score_seq(log_psfm, site)
    # NOTE(review): sums the raw per-row maxima of psfm (not their logs) -- confirm intent
    log_M = -sum(map(max,psfm))
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    ars = [exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)]
Пример #12
0
def select_sites_by_occupancy(matrix, mu, n):
    L = len(matrix)
    motif = []
    while len(motif) < n:
        site = random_site(L)
        if random.random() < 1 / (1 + exp(score_seq(matrix, site) - mu)):
            motif.append(site)
            print len(motif)
    return motif
Пример #13
0
def log_ZS_analytic((matrix, mu, Ne)):
    """compute log_Z analytically"""
    acc = 0
    nu = Ne - 1
    L = len(matrix)
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**(Ne - 1)
    return log(acc)
Пример #14
0
def log_ZS_analytic((matrix, mu, Ne)):
    """compute log_Z analytically"""
    acc = 0
    nu = Ne - 1
    L = len(matrix)
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1/(1+exp(ep-mu)))**(Ne-1)
    return log(acc)
Пример #15
0
def posterior_chain2(motif,
                     iterations=50000,
                     theta0=None,
                     sigma=1,
                     num_spoof_sites="N",
                     verbose=False,
                     integration='hack'):
    """do MH, estimating ratio of partition functions empirically"""
    L = len(motif[0])
    N = len(motif)
    if num_spoof_sites == "N":
        num_spoof_sites = N  # should this be N or 1?
    if theta0 is None:
        matrix0 = [[random.gauss(0, 1) for _ in range(4)] for i in range(L)]
        mu0 = -10
        Ne0 = 2
        theta = (matrix0, mu0, Ne0)
    else:
        theta = theta0
    log_f_theta = log_fhat(theta, motif)
    #log_Z = log_ZM_gaussian(theta, N, integration=integration)
    log_Z = log_ZM_sophisticated(theta, N)
    chain = []
    acceptances = 0

    def log_prior((matrix, mu, Ne)):
        log_matrix_prior = sum(
            [log(dnorm(ep, 0, 1)) for row in matrix for ep in row])
        log_mu_prior = log(dnorm(mu, 0, 10))
        log_Ne_prior = log(exp(-Ne))
        return log_matrix_prior + log_mu_prior + log_Ne_prior

    for it in trange(iterations):
        #print "Ne:", theta[2]
        theta_p = prop2(theta, sigma)
        log_f_theta_p = log_fhat(theta_p, motif)
        matrix_p, mu_p, Ne_p = theta_p
        #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad')
        #log_Z_p = log_ZM_gaussian(theta_p, N, trials=100, integration='hack')
        #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad')
        log_Z_p = log_ZM_sophisticated(theta_p, N)
        #log_Z_p = log_ZM_importance(theta_p, N, trials=100)
        log_ar = log_f_theta_p - log_f_theta + (
            log_Z - log_Z_p) + log_prior(theta_p) - log_prior(theta)
        if log(random.random()) < log_ar:
            theta = theta_p
            log_f_theta = log_f_theta_p
            log_Z = log_Z_p
            acceptances += 1
        chain.append(theta)
        if verbose:
            print "log(f), log_Z:", log_f_theta, log_Z
            print "mean_ep:", mean(score_seq(theta[0], site) for site in motif)
            print "mean_occ:", mean(occs(theta, motif))
            print "mu, Ne:", theta[1], theta[2]
    print "acceptances:", acceptances / float(it + 1)
    return chain
def log_fitness(matrix, motif, G):
    """log of the multiplicative fitness of occupancy over all sites

    Returns -sum(eps) - n*log(Z), where eps are the site scores, n is the
    number of sites and Z is the sum of exp(-ep) over the sites plus
    Zb_from_matrix(matrix, G).
    """
    eps = [score_seq(matrix, site) for site in motif]
    foreground = sum([exp(-ep) for ep in eps])
    background = Zb_from_matrix(matrix, G)
    return -sum(eps) - len(motif)*log(foreground + background)
Пример #17
0
def rejection_sample_site((matrix, mu, Ne)):
    """Compute acceptance ratios for sites proposed from the matrix's PSFM.

    The single argument is the tuple (matrix, mu, Ne).

    NOTE(review): `trials` is not defined in this function, so it must come
    from module scope or this raises NameError.  The acceptance ratios `ars`
    are computed but never used or returned; the function looks unfinished
    and currently returns None.
    """
    psfm = psfm_from_matrix(matrix)
    log_psfm = [[log(p) for p in row] for row in psfm]
    # log proposal probability of a site under the PSFM
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    # NOTE(review): sums the raw per-row maxima of psfm (not their logs) -- confirm intent
    log_M = -sum(map(max, psfm))
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    log_fs = [log_fhat((matrix, mu, Ne), [site]) for site in sites]
    log_qs = [log_psfm_prob(site) for site in sites]
    ars = [
        exp(log_f - (log_q + log_M)) for log_f, log_q in zip(log_fs, log_qs)
    ]
def ror_experiment():
    L = 10
    n = 100
    sigmas = np.linspace(0.1,10,10)
    alphas = np.linspace(0,1,10)
    for sigma in sigmas:
        for alpha in alphas:
            theta = - alpha * sigma * L
            matrix = sample_matrix(L,sigma)
            sampler = lambda : sample_motif_neglect_fg(matrix,1,Ne=2)[0]
            motif = sample_until(lambda site:score_seq(matrix,site) < theta,sampler,n)
            print sigma, alpha, total_motif_mi(motif)
Пример #19
0
def predict_ic_from_theta(theta, L):
    """Compute importance-sampling log weights for a random matrix under theta.

    theta: tuple (sigma, mu, Ne).
    L: width handed to sample_matrix.

    A matrix is sampled, a tilt parameter lamb is found by bisection so that
    the mean energy under psfm_from_matrix(matrix, lamb) equals the desired
    energy, 100 sites are drawn from that PSFM, and the log target weights
    (log_ps) and log proposal probabilities (log_qs) are computed.

    NOTE(review): log_ps and log_qs are computed but never combined or
    returned, so the function yields None; it looks unfinished.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    # Aim for ep_star, but never closer than 1 to the minimum attainable energy.
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        # Mean energy under the PSFM for this lamb, minus the desired energy.
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, psfm)
        ]) - des_ep

    # Solve for lamb first and build the PSFM from it.  The earlier code read
    # `psfm` here although that name was only ever bound inside f, which
    # raised NameError.
    lamb = bisect_interval(f, -20, 20)
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [
        -nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites
    ]
    log_qs = [score_seq(log_psfm, site) for site in sites]
Пример #20
0
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Sample one site with Metropolis-Hastings using a tilted-PSFM proposal.

    matrix, mu, Ne: define the log target -(Ne-1)*log(1+exp(score-mu)).
    lamb: tilt passed to psfm_from_matrix to build the proposal PSFM.
    iterations: chain length; defaults to 10 times the matrix length.

    Proposals are drawn from the tilted PSFM regardless of the current state,
    and their log density is the site's score under the log of that PSFM.

    Returns the last element of the chain produced by mh.
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10*L

    def log_phat(site):
        return -nu*log(1+exp(score_seq(matrix,site)-mu))

    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    # Explicit lists (rather than map) so the rows can be indexed repeatedly.
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    def prop(_):
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # `iterations` used to be computed and then dropped, so mh ran with its
    # own default chain length; forward it as the signature promises.
    return mh(log_phat, proposal=prop, dprop=log_dprop, x0=prop(None),
              use_log=True, iterations=iterations)[-1]
def log_fitness_approx3(matrix,motif,G):
    n = len(motif)
    eps = [score_seq(matrix,site) for site in motif]
    fgs = [exp(-ep) for ep in eps]
    Zf = sum(fgs)
    Zb = Zb_from_matrix(matrix,G)
    Z = Zf + Zb
    print Zf,Zb,Zf/Zb
    good_approximation = -sum(eps) - n*(log(Zf))
    Zf_hat = mean(fgs)
    Zf_resids = [fg - Zf_hat for fg in fgs]
    worse_approximation = -sum(eps) - n*(log(n) + log(Zf_hat))
    print good_approximation, worse_approximation
    return good_approximation
Пример #22
0
def site_mh(matrix, mu, Ne, iterations=50000):
    """Run a Metropolis-Hastings chain over sites with single-step mutations.

    The log target of a site is log_Pe of its score, given the site mean and
    sigma derived from matrix together with mu and Ne.  The chain starts at a
    random site and proposes with mutate_site.

    Returns whatever mh returns for `iterations` steps.
    """
    site_mu = site_mu_from_matrix(matrix)
    site_sigma = site_sigma_from_matrix(matrix)
    L = len(matrix)

    def log_f(site):
        return log_Pe(score_seq(matrix, site), site_mu, site_sigma, mu, Ne)

    def prop(site):
        return mutate_site(site)

    return mh(log_f, prop, x0=random_site(L), use_log=True,
              iterations=iterations)
Пример #23
0
def posterior_chain2(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites="N", verbose=False,
                     integration='hack'):
    """do MH, estimating ratio of partition functions empirically"""
    L = len(motif[0])
    N = len(motif)
    if num_spoof_sites == "N":
        num_spoof_sites = N  # should this be N or 1?
    if theta0 is None:
        matrix0 = [[random.gauss(0,1) for _ in range(4)] for i in range(L)]
        mu0 = -10
        Ne0 = 2
        theta = (matrix0, mu0, Ne0)
    else:
        theta = theta0
    log_f_theta = log_fhat(theta, motif)
    #log_Z = log_ZM_gaussian(theta, N, integration=integration)
    log_Z = log_ZM_sophisticated(theta, N)
    chain = []
    acceptances = 0
    def log_prior((matrix, mu, Ne)):
        log_matrix_prior = sum([log(dnorm(ep,0,1)) for row in matrix for ep in row])
        log_mu_prior = log(dnorm(mu,0,10))
        log_Ne_prior = log(exp(-Ne))
        return log_matrix_prior + log_mu_prior + log_Ne_prior
        
    for it in trange(iterations):
        #print "Ne:", theta[2]
        theta_p = prop2(theta, sigma)
        log_f_theta_p = log_fhat(theta_p, motif)
        matrix_p, mu_p, Ne_p = theta_p
        #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad')
        #log_Z_p = log_ZM_gaussian(theta_p, N, trials=100, integration='hack')
        #log_Z = log_ZM_gaussian(theta, N, trials=100, integration='quad')
        log_Z_p = log_ZM_sophisticated(theta_p, N)
        #log_Z_p = log_ZM_importance(theta_p, N, trials=100)
        log_ar = log_f_theta_p - log_f_theta  + (log_Z - log_Z_p) + log_prior(theta_p) - log_prior(theta)
        if log(random.random()) < log_ar:
            theta = theta_p
            log_f_theta = log_f_theta_p
            log_Z = log_Z_p
            acceptances += 1
        chain.append(theta)
        if verbose:
            print "log(f), log_Z:", log_f_theta, log_Z
            print "mean_ep:", mean(score_seq(theta[0],site) for site in motif)
            print "mean_occ:", mean(occs(theta, motif))
            print "mu, Ne:", theta[1], theta[2]
    print "acceptances:", acceptances/float(it+1)
    return chain
def log_fitness_approx(matrix,motif,G,terms=2):
    n = len(motif)
    eps = [score_seq(matrix,site) for site in motif]
    fgs = [exp(-ep) for ep in eps]
    Zf = sum(fgs)
    Zb = Zb_from_matrix(matrix,G)
    Z = Zf + Zb
    zeroth_term = log(n+Zb) * (terms >= 0)
    first_term = (-1/(n+Zb)*sum(eps)) * (terms >= 1)
    second_term = 1/2.0*1/(n+Zb)**2*((n + Zb - 1)*sum(ep**2 for ep in eps) -
                                     sum(epi*epj for epi,epj in choose2(eps))) * (terms >= 2)
    print zeroth_term,first_term,second_term
    # first_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)))
    # second_order = -sum(eps) - n*(log(n+Zb) + (-1/(n+Zb)*sum(eps)) + 1/2.0*1/(n+Zb)**2*((n)))
    return -sum(eps) - n*(zeroth_term + first_term + second_term)
Пример #25
0
def interpret_chain(chain, motif, filename=None):
    """Plot diagnostic traces for a chain of (matrix, mu, Ne) states.

    Every series is passed through logmod before plotting: mean site energy,
    mu, Ne, log_fhat, log_ZM_hack, their difference, and mean occupancy.
    The figure is then handed to maybesave(filename).
    """
    N = len(motif)
    log_fhats = [log_fhat(theta,motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta,N) for theta in chain]
    log_ps = [lf - lz for (lf, lz) in zip(log_fhats, log_Zs)]

    def trace(values, label):
        plt.plot(map(logmod, values), label=label)

    trace([mean(score_seq(x[0],site) for site in motif) for x in chain],
          "Mean Site Energy (kBT)")
    trace([x[1] for x in chain], "$\mu$ (kBT)")
    trace([x[2] for x in chain], "$Ne$")
    trace(log_fhats, "log fhat")
    trace(log_Zs, "log_ZM")
    trace(log_ps, "log p")
    trace([mean(occs(x, motif)) for x in chain], "Mean Occupancy")
    plt.legend(loc='right',fontsize='large')
    plt.xlabel("Iteration",fontsize='large')
    maybesave(filename)
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10*N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    epsilon = (1+double_sigma)*sigma # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1/(1+exp(ep-mu)) for ep in eps])
    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ
    Ne = log_regress_spec2(f,[1,10],tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
Пример #27
0
def sample_uniform_energy(matrix):
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x:variance(x,correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    M_min = 1/norm.pdf(ep_min, mu, sigma)
    M_max = 1/norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print trials
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1/(M*norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
Пример #28
0
def sample_uniform_energy(matrix):
    mu = sum(map(mean, matrix))
    sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix)))
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    M_min = 1 / norm.pdf(ep_min, mu, sigma)
    M_max = 1 / norm.pdf(ep_max, mu, sigma)
    M = max(M_min, M_max)
    trials = 0
    while True:
        trials += 1
        if trials % 10000 == 0:
            print trials
        site = random_site(L)
        ep = score_seq(matrix, site)
        ar = 1 / (M * norm.pdf(ep, mu, sigma))
        if random.random() < ar:
            return site
Пример #29
0
def posterior_chain(motif,
                    iterations=50000,
                    theta0=None,
                    sigma=1,
                    num_spoof_sites='N',
                    verbose=False):
    """do MH with doubly intractable MCMC one-point estimator"""
    L = len(motif[0])
    N = len(motif)
    if num_spoof_sites == 'N':
        num_spoof_sites = N  # should this be N or 1?
    if theta0 is None:
        matrix0 = [[0, 0, 0, 0] for i in range(L)]
        mu0 = -10
        Ne0 = 3
        theta = (matrix0, mu0, Ne0)
    else:
        theta = theta0
    log_f_theta = log_fhat(theta, motif)
    chain = []
    acceptances = 0
    for it in trange(iterations):
        theta_p = prop2(theta, sigma)
        log_f_theta_p = log_fhat(theta_p, motif)
        matrix_p, mu_p, Ne_p = theta_p
        xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites)
        log_Z = log_fhat(theta, xp)
        log_Z_p = log_fhat(theta_p, xp)
        log_ar = log_f_theta_p - log_f_theta + N / num_spoof_sites * (log_Z -
                                                                      log_Z_p)
        if log(random.random()) < log_ar:
            theta = theta_p
            log_f_theta = log_f_theta_p
            log_Z = log_Z_p
            acceptances += 1
        chain.append(theta)
        if verbose:
            print "log(f), log_Z:", log_f_theta, log_Z
            print "mean_ep:", mean(score_seq(theta[0], site) for site in motif)
            print "mean_occ:", mean(occs(theta, motif))
            print "mu, Ne:", theta[1], theta[2]
    print "acceptances:", acceptances / float(it + 1)
    return chain
Пример #30
0
def interpret_chain(chain, motif, filename=None):
    """Plot diagnostic traces for a chain of (matrix, mu, Ne) states.

    Every series is passed through logmod before plotting: mean site energy,
    mu, Ne, log_fhat, log_ZM_hack, their difference, and mean occupancy.
    The figure is then handed to maybesave(filename).
    """
    N = len(motif)
    log_fhats = [log_fhat(theta, motif) for theta in chain]
    log_Zs = [log_ZM_hack(theta, N) for theta in chain]
    log_ps = [lf - lz for (lf, lz) in zip(log_fhats, log_Zs)]

    def trace(values, label):
        plt.plot(map(logmod, values), label=label)

    trace([mean(score_seq(x[0], site) for site in motif) for x in chain],
          "Mean Site Energy (kBT)")
    trace([x[1] for x in chain], "$\mu$ (kBT)")
    trace([x[2] for x in chain], "$Ne$")
    trace(log_fhats, "log fhat")
    trace(log_Zs, "log_ZM")
    trace(log_ps, "log p")
    trace([mean(occs(x, motif)) for x in chain], "Mean Occupancy")
    plt.legend(loc='right', fontsize='large')
    plt.xlabel("Iteration", fontsize='large')
    maybesave(filename)
Пример #31
0
def sample_site_imh(matrix, mu, Ne, lamb, iterations=None):
    """Sample one site with Metropolis-Hastings using a tilted-PSFM proposal.

    matrix, mu, Ne: define the log target -(Ne - 1) * log(1 + exp(score - mu)).
    lamb: tilt passed to psfm_from_matrix to build the proposal PSFM.
    iterations: chain length; defaults to 10 times the matrix length.

    Proposals are drawn from the tilted PSFM regardless of the current state,
    and their log density is the site's score under the log of that PSFM.

    Returns the last element of the chain produced by mh.
    """
    nu = Ne - 1
    L = len(matrix)
    if iterations is None:
        iterations = 10 * L

    def log_phat(site):
        return -nu * log(1 + exp(score_seq(matrix, site) - mu))

    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    # Explicit lists (rather than map) so the rows can be indexed repeatedly.
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    def prop(_):
        return sample_from_psfm(tilted_psfm)

    def log_dprop(xp, _):
        return score_seq(log_tilted_psfm, xp)

    # `iterations` used to be computed and then dropped, so mh ran with its
    # own default chain length; forward it as the signature promises.
    return mh(log_phat,
              proposal=prop,
              dprop=log_dprop,
              x0=prop(None),
              use_log=True,
              iterations=iterations)[-1]
def make_clusters_with_k(motif, k):
    print "k:", k
    L = len(motif[0])
    N = float(len(motif))
    clusters = [[] for i in range(k)]
    print "len clusters:", len(clusters)
    for site in motif:
        i = random.randrange(k)
        clusters[i].append(site)
    print "finished initializing"
    pssms = [
        mmap(log, psfm_from_motif_(cluster, L, pc=1)) for cluster in clusters
    ]
    alphas = [len(cluster) / N for cluster in clusters]

    def log_likelihood():
        return sum(
            log(
                sum(alpha * exp(score_seq(pssm, site))
                    for alpha, pssm in zip(alphas, pssms))) for site in motif)

    last_ll = 0
    done_yet = False
    #for i in range(iterations):
    while not done_yet:
        cur_ll = log_likelihood()
        print "log likelihood:", cur_ll
        if last_ll == cur_ll:
            done_yet = True
            break
        else:
            last_ll = cur_ll
        clusters = [[] for i in range(k)]
        for site in motif:
            i = argmax([score_seq(pssm, site) for pssm in pssms])
            clusters[i].append(site)
        pssms = [
            mmap(log, psfm_from_motif_(cluster, L, pc=1))
            for cluster in clusters
        ]
    return clusters, log_likelihood()
Пример #33
0
def posterior_chain(motif, iterations=50000, theta0=None, sigma=1, num_spoof_sites='N', verbose=False):
    """do MH with doubly intractable MCMC one-point estimator"""
    L = len(motif[0])
    N = len(motif)
    if num_spoof_sites == 'N':
        num_spoof_sites = N  # should this be N or 1?
    if theta0 is None:
        matrix0 = [[0,0,0,0] for i in range(L)]
        mu0 = -10
        Ne0 = 3
        theta = (matrix0, mu0, Ne0)
    else:
        theta = theta0
    log_f_theta = log_fhat(theta, motif)
    chain = []
    acceptances = 0
    for it in trange(iterations):
        theta_p = prop2(theta, sigma)
        log_f_theta_p = log_fhat(theta_p, motif)
        matrix_p, mu_p, Ne_p = theta_p
        xp = sample_motif_cftp(matrix_p, mu_p, Ne_p, num_spoof_sites)
        log_Z = log_fhat(theta, xp)
        log_Z_p = log_fhat(theta_p, xp)
        log_ar = log_f_theta_p - log_f_theta + N/num_spoof_sites * (log_Z - log_Z_p)
        if log(random.random()) < log_ar:
            theta = theta_p
            log_f_theta = log_f_theta_p
            log_Z = log_Z_p
            acceptances += 1
        chain.append(theta)
        if verbose:
            print "log(f), log_Z:", log_f_theta, log_Z
            print "mean_ep:", mean(score_seq(theta[0],site) for site in motif)
            print "mean_occ:", mean(occs(theta, motif))
            print "mu, Ne:", theta[1], theta[2]
    print "acceptances:", acceptances/float(it+1)
    return chain
def spoof_motifs_occ(motif,
                     num_motifs=10,
                     trials=1,
                     sigma=None,
                     Ne_tol=10**-4,
                     double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10 * N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1 / (1 + exp(ep - mu)) for ep in eps])

    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ

    Ne = log_regress_spec2(f, [1, 10], tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
Пример #35
0
from pwm_utils import sigma_from_matrix
from math import log, exp, sqrt
import random
from evo_sampling import sample_motif_cftp
from tqdm import *
from matplotlib import pyplot as plt
from scipy.stats import norm
import numpy as np
from scipy import integrate
from formosa import spoof_maxent_motifs
from adjacent_pairwise_model import code_from_motif, sample_site

def log_fhat((matrix, mu, Ne), motif):
    assert type(motif) is list
    nu = Ne - 1
    eps = [score_seq(matrix, site) for site in motif]
    return -sum(nu*log(1+exp(ep-mu)) for ep in eps)

def log_ZS_analytic((matrix, mu, Ne)):
    """compute log_Z analytically"""
    acc = 0
    nu = Ne - 1
    L = len(matrix)
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1/(1+exp(ep-mu)))**(Ne-1)
    return log(acc)

def log_ZM_analytic((matrix, mu, Ne), N):
    log_ZS = log_ZS_analytic((matrix, mu, Ne))
    return N * log_ZS
 def linear_occ(pssm, site):
     """Return 1/(1+exp(ep-mu)) for the site's score ep under pssm; mu comes from the enclosing scope."""
     return 1/(1+exp(score_seq(pssm, site)-mu))
def fitness_additive(matrix, motif, G):
    """Return the summed foreground weight as a fraction of foreground plus background.

    The foreground is the sum of exp(-ep) over the motif's site scores; the
    background is Zb_from_matrix(matrix, G).
    """
    energies = [score_seq(matrix, site) for site in motif]
    foreground = sum(exp(-ep) for ep in energies)
    background = Zb_from_matrix(matrix, G)
    return foreground/(foreground + background)
Пример #38
0
def eps_from_theta(theta, L, N=100):
    """Sample a matrix and a motif under theta and return the site energies.

    theta: tuple (sigma, mu, Ne).
    L: width handed to sample_matrix.
    N: number of sites drawn with sample_motif_cftp.

    Returns the list of score_seq energies of the sampled sites.
    """
    # theta was accepted but never unpacked, leaving sigma, mu and Ne to be
    # looked up as (undefined) globals.
    sigma, mu, Ne = theta
    matrix = sample_matrix(L, sigma)
    motif = sample_motif_cftp(matrix, mu, Ne, N)
    return [score_seq(matrix, site) for site in motif]
Пример #39
0
 def log_dprop(xp, _):
     """Score of xp under log_tilted_psfm (from the enclosing scope); the second argument is ignored."""
     proposal_score = score_seq(log_tilted_psfm, xp)
     return proposal_score
def Zb_from_matrix_ref(matrix, G):
    """Sum exp(-ep) over the scores of G random sites of width len(matrix)."""
    width = len(matrix)
    energies = np.array(
        [score_seq(matrix, random_site(width)) for i in trange(G)])
    weights = np.exp(-energies)
    return np.sum(weights)
 def phat(site):
     """Return 1/(1+exp(ep-mu))**(Ne-1) for the site's score ep; matrix, mu and Ne come from the enclosing scope."""
     # `matix` was a misspelling of `matrix` and raised NameError when called.
     ep = score_seq(matrix, site)
     return 1/(1+exp(ep-mu))**(Ne-1)
Пример #42
0

def random_genotype(n, L, linear_sigma, pairwise_sigma, copies):
    """Build a random genotype: a motif, a copy number and recognizer weights.

    Returns (motif, copies, (pwm, pairwise_weights)), where motif comes from
    random_motif(L, n), pwm from sample_matrix(L, linear_sigma), and
    pairwise_weights holds L - 1 4x4 tables of zero-mean Gaussian draws with
    standard deviation pairwise_sigma.
    """
    motif = random_motif(L, n)
    pwm = sample_matrix(L, linear_sigma)
    pairwise_weights = []
    for _ in range(L - 1):
        table = [[random.gauss(0, pairwise_sigma) for _col in range(4)]
                 for _row in range(4)]
        pairwise_weights.append(table)
    recognizer = (pwm, pairwise_weights)
    return motif, copies, recognizer


def btoi(b):
    """Return the position of b within "ACGT"; raises ValueError when absent."""
    alphabet = "ACGT"
    position = alphabet.index(b)
    return position


def energy_score((pwm, pairwise_weights), seq):
    linear_score = score_seq(pwm, seq)
    pairwise_score = sum(weight[btoi(b1)][btoi(b2)]
                         for weight, (b1,
                                      b2) in zip(pairwise_weights, pairs(seq)))
    return linear_score + pairwise_score


def compute_Zb(G, (linear_weights, pairwise_weights)):
    pure_pairwise_weights = [[
        [pw[i][j] + lwi[i] + lwj[j] for j in range(4)] for i in range(4)
    ] for pw, (lwi, lwj) in zip(pairwise_weights, pairs(linear_weights))]
    Ws = [
        np.matrix([[exp(w[btoi(b1)][btoi(b2)]) for b2 in "ACGT"]
                   for b1 in "ACGT"]) for w in pure_pairwise_weights
    ]
    return np.array([1, 1, 1, 1]).dot(reduce(lambda x, y: x.dot(y),
 def phat(s):
     """Return (1 + exp(ep - mu))**(-nu) for a site of length L; matrix, mu, nu and L come from the enclosing scope."""
     assert len(s) == L
     energy = score_seq(matrix,s)
     base = 1 + exp(energy - mu)
     return base**(-nu)
Пример #44
0
 def linear_phat(site):
     """Return 1 / (1 + exp(ep - mu))**(Ne - 1) for the site's score under pssm; pssm, mu and Ne come from the enclosing scope."""
     return 1 / (1 + exp(score_seq(pssm, site) - mu))**(Ne - 1)
 def log_phat(s):
     """Return -(Ne - 1)*log(1 + exp(ep - mu)) for the site's score ep; matrix, mu and Ne come from the enclosing scope."""
     energy = score_seq(matrix,s)
     exponent = Ne - 1
     return -exponent*log(1 + exp(energy - mu))
 def log_fit(site):
     """Return -nu*log(1+exp(ep-mu)) for the site's score ep; matrix, mu and nu come from the enclosing scope."""
     energy = score_seq(matrix,site)
     return -nu*log(1+exp(energy-mu))
Пример #47
0
 def f(site):
     """Apply phat to the site's score under matrix; both names come from the enclosing scope."""
     return phat(score_seq(matrix, site))
 def linear_phat(pssm, site):
     """Return 1/(1+exp(ep-mu))**(Ne-1) for the site's score ep under pssm; mu and Ne come from the enclosing scope."""
     return 1/(1+exp(score_seq(pssm, site)-mu))**(Ne-1)
Пример #49
0
def fitness(site, matrix, mu, Ne):
    """Return (1/(1+exp(ep-mu)))**(Ne-1), where ep is the site's score under matrix."""
    energy = score_seq(matrix, site)
    occupancy = 1/(1+exp(energy-mu))
    return occupancy**(Ne-1)
def log_fitness_approx2(matrix,motif,G):
    """approximate log fitness using only the background term Zb in the partition function

    Returns -sum(eps) - n*log(Zb), where eps are the site scores, n is the
    number of sites and Zb = Zb_from_matrix(matrix, G).  Compared with
    -sum(eps) - n*log(Zf + Zb), the contribution Zf of the functional sites
    themselves is the part left out (the earlier docstring named Zb as the
    neglected term, which is the opposite of what the code does).
    """
    # `n` was never defined in this function.
    n = len(motif)
    eps = [score_seq(matrix,site) for site in motif]
    Zb = Zb_from_matrix(matrix,G)
    return -sum(eps) - n*log(Zb)
def sample_Zb_terms(L, sigma, trials=10000):
    """Score `trials` random sites of width L against one freshly sampled matrix.

    Returns the list of scores.
    """
    matrix = sample_matrix(L, sigma)
    scores = []
    for _ in xrange(trials):
        scores.append(score_seq(matrix, random_site(L)))
    return scores
 def phat(s):
     """Return (1 + exp(ep - mu))**(-nu) for the site's score ep; matrix, mu and nu come from the enclosing scope."""
     energy = score_seq(matrix,s)
     base = 1 + exp(energy - mu)
     return base**(-nu)
 def log_f(site):
     """Return -nu*log(1+exp(ep-mu)) for the site's score ep; matrix, mu and nu come from the enclosing scope."""
     energy = score_seq(matrix, site)
     return -nu*log(1+exp(energy-mu))