def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept-reject sampling against the selection term.

    Candidates are drawn with ``random_site(L)`` and scored with
    ``score_seq(matrix, site)``.  A candidate with score ``ep`` is kept with
    probability ``phat(ep) / phat(mode)``, where
    ``phat(ep) = (1 / (1 + exp(ep - mu))) ** (Ne - 1)`` and ``mode`` is the
    root of ``d_density`` found by bisection on [-100, 100].

    Args:
        matrix: sequence of rows of scores; ``len(matrix)`` is used as the
            site length and the row minima are summed to give ``ep_min``.
        mu: offset inside the logistic term of ``phat``.
        Ne: ``Ne - 1`` is the exponent applied to the logistic term.
        N: number of accepted sites to return.

    Returns:
        A list of N accepted sites (whatever ``random_site`` produces).
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**nu

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    # Normaliser for the acceptance ratio; the loop below used this name
    # without it ever being assigned.
    # NOTE(review): for nu > 0 phat decreases in ep, so candidates scoring
    # below `mode` get a ratio above 1 and are always accepted -- confirm
    # phat(mode) is the intended envelope.
    pmode = phat(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    # NOTE(review): the tilted PSFM is computed here but the sampling loop
    # still proposes from random_site(L); kept because the helper calls may
    # be relied upon -- confirm whether proposals should use tilted_psfm.
    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / pmode:
            motif.append(site)
    return motif
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept-reject sampling against the selection term.

    Candidates come from ``random_site(L)`` and are scored with
    ``score_seq(matrix, site)``.  A candidate scoring ``ep`` is accepted with
    probability ``phat(ep) / phat(mode)``, with
    ``phat(ep) = (1 / (1 + exp(ep - mu))) ** (Ne - 1)`` and ``mode`` the root
    of ``d_density`` located by bisection on [-100, 100].

    Args:
        matrix: sequence of rows of scores; its length is the site length and
            the sum of row minima gives ``ep_min``.
        mu: offset inside the logistic term of ``phat``.
        Ne: ``Ne - 1`` is the exponent on the logistic term.
        N: number of accepted sites to return.

    Returns:
        A list of N accepted sites (values returned by ``random_site``).
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**nu

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling purposes,
        # so offset it a bit
        mode = ep_min + 1
    # The acceptance test below divided by `pmode`, which was never assigned
    # (NameError on the first candidate); define it from phat at the mode.
    # NOTE(review): when nu > 0, phat(ep) > phat(mode) for ep < mode, so such
    # candidates are always accepted -- confirm this envelope is intended.
    pmode = phat(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([
            ep * p
            for (mat_row, psfm_row) in zip(matrix, psfm)
            for (ep, p) in zip(mat_row, psfm_row)
        ])

    # NOTE(review): tilted_psfm / log_tilted_psfm are not consumed by the
    # loop below, which proposes from random_site(L) -- confirm intent.
    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / pmode:
            motif.append(site)
    return motif
def main(G=5000000,iterations=50000,init_matrix=None,init_mu=None,verbose=True): """Test case for FD-inference""" print "generating genome" genome = random_site(G) print "generating eps" eps = score_genome_np(TRUE_ENERGY_MATRIX,genome) min_mu,max_mu = -40,0 mu = bisect_interval(lambda mu:np.sum(fd_solve_np(eps,mu))-q,min_mu,max_mu,verbose=True,tolerance=1e-1) print "computing ps" true_ps = fd_solve_np(eps,mu) print "true q:",np.sum(true_ps) print "generating chip dataset" mapped_reads = np.array(map_reads_np(chip_ps_np(true_ps,MEAN_FRAGMENT_LENGTH,NUM_CELLS_ORIGINAL),G)) print "finished chip dataset" if init_matrix is None: init_matrix = random_energy_matrix(w) if init_mu is None: init_mu = -20#random.random()*40 - 20 init_scores = score_genome_np(init_matrix,genome) init_state = ((init_matrix,init_mu),init_scores) logf = lambda state:complete_log_likelihood(state,mapped_reads) print "true mu:",mu print "true log_likelihood:",logf(((TRUE_ENERGY_MATRIX,mu),eps)) rprop = lambda state:complete_rprop(state,genome) print "hitting mh loop" matrix_chain = mh(logf,proposal=rprop,x0=init_state,dprop=log_dprop,capture_state=capture_state,verbose=verbose,use_log=True,iterations=iterations,modulus=100) return matrix_chain,genome,mapped_reads
def bisect_interval_noisy(f, epsilon=0.01, sigma=None, debug=False): """find zero of stochastic function f using linear regression""" print "in bisect" xmin = 1 xmax = 2 xs = [xmin, xmax] print xmin, xmax print f(1) ys = map(f, xs) print ys print "ys[-1]:", ys[-1] while ys[-1] < 0: xmax += 1 xs.append(xmax) y = f(xmax) ys.append(y) xs2 = [x + xs[-1] for x in xs] ys2 = map(f, xs2) xs = xs + xs2 ys = ys + ys2 #xs = list(np.linspace(lb,ub,10)) #ys = map(f,xs) print "xs,ys:", xs, ys i = 1 while sd(xs[-3:]) > epsilon: print "starting round", i i += 1 ### select xp # m = (y2-y1)/float(x2-x1) # xp = -y1/m + x1 # yp = f(xp) if sigma is None: print "interpolating on:", xs, ys r = kde_interpolate(xs, ys, sigma=sd(xs) / 3.0) else: r = kde_interpolate(xs, ys, sigma=sigma) try: xp = bisect_interval(r, min(xs), max(xs)) print "selected xp:", xp except: "secant regression failed!" Exception() if debug: plt.scatter(xs, ys) plt.plot(*pl(r, np.linspace(min(xs), max(xs), 1000))) plt.plot([xp, xp], [-10, 10]) plt.plot([min(xs), max(xs)], [0, 0]) plt.show() yp = f(xp) ### end select xp print "xp,yp:", xp, yp xs.append(xp) ys.append(yp) #js = sorted_indices(xs) #xs = rslice(xs,js) #ys = rslice(ys,js) #assert xs == sorted(xs) return xp, (xs, ys)
def predict_ic(matrix, mu, Ne, N=100): nu = Ne - 1 ep_min, ep_max, L = sum(map(min, matrix)), sum(map(max, matrix)), len(matrix) site_sigma = site_sigma_from_matrix(matrix) density = lambda ep: (1 / (1 + exp(ep - mu)))**(Ne - 1) * dnorm( ep, 0, site_sigma) * (ep_min <= ep <= ep_max) d_density = lambda ep: ep / site_sigma**2 + nu / (1 + exp(mu - ep)) mode = bisect_interval(d_density, -100, 100) if mode < ep_min: mode = ep_min dmode = density(mode) # calculate mean epsilon via rejection sampling eps = [] while len(eps) < N: ep = random.random() * (ep_max - ep_min) + ep_min if random.random() < density(ep) / dmode: eps.append(ep) #return eps des_mean_ep = mean(eps) des_mean_ep_analytic = integrate.quad(lambda ep: ep * density(ep), ep_min, ep_max) # print "des_means:", des_mean_ep, des_mean_ep_analytic # print "min ep: %s max_ep: %s des_mean_ep: %s" % (ep_min, ep_max, des_mean_ep) def mean_ep(lamb): try: psfm = psfm_from_matrix(matrix, lamb=lamb) return sum([ ep * p for (mat_row, psfm_row) in zip(matrix, psfm) for (ep, p) in zip(mat_row, psfm_row) ]) except: print matrix, lamb raise Exception try: lamb = bisect_interval(lambda l: mean_ep(l) - des_mean_ep, -20, 20) except: print matrix, mu, Ne raise Exception tilted_psfm = psfm_from_matrix(matrix, lamb) return sum([2 - h(col) for col in tilted_psfm])
def spoof_pmotifs(motif, num_motifs=10, trials=1):
    """Generate ``num_motifs`` motifs from ``pmotif`` tuned to match motif's IC.

    The parameter ``p`` is chosen by evaluating the (negated) IC mismatch of
    ``pmotif(n, L, p)`` on 100 points in [0, 0.75], smoothing those values
    with ``kde_regress`` and bisecting the smoothed curve.

    Args:
        motif: sequence of sites; ``len(motif)`` and ``len(motif[0])`` give
            the dimensions passed to ``pmotif``.
        num_motifs: number of motifs to return.
        trials: number of ``pmotif`` draws averaged per grid point.

    Returns:
        A list of ``num_motifs`` results of ``pmotif(n, L, p)``.
    """
    n = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)

    def f(p):
        return -mean([motif_ic(pmotif(n, L, p)) - des_ic
                      for i in range(trials)])

    lb = 0
    ub = 0.75
    xs = np.linspace(lb, ub, 100)
    ys = [f(x) for x in xs]
    fhat = kde_regress(xs, ys)
    p = bisect_interval(fhat, lb, ub, verbose=False, tolerance=10**-3)
    # `for`, not `or`: the typo yielded a single-element list (or a
    # NameError on `_`) instead of num_motifs independent draws.
    return [pmotif(n, L, p) for _ in xrange(num_motifs)]
def spoof_motif(motif, T):
    """Evaluate ``predict_stat`` with Ne tuned to reproduce motif's IC.

    Ne is found by bisection on [0.01, 5] so that ``predict_stat`` with the
    statistic ``mean_ic_from_rho(rho, n, L)`` and ``G = 5 * 10**6`` equals
    ``motif_ic(motif)``; ``predict_stat(n, L, sigma, Ne, T)`` is then
    returned.

    Args:
        motif: sequence of sites; its length and the length of its first
            element are passed on as n and L.
        T: passed as the fifth positional argument of ``predict_stat``.
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    # XXX revisit this issue (choice of sigma)
    sigma = 2 * mean([sd(col) for col in make_pssm(motif)])

    def mean_ic_stat(rho):
        return mean_ic_from_rho(rho, n, L)

    def ic_mismatch(Ne):
        predicted = predict_stat(n, L, sigma, Ne, G=5 * 10**6, T=mean_ic_stat)
        return predicted - bio_ic

    Ne = bisect_interval(ic_mismatch, 0.01, 5)
    return predict_stat(n, L, sigma, Ne, T)
def predict_ic_from_theta(theta, L):
    """Sample a matrix from theta and compute importance-sampling log weights.

    A matrix is drawn with ``sample_matrix(L, sigma)``, the tilt ``lamb`` is
    solved by bisection on [-20, 20] so that the PSFM's mean energy equals
    ``des_ep = max(mu - log(Ne - 1), ep_min + 1)``, 100 sites are sampled
    from that PSFM, and per-site ``log_ps`` / ``log_qs`` are computed.

    Args:
        theta: triple ``(sigma, mu, Ne)``.
        L: first argument of ``sample_matrix``.

    NOTE(review): the visible code ends after computing ``log_qs`` and
    returns nothing; confirm what the caller expects.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([sum(ep * p for ep, p in zip(eps, ps))
                    for eps, ps in zip(matrix, psfm)]) - des_ep

    lamb = bisect_interval(f, -20, 20)
    # `psfm` only existed inside f, so reading it here raised NameError;
    # build the PSFM at the solved tilt before it is used below.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [-nu * log(1 + exp(score_seq(matrix, site) - mu))
              for site in sites]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def predict_ic(matrix, mu, Ne, N=100): nu = Ne - 1 ep_min, ep_max, L = sum(map(min,matrix)), sum(map(max,matrix)), len(matrix) site_sigma = site_sigma_from_matrix(matrix) density = lambda ep:(1/(1+exp(ep-mu)))**(Ne-1) * dnorm(ep,0,site_sigma)*(ep_min <= ep <= ep_max) d_density = lambda ep:ep/site_sigma**2 + nu/(1+exp(mu-ep)) mode = bisect_interval(d_density, -100, 100) if mode < ep_min: mode = ep_min dmode = density(mode) # calculate mean epsilon via rejection sampling eps = [] while len(eps) < N: ep = random.random() * (ep_max - ep_min) + ep_min if random.random() < density(ep)/dmode: eps.append(ep) #return eps des_mean_ep = mean(eps) des_mean_ep_analytic = integrate.quad(lambda ep:ep*density(ep), ep_min, ep_max) # print "des_means:", des_mean_ep, des_mean_ep_analytic # print "min ep: %s max_ep: %s des_mean_ep: %s" % (ep_min, ep_max, des_mean_ep) def mean_ep(lamb): try: psfm = psfm_from_matrix(matrix, lamb=lamb) return sum([ep * p for (mat_row, psfm_row) in zip(matrix, psfm) for (ep, p) in zip(mat_row, psfm_row)]) except: print matrix, lamb raise Exception try: lamb = bisect_interval(lambda l:mean_ep(l) - des_mean_ep, -20, 20) except: print matrix, mu, Ne raise Exception tilted_psfm = psfm_from_matrix(matrix, lamb) return sum([2 - h(col) for col in tilted_psfm])
def metropolis_pb(ks, q, verbose=False, mu_offset=0, iterations=50000):
    """Run ``mh`` over occupancy states using ``rstate`` as the proposal.

    Energies are ``-log(k)`` for each k in ks.  mu is the bisection root on
    [-50, 50] of ``sum(fd(ep, mu)) - q``, shifted by ``mu_offset``.  Proposals
    are ``rstate(eps, mu)`` (independent of the current state) and their
    density is ``dstate(ss, eps, mu)``.

    Args:
        ks: sequence of positive numbers, one per position.
        q: target of the occupancy sum and first argument of ``falling_fac``.
        verbose: forwarded to ``mh``.
        mu_offset: added to the solved mu.
        iterations: forwarded to ``mh``.

    Returns:
        Whatever ``mh`` returns.
    """
    num_sites = len(ks)
    eps = [-log(k) for k in ks]

    def occupancy_excess(trial_mu):
        return sum(fd(ep, trial_mu) for ep in eps) - q

    mu = bisect_interval(occupancy_excess, -50, 50) + mu_offset

    def weight(ss):
        boltzmann = product(k**s for k, s in zip(ks, ss))
        return falling_fac(q, sum(ss)) * boltzmann

    def proposal(ss):
        # the current state is ignored: every proposal is a fresh draw
        return rstate(eps, mu)

    def dprop(ss):
        return dstate(ss, eps, mu)

    x0 = proposal([0] * num_sites)
    return mh(weight, proposal, x0,
              dprop=dprop, verbose=verbose, iterations=iterations)
def Ne_from_motif(bio_motif,interp_rounds,iterations=50000): """Given a motif, return Ne that matches mean IC""" bio_ic = motif_ic(bio_motif) n = len(bio_motif) L = len(bio_motif[0]) matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)] print len(matrix) def f(Ne,iterations=iterations): print "Ne",Ne _,chain = sella_hirsch_mh(matrix=matrix,n=n,Ne=Ne,iterations=iterations,init='ringer') return mean(map(motif_ic,chain[iterations/2:])) - bio_ic # lo,hi = 1,5 # data = [] # for _ in xrange(interp_rounds): # guess = (lo + hi)/2.0 # y = f(guess) # print lo,hi,guess,y # data.append((guess,y)) # if y > 0: # hi = guess # else: # lo = guess # return data Ne_min = 1 Ne_max = 5 while f(Ne_max) < 0: print "increasing Ne max" Ne_max *= 2 xs, ys= transpose([(Ne,f(Ne)) for Ne in np.linspace(Ne_min,Ne_max,interp_rounds)]) # now find an interpolant. We desire smallest sigma of gaussian # interpolant such that function has at most one inflection point interp_sigmas = np.linspace(0.01,1,100) interps = [gaussian_interp(xs,ys,sigma=s) for s in interp_sigmas] for i,(sigma, interp) in enumerate(zip(interp_sigmas,interps)): print i,sigma if num_inflection_points(map(interp,np.linspace(Ne_min,Ne_max,100))) == 1: "found 1 inflection point" break print sigma Ne = bisect_interval(interp,Ne_min,Ne_max) return Ne
def metropolis_uniform(ks, q, verbose=False, mu_offset=0, iterations=50000):
    """Run ``mh`` over occupancy states with a single-move proposal.

    Each proposal copies the state, with probability ``sum(ss) / q`` clears
    one randomly chosen occupied position, and then either sets one randomly
    chosen position that was unoccupied in the incoming state or (choice
    -1) sets nothing.

    Args:
        ks: sequence of positive numbers, one per position.
        q: divisor of the occupied count and first argument of
            ``falling_fac``.
        verbose: forwarded to ``mh``.
        mu_offset: added to the solved mu (mu is not used by the proposal).
        iterations: forwarded to ``mh``.

    Returns:
        Whatever ``mh`` returns.
    """
    num_sites = len(ks)
    eps = [-log(k) for k in ks]

    def occupancy_excess(trial_mu):
        return sum(fd(ep, trial_mu) for ep in eps) - q

    mu = bisect_interval(occupancy_excess, -50, 50) + mu_offset

    def weight(ss):
        boltzmann = product(k**s for k, s in zip(ks, ss))
        return falling_fac(q, sum(ss)) * boltzmann

    def proposal(ss):
        on_chr_prob = sum(ss) / float(q)
        picked_bound_copy = random.random() < on_chr_prob
        ss_new = ss[:]
        if picked_bound_copy:
            occupied = [i for (i, s) in enumerate(ss) if s]
            ss_new[random.choice(occupied)] = 0
        # NOTE(review): re-placement is taken to run whether or not a copy
        # was removed (otherwise the all-zero start state could never
        # change) -- confirm against the original indentation.
        vacant = [i for (i, s) in enumerate(ss) if not s]
        new_pos = random.choice([-1] + vacant)
        if new_pos >= 0:
            ss_new[new_pos] = 1
        return ss_new

    x0 = proposal([0] * num_sites)
    return mh(weight, proposal, x0, verbose=verbose, iterations=iterations)
def predict_ic_from_theta(theta, L):
    """Sample a matrix from theta and compute importance-sampling log weights.

    Draws a matrix with ``sample_matrix(L, sigma)``, solves by bisection on
    [-20, 20] for the tilt ``lamb`` at which the PSFM's mean energy equals
    ``des_ep = max(mu - log(Ne - 1), ep_min + 1)``, samples 100 sites from
    that PSFM and computes per-site ``log_ps`` and ``log_qs``.

    Args:
        theta: triple ``(sigma, mu, Ne)``.
        L: first argument of ``sample_matrix``.

    NOTE(review): the visible code stops after ``log_qs`` and returns
    nothing; confirm what the caller expects.
    """
    sigma, mu, Ne = theta
    nu = Ne - 1
    ep_star = mu - log(Ne - 1)
    matrix = sample_matrix(L, sigma)
    ep_min = sum(map(min, matrix))
    des_ep = max(ep_star, ep_min + 1)

    def f(lamb):
        psfm = psfm_from_matrix(matrix, lamb)
        return sum([
            sum(ep * p for ep, p in zip(eps, ps))
            for eps, ps in zip(matrix, psfm)
        ]) - des_ep

    lamb = bisect_interval(f, -20, 20)
    # `psfm` was only ever bound inside f, so the lines below raised
    # NameError; construct it at the solved tilt before using it.
    psfm = psfm_from_matrix(matrix, lamb)
    log_psfm = [[log(p) for p in ps] for ps in psfm]
    sites = [sample_from_psfm(psfm) for i in range(100)]
    log_ps = [
        -nu * log(1 + exp(score_seq(matrix, site) - mu)) for site in sites
    ]
    log_qs = [score_seq(log_psfm, site) for site in sites]
def solve_mu_for_copy_num(L, sigma, G, copy_num):
    """Return the mu in [-100, 100] at which total occupancy equals copy_num.

    The root of ``total_occupancy(L, sigma, G, mu) - copy_num`` is located
    with ``bisect_interval``.
    """
    def occupancy_excess(mu):
        return total_occupancy(L, sigma, G, mu) - copy_num

    return bisect_interval(occupancy_excess, -100, 100)
def find_alpha(K, entropy, tol_factor=0.01):
    """Return the alpha at which ``expected_entropy(K, alpha)`` equals entropy.

    The search runs by bisection between ``10**-10`` and
    ``1 / (log2(K) - entropy)``.  ``tol_factor`` is accepted but not used.
    """
    upper = 1/(log2(K)-entropy)

    def entropy_gap(candidate):
        return expected_entropy(K, candidate) - entropy

    return bisect_interval(entropy_gap, 10**-10, upper)
def find_beta_for_mean_col_ic(n, desired_ic_per_col, tolerance=10**-2):
    """Return the beta at which ``2 - mean_col_ent(n, beta)`` hits the target.

    Bisection runs from -10 up to 100 (when n < 100) or 1000 (otherwise),
    with the given tolerance and verbose output disabled.
    """
    def ic_gap(beta):
        ic = 2 - mean_col_ent(n, beta)
        return ic - desired_ic_per_col

    # hackish: the wider bracket was introduced in order to deal with CRP
    if n < 100:
        upper = 100
    else:
        upper = 1000
    return bisect_interval(ic_gap, -10, upper,
                           verbose=False, tolerance=tolerance)
def mu_from(G, sigma, L, copy_num):
    """Return the mu in [-500, 500] at which ``copy_num_from`` equals copy_num.

    Found by bisecting ``copy_num_from(G, sigma, L, mu) - copy_num``.
    """
    def copy_num_excess(mu):
        return copy_num_from(G, sigma, L, mu) - copy_num

    return bisect_interval(copy_num_excess, -500, 500)
def mu_from(G, sigma, L, copy_num):
    """Solve ``copy_num_from(G, sigma, L, mu) == copy_num`` for mu.

    The root is searched by ``bisect_interval`` on the bracket [-500, 500].
    """
    lower, upper = -500, 500

    def residual(mu):
        predicted = copy_num_from(G, sigma, L, mu)
        return predicted - copy_num

    return bisect_interval(residual, lower, upper)