def cluster(xs):
    """Split scalars xs into two Gaussian components by hard assignment.

    Component 0 starts at min(xs) and component 1 at max(xs), both with
    sigma 1.  For up to 10 rounds every point is assigned to whichever
    component gives it the larger dnorm density (ties go to component 0),
    and each component's mean and uncorrected sd are re-estimated from its
    points.  Points are assigned outright rather than weighted, so this is
    a hard-assignment variant of EM.

    Args:
        xs: non-empty sequence of scalars.

    Returns:
        A function f(x) giving the share of component 1 at x:
        dnorm(x, mu1, sigma1) / (dnorm(x, mu0, sigma0) + dnorm(x, mu1, sigma1)).
    """
    mu0, mu1 = min(xs), max(xs)
    sigma0 = sigma1 = 1
    for _ in range(10):
        assignments = [
            int(dnorm(x, mu1, sigma1) > dnorm(x, mu0, sigma0)) for x in xs
        ]
        xs0 = [x for (x, a) in zip(xs, assignments) if a == 0]
        xs1 = [x for (x, a) in zip(xs, assignments) if a == 1]
        if not xs0 or not xs1:
            # One component took every point (e.g. all xs equal).  There is
            # nothing to estimate the other from, and the parameters would
            # not change in later rounds, so keep the current estimates.
            break
        new_sigma0 = sd(xs0, correct=False)
        new_sigma1 = sd(xs1, correct=False)
        mu0, mu1 = mean(xs0), mean(xs1)
        if new_sigma0 == 0 and new_sigma1 == 0:
            # Both clusters are single repeated values: keep the previous
            # widths instead of handing a zero sigma to dnorm.
            pass
        else:
            # A zero-width component borrows the other component's width.
            sigma0 = new_sigma0 if new_sigma0 != 0 else new_sigma1
            sigma1 = new_sigma1 if new_sigma1 != 0 else new_sigma0
        print("mu0: {} sigma0: {} mu1: {}: sigma1: {} xs0: {} xs1: {}".format(
            mu0, sigma0, mu1, sigma1, len(xs0), len(xs1)))

    def f(x):
        return dnorm(x, mu1, sigma1) / (dnorm(x, mu0, sigma0) +
                                        dnorm(x, mu1, sigma1))

    return f
def marginal(i, j):
    """Integrate f against the site-energy density with and without one row.

    Reads matrix, f, site_mu, site_sigma, ep_min and ep_max from the
    enclosing scope.

    NOTE(review): the visible code stores both quadrature results in locals
    and returns nothing; presumably a ratio of the two is meant to be
    returned -- confirm against the full source.
    NOTE(review): the row removed is the one whose index equals j, while ep
    is read as matrix[i][j]; verify the index convention.
    """
    red_matrix = [row for idx, row in enumerate(matrix) if idx != j]
    red_site_mu = site_mu_from_matrix(red_matrix)
    red_site_sigma = site_sigma_from_matrix(red_matrix)
    ep = matrix[i][j]

    def shifted_integrand(ep_rest):
        # Energy of the fixed entry plus the remaining-rows energy.
        return f(ep + ep_rest) * dnorm(ep_rest, red_site_mu, red_site_sigma)

    def full_integrand(ep_rest):
        return f(ep_rest) * dnorm(ep_rest, site_mu, site_sigma)

    # integrate.quad returns a (value, error estimate) pair.
    nom = integrate.quad(shifted_integrand, ep_min, ep_max)
    denom = integrate.quad(full_integrand, ep_min, ep_max)
def test_dphidsigma():
    """Return (analytic, numeric) d dnorm / d sigma at a random point.

    x, mu and sigma are each drawn from random.random().  The numeric value
    comes from diff applied to sigma -> dnorm(x, mu, sigma) with a third
    argument of 10**-10 (presumably the step size -- confirm in diff).
    """
    x, mu, sigma = random.random(), random.random(), random.random()

    def density_in_sigma(s):
        return dnorm(x, mu, s)

    analytic = dphidsigma(x, mu, sigma)
    numeric = diff(density_in_sigma, sigma, 10**-10)
    return analytic, numeric
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject on uniformly proposed sites.

    Each candidate comes from random_site(L), is scored with
    score_seq(matrix, site), and is kept with probability
    phat(ep) / envelope, where phat(ep) = (1 / (1 + exp(ep - mu)))**(Ne - 1).

    Args:
        matrix: energy matrix, one row per position.
        mu: chemical potential used in phat.
        Ne: effective population size; the exponent is nu = Ne - 1.
        N: number of sites to return.

    Returns:
        List of N accepted sites.

    NOTE(review): the mode and tilted-PSFM quantities below are computed
    but not used by the acceptance loop; they look like scaffolding for a
    tilted proposal.  They are kept so the function still performs the same
    computations -- confirm before removing.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def density(ep):
        return ((1 / (1 + exp(ep - mu)))**nu * dnorm(ep, 0, site_sigma) *
                (ep_min <= ep <= ep_max))

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**nu

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling
        # purposes, so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    # The original divided by an undefined name `pmode`, raising NameError
    # on the first iteration.  phat is monotone in ep, so its largest value
    # over all site energies is at one end of [ep_min, ep_max]; using that
    # as the envelope keeps the acceptance probability within [0, 1].
    phat_max = max(phat(ep_min), phat(ep_max))

    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / phat_max:
            motif.append(site)
    return motif
def test_dphidmu():
    """Return (analytic, numeric) d dnorm / d mu at a random point.

    x, mu and sigma are each drawn from random.random().  The numeric value
    comes from diff applied to mu -> dnorm(x, mu, sigma) with a third
    argument of 10**-10 (presumably the step size -- confirm in diff).
    """
    x, mu, sigma = random.random(), random.random(), random.random()

    def density_in_mu(m):
        return dnorm(x, m, sigma)

    analytic = dphidmu(x, mu, sigma)
    numeric = diff(density_in_mu, mu, 10**-10)
    return analytic, numeric
def sample_motif_ar_tilted(matrix, mu, Ne, N):
    """Collect N sites by accept/reject on uniformly proposed sites.

    Each candidate comes from random_site(L), is scored with
    score_seq(matrix, site), and is kept with probability
    phat(ep) / envelope, where phat(ep) = (1 / (1 + exp(ep - mu)))**(Ne - 1).

    Args:
        matrix: energy matrix, one row per position.
        mu: chemical potential used in phat.
        Ne: effective population size; the exponent is nu = Ne - 1.
        N: number of sites to return.

    Returns:
        List of N accepted sites.

    NOTE(review): the mode and tilted-PSFM quantities below are computed
    but not used by the acceptance loop; they look like scaffolding for a
    tilted proposal.  They are kept so the function still performs the same
    computations -- confirm before removing.
    """
    nu = Ne - 1
    L = len(matrix)
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def density(ep):
        return ((1 / (1 + exp(ep - mu)))**nu * dnorm(ep, 0, site_sigma) *
                (ep_min <= ep <= ep_max))

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    def phat(ep):
        return (1 / (1 + exp(ep - mu)))**nu

    mode = bisect_interval(d_density, -100, 100)
    if mode < ep_min:
        # don't want mode right on the nose of ep_min for sampling
        # purposes, so offset it a bit
        mode = ep_min + 1
    dmode = density(mode)

    def mean_ep(lamb):
        psfm = psfm_from_matrix(matrix, lamb=lamb)
        return sum([ep * p
                    for (mat_row, psfm_row) in zip(matrix, psfm)
                    for (ep, p) in zip(mat_row, psfm_row)])

    lamb = bisect_interval(lambda l: mean_ep(l) - mode, -20, 20)
    tilted_psfm = psfm_from_matrix(matrix, lamb=lamb)
    log_tilted_psfm = [[log(p) for p in row] for row in tilted_psfm]

    # The original divided by an undefined name `pmode`, raising NameError
    # on the first iteration.  phat is monotone in ep, so its largest value
    # over all site energies is at one end of [ep_min, ep_max]; using that
    # as the envelope keeps the acceptance probability within [0, 1].
    phat_max = max(phat(ep_min), phat(ep_max))

    motif = []
    while len(motif) < N:
        site = random_site(L)
        ep = score_seq(matrix, site)
        if random.random() < phat(ep) / phat_max:
            motif.append(site)
    return motif
def log_ZS_sophisticated(matrix_mu_Ne):
    """Approximate log Z_S with Laplace's method around the integrand mode.

    The integrand is dnorm(ep, mat_mu, mat_sigma) * (1 + exp(ep - mu))**-nu
    with nu = Ne - 1.  Its mode is located as the root of the derivative of
    the log integrand, the curvature there gives a matching Gaussian width,
    and the integral is taken as integrand(mode) divided by the peak height
    of that Gaussian.

    Args:
        matrix_mu_Ne: a (matrix, mu, Ne) triple passed as one argument.
            It is unpacked in the body rather than in the signature so the
            definition is valid on both Python 2 and Python 3.

    Returns:
        L * log(4) + log(approximate integral), where L = len(matrix).

    Raises:
        Whatever secant_interval raises when no mode is found; the
        parameters are printed first and the original exception is
        re-raised with its traceback intact.
    """
    matrix, mu, Ne = matrix_mu_Ne
    L = len(matrix)
    nu = Ne - 1
    mat_mu = sum(map(mean, matrix))
    mat_sigma = sqrt(sum(variance(row, correct=False) for row in matrix))

    def dfde(ep):
        # d/d(ep) of log integrand
        return (-nu * exp(ep - mu) / (1 + exp(ep - mu)) -
                (ep - mat_mu) / mat_sigma**2)

    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    try:
        mode = secant_interval(dfde, ep_min - 20, ep_max + 20)
    except Exception:
        # Report the offending parameters, then let the real error
        # propagate instead of replacing it with a blank Exception.
        print((matrix, mu, Ne))
        raise
    # Second derivative of the log integrand at the mode (negative).
    kappa = -nu * (exp(mu - mode) / (1 + exp(mu - mode))**2) - 1 / mat_sigma**2
    sigma_approx = sqrt(-1 / kappa)
    integrand_max = dnorm(mode, mat_mu, mat_sigma) * (1 + exp(mode - mu))**-nu
    gauss_max = dnorm(mode, mode, sigma_approx)
    mean_ZS = integrand_max / gauss_max
    return L * log(4) + log(mean_ZS)
def log_ZS_sophisticated(matrix_mu_Ne):
    """Approximate log Z_S with Laplace's method around the integrand mode.

    The integrand is dnorm(ep, mat_mu, mat_sigma) * (1 + exp(ep - mu))**-nu
    with nu = Ne - 1.  Its mode is located as the root of the derivative of
    the log integrand, the curvature there gives a matching Gaussian width,
    and the integral is taken as integrand(mode) divided by the peak height
    of that Gaussian.

    Args:
        matrix_mu_Ne: a (matrix, mu, Ne) triple passed as one argument.
            It is unpacked in the body rather than in the signature so the
            definition is valid on both Python 2 and Python 3.

    Returns:
        L * log(4) + log(approximate integral), where L = len(matrix).

    Raises:
        Whatever secant_interval raises when no mode is found; the
        parameters are printed first and the original exception is
        re-raised with its traceback intact.
    """
    matrix, mu, Ne = matrix_mu_Ne
    L = len(matrix)
    nu = Ne - 1
    mat_mu = sum(map(mean, matrix))
    mat_sigma = sqrt(sum(variance(row, correct=False) for row in matrix))

    def dfde(ep):
        # d/d(ep) of log integrand
        return (-nu * exp(ep - mu) / (1 + exp(ep - mu)) -
                (ep - mat_mu) / mat_sigma**2)

    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    try:
        mode = secant_interval(dfde, ep_min - 20, ep_max + 20)
    except Exception:
        # Report the offending parameters, then let the real error
        # propagate instead of replacing it with a blank Exception.
        print((matrix, mu, Ne))
        raise
    # Second derivative of the log integrand at the mode (negative).
    kappa = -nu * (exp(mu - mode) / (1 + exp(mu - mode))**2) - 1 / mat_sigma**2
    sigma_approx = sqrt(-1 / kappa)
    integrand_max = dnorm(mode, mat_mu, mat_sigma) * (1 + exp(mode - mu))**-nu
    gauss_max = dnorm(mode, mode, sigma_approx)
    mean_ZS = integrand_max / gauss_max
    return L * log(4) + log(mean_ZS)
def predict_ic(matrix, mu, Ne, N=100):
    """Predict motif information content from a sampled mean site energy.

    Steps:
      1. Rejection-sample N energies on [ep_min, ep_max] from
         (1 / (1 + exp(ep - mu)))**(Ne - 1) * dnorm(ep, 0, site_sigma).
      2. Take their mean as the target mean energy.
      3. Find lamb in [-20, 20] so that psfm_from_matrix(matrix, lamb) has
         that mean energy.
      4. Return sum(2 - h(col)) over the columns of that PSFM (h is
         presumably the column entropy in bits -- confirm in h).

    Args:
        matrix: energy matrix, one row per position.
        mu: chemical potential.
        Ne: effective population size; the exponent is nu = Ne - 1.
        N: number of energies to sample for the mean.

    Raises:
        Whatever psfm_from_matrix or bisect_interval raise; the offending
        parameters are printed first and the original exception is
        re-raised unchanged.
    """
    nu = Ne - 1
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def density(ep):
        return ((1 / (1 + exp(ep - mu)))**nu * dnorm(ep, 0, site_sigma) *
                (ep_min <= ep <= ep_max))

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    mode = bisect_interval(d_density, -100, 100)
    # Clamp the mode into the support on BOTH sides.  The original clamped
    # only from below, so a mode above ep_max gave density(mode) == 0 and
    # the acceptance ratio below divided by zero.
    mode = min(max(mode, ep_min), ep_max)
    dmode = density(mode)

    # calculate mean epsilon via rejection sampling
    eps = []
    while len(eps) < N:
        ep = random.random() * (ep_max - ep_min) + ep_min
        if random.random() < density(ep) / dmode:
            eps.append(ep)
    des_mean_ep = mean(eps)

    def mean_ep(lamb):
        try:
            psfm = psfm_from_matrix(matrix, lamb=lamb)
            return sum([ep * p
                        for (mat_row, psfm_row) in zip(matrix, psfm)
                        for (ep, p) in zip(mat_row, psfm_row)])
        except Exception:
            print("%s %s" % (matrix, lamb))
            raise

    try:
        lamb = bisect_interval(lambda l: mean_ep(l) - des_mean_ep, -20, 20)
    except Exception:
        print("%s %s %s" % (matrix, mu, Ne))
        raise
    tilted_psfm = psfm_from_matrix(matrix, lamb)
    return sum([2 - h(col) for col in tilted_psfm])
def predict_ic(matrix, mu, Ne, N=100):
    """Predict motif information content from a sampled mean site energy.

    Steps:
      1. Rejection-sample N energies on [ep_min, ep_max] from
         (1 / (1 + exp(ep - mu)))**(Ne - 1) * dnorm(ep, 0, site_sigma).
      2. Take their mean as the target mean energy.
      3. Find lamb in [-20, 20] so that psfm_from_matrix(matrix, lamb) has
         that mean energy.
      4. Return sum(2 - h(col)) over the columns of that PSFM (h is
         presumably the column entropy in bits -- confirm in h).

    Args:
        matrix: energy matrix, one row per position.
        mu: chemical potential.
        Ne: effective population size; the exponent is nu = Ne - 1.
        N: number of energies to sample for the mean.

    Raises:
        Whatever psfm_from_matrix or bisect_interval raise; the offending
        parameters are printed first and the original exception is
        re-raised unchanged.
    """
    nu = Ne - 1
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    site_sigma = site_sigma_from_matrix(matrix)

    def density(ep):
        return ((1 / (1 + exp(ep - mu)))**nu * dnorm(ep, 0, site_sigma) *
                (ep_min <= ep <= ep_max))

    def d_density(ep):
        return ep / site_sigma**2 + nu / (1 + exp(mu - ep))

    mode = bisect_interval(d_density, -100, 100)
    # Clamp the mode into the support on BOTH sides.  The original clamped
    # only from below, so a mode above ep_max gave density(mode) == 0 and
    # the acceptance ratio below divided by zero.
    mode = min(max(mode, ep_min), ep_max)
    dmode = density(mode)

    # calculate mean epsilon via rejection sampling
    eps = []
    while len(eps) < N:
        ep = random.random() * (ep_max - ep_min) + ep_min
        if random.random() < density(ep) / dmode:
            eps.append(ep)
    des_mean_ep = mean(eps)

    def mean_ep(lamb):
        try:
            psfm = psfm_from_matrix(matrix, lamb=lamb)
            return sum([ep * p
                        for (mat_row, psfm_row) in zip(matrix, psfm)
                        for (ep, p) in zip(mat_row, psfm_row)])
        except Exception:
            print("%s %s" % (matrix, lamb))
            raise

    try:
        lamb = bisect_interval(lambda l: mean_ep(l) - des_mean_ep, -20, 20)
    except Exception:
        print("%s %s %s" % (matrix, mu, Ne))
        raise
    tilted_psfm = psfm_from_matrix(matrix, lamb)
    return sum([2 - h(col) for col in tilted_psfm])
def interp(xstar):
    """Gaussian-kernel weighted average of ys evaluated at xstar.

    Reads xs, ys and sigma from the enclosing scope.  Each sample x gets
    the weight dnorm(xstar, x, sigma); the result is the weighted sum of ys
    divided by the sum of the weights.
    """
    weighted_total = 0
    for x, y in zip(xs, ys):
        weighted_total += y * dnorm(xstar, x, sigma)

    weight_total = 0
    for x in xs:
        weight_total += dnorm(xstar, x, sigma)

    return weighted_total / weight_total
def dphidmu(x, mu, sigma):
    """Derivative of dnorm(x, mu, sigma) with respect to mu.

    Computed as (x - mu) / sigma**2 times the density itself.
    """
    sigma_sq = float(sigma**2)  # float() guards against integer division
    scaled_offset = (x - mu) / sigma_sq
    return scaled_offset * dnorm(x, mu, sigma)
def dphidsigma(x, mu, sigma):
    """Derivative of the normal density dnorm(x, mu, sigma) w.r.t. sigma.

    For phi = exp(-(x - mu)**2 / (2 sigma**2)) / (sigma * sqrt(2 pi)):
      * the 1/sigma prefactor contributes -phi / sigma,
      * the exponent contributes phi * (x - mu)**2 / sigma**3,
    so d phi / d sigma = phi * ((x - mu)**2 / sigma**3 - 1 / sigma).

    The previous version subtracted 1 / sigma**2, which matches this only
    at sigma == 1.  This assumes dnorm's third argument is the standard
    deviation, as dphidmu's sigma**2 denominator also does.
    """
    # float() keeps the divisions exact for integer inputs on Python 2,
    # matching the style used in dphidmu.
    sigma = float(sigma)
    return dnorm(x, mu, sigma) * ((x - mu)**2 / sigma**3 - 1 / sigma)
if random.random() < 0.5: # flip a coin and update weight matrix or mu altered_col = random.randrange(w) # pick a column to alter altered_row = random.randrange(4) # pick a row to alter dw = random.gauss(0,MAT_SIGMA) # add N(0,2) noise new_mat[altered_col][altered_row] += dw new_fwd_eps,new_rev_eps = update_scores_np(fwd_eps,rev_eps,altered_col,altered_row,dw,w,genome) else: new_mu += random.gauss(0,MU_SIGMA) new_fwd_eps,new_rev_eps = fwd_eps,rev_eps # careful about returning copy...? return ((new_mat,new_mu),(new_fwd_eps,new_rev_eps)) def log_dprop(((matp,mup),epsp),((mat,mu),eps)): dmat = sum([xp - x for (rowp,row) in zip(matp,mat) for (xp,x) in zip(rowp,row)]) dmu = mup - mu if dmat != 0: return log(1/2.0 * dnorm(dmat,0,MAT_SIGMA)) else: return log(1/2.0 * dnorm(dmu,0,MAT_SIGMA)) #return log(dnorm(dmat,0,MAT_SIGMA)) + log(dnorm(dmu,0,MU_SIGMA)) def capture_state((mat_and_mu,site_scores)): return mat_and_mu def complete_log_likelihood(((matrix,mu),eps),mapped_reads,num_cells=NUM_CELLS_RECOVERED): """Compute log likelihood of matrix, given chip seq data""" print "entering complete log likelihood" ps = np.append(fd_solve_np(eps,mu),[0]*(w-1)) G = len(ps) #print "G=",G # if random.random() < 1:#0.01: # pprint(matrix)
def dlgn(x, mu, sigma):
    """Density at x of Y = 1 / (1 + exp(Z)) where Z ~ N(mu, sigma**2).

    Change of variables: Z = log(1/x - 1), with Jacobian magnitude
    1 / (x * (1 - x)).
    """
    z = log(1 / x - 1)
    density_at_z = dnorm(z, mu, sigma)
    return density_at_z * 1 / (x * (1 - x))
def Pe(ep, site_mu, site_sigma, mu, Ne):
    """Weighted site-energy density at ep, divided by a closed-form Z.

    The weight is 1 / (1 + exp(ep - mu))**nu with nu = Ne - 1, applied to
    dnorm(ep, site_mu, site_sigma).  Z is the normal CDF with parameters
    (site_mu, site_sigma) evaluated at mu - log(nu); presumably this is an
    approximation to the true normalising integral -- confirm.
    """
    nu = Ne - 1
    normaliser = norm.cdf(mu - log(nu), site_mu, site_sigma)
    selection_weight = 1 / (1 + exp(ep - mu))**nu
    return 1 / normaliser * selection_weight * dnorm(ep, site_mu, site_sigma)
def f(xp):
    """Average of dnorm(xp, mu=x, sigma=sigma) over the samples xs.

    Reads xs and sigma from the enclosing scope.
    """
    kernel_values = (dnorm(xp, mu=x, sigma=sigma) for x in xs)
    return mean(kernel_values)
def f(x):
    """Share of component 1 at x: p1 / (p0 + p1).

    Reads mu0, sigma0, mu1 and sigma1 from the enclosing scope.
    """
    p1 = dnorm(x, mu1, sigma1)
    p0 = dnorm(x, mu0, sigma0)
    return p1 / (p0 + p1)
ep = score_seq(matrix, site) ar = 1 / (M * norm.pdf(ep, mu, sigma)) if random.random() < ar: return site def log_ZS_gaussian((matrix, mu, Ne), trials=1000, integration='quad'): nu = Ne - 1 L = len(matrix) mat_mu = sum(map(mean, matrix)) mat_sigma = sqrt(sum(map(lambda x: variance(x, correct=False), matrix))) ep_min = sum(map(min, matrix)) ep_max = sum(map(max, matrix)) p = lambda x: norm.pdf(x, mat_mu, mat_sigma) f = lambda x: (1 + exp(x - mu))**-nu integrand = lambda ep: dnorm(ep, mat_mu, mat_sigma) * (1 + exp(ep - mu) )**-nu log_integrand = lambda ep: log(dnorm(ep, mat_mu, mat_sigma)) + -nu * log( 1 + exp(ep - mu)) if integration == 'quad': try: mean_ZS, err = integrate.quad(integrand, ep_min, ep_max, epsabs=10**-15) except: print(matrix, mue, Ne) raise Exception elif integration == 'mc': mean_ZS = mean( f(random.gauss(mat_mu, mat_sigma)) for _ in xrange(trials))
def P(ep, mu, alpha):
    """Unnormalised weight of site energy ep for given mu and alpha.

    Reads nu, site_mu and site_sigma from the enclosing scope.  The value is
    (1 / (1 + exp(ep - mu)) * exp(-alpha * mu))**nu multiplied by
    dnorm(ep, site_mu, site_sigma).
    """
    occupancy = 1 / (1 + exp(ep - mu))
    mu_penalty = exp(-alpha * mu)
    selection_weight = (occupancy * mu_penalty)**nu
    return selection_weight * dnorm(ep, site_mu, site_sigma)
def log_prior(matrix_mu_Ne):
    """Log prior of a (matrix, mu, Ne) parameter triple.

    The three terms are:
      * log dnorm(ep, 0, 1) summed over every matrix entry,
      * log dnorm(mu, 0, 10),
      * -Ne, the log of the unnormalised weight exp(-Ne).

    Args:
        matrix_mu_Ne: a (matrix, mu, Ne) triple passed as one argument.
            It is unpacked in the body rather than in the signature so the
            definition is valid on both Python 2 and Python 3.

    Returns:
        The sum of the three log-prior terms.
    """
    matrix, mu, Ne = matrix_mu_Ne
    log_matrix_prior = sum(
        log(dnorm(ep, 0, 1)) for row in matrix for ep in row)
    log_mu_prior = log(dnorm(mu, 0, 10))
    # log(exp(-Ne)) is -Ne; writing it directly avoids exp(-Ne) underflowing
    # to 0.0 and log(0.0) failing once Ne is in the high hundreds.
    log_Ne_prior = -Ne
    return log_matrix_prior + log_mu_prior + log_Ne_prior
site = random_site(L) ep = score_seq(matrix, site) ar = 1/(M*norm.pdf(ep, mu, sigma)) if random.random() < ar: return site def log_ZS_gaussian((matrix, mu, Ne), trials=1000, integration='quad'): nu = Ne - 1 L = len(matrix) mat_mu = sum(map(mean, matrix)) mat_sigma = sqrt(sum(map(lambda x:variance(x,correct=False), matrix))) ep_min = sum(map(min, matrix)) ep_max = sum(map(max, matrix)) p = lambda x:norm.pdf(x, mat_mu, mat_sigma) f = lambda x: (1+exp(x-mu))**-nu integrand = lambda ep:dnorm(ep, mat_mu, mat_sigma) * (1+exp(ep-mu))**-nu log_integrand = lambda ep:log(dnorm(ep, mat_mu, mat_sigma)) + -nu*log(1+exp(ep-mu)) if integration == 'quad': try: mean_ZS, err = integrate.quad(integrand, ep_min, ep_max,epsabs=10**-15) except: print (matrix, mue, Ne) raise Exception elif integration == 'mc': mean_ZS = mean(f(random.gauss(mat_mu, mat_sigma)) for _ in xrange(trials)) elif integration == 'uniform': dx = (ep_max - ep_min)/trials mean_ZS = sum([p(x)*f(x) for x in np.linspace(ep_min, ep_max,trials)]) * dx elif integration == 'hack': mean_ZS = norm.cdf(mu - log(nu), mat_mu, mat_sigma) else:
def log_prior(matrix_mu_Ne):
    """Log prior of a (matrix, mu, Ne) parameter triple.

    The three terms are:
      * log dnorm(ep, 0, 1) summed over every matrix entry,
      * log dnorm(mu, 0, 10),
      * -Ne, the log of the unnormalised weight exp(-Ne).

    Args:
        matrix_mu_Ne: a (matrix, mu, Ne) triple passed as one argument.
            It is unpacked in the body rather than in the signature so the
            definition is valid on both Python 2 and Python 3.

    Returns:
        The sum of the three log-prior terms.
    """
    matrix, mu, Ne = matrix_mu_Ne
    log_matrix_prior = sum(
        log(dnorm(ep, 0, 1)) for row in matrix for ep in row)
    log_mu_prior = log(dnorm(mu, 0, 10))
    # log(exp(-Ne)) is -Ne; writing it directly avoids exp(-Ne) underflowing
    # to 0.0 and log(0.0) failing once Ne is in the high hundreds.
    log_Ne_prior = -Ne
    return log_matrix_prior + log_mu_prior + log_Ne_prior
def dvar(x, mu, sigma):
    """Density at x of Y = log(1 + exp(Z)) where Z ~ N(mu, sigma**2).

    Change of variables: Z = log(exp(x) - 1), with Jacobian
    exp(x) / (exp(x) - 1).
    """
    exp_x = exp(x)
    z = log(exp_x - 1)
    return dnorm(z, mu, sigma) * exp_x / (exp_x - 1)