def stoch_logistic_reg(f, xmin, xmax, ymax, desired_ic, stop_sd=0.01, debug=False): ymin = 0 def sigmoid(params, x): x0, k, ymax = params y = ymax / (1 + np.exp(-k * (x - x0))) + ymin return y def residuals(params, x, y): return y - sigmoid(params, x) xs = list(np.linspace(xmin, xmax, 10)) ys = map(f, xs) params = (2, 1, desired_ic * 2) while True: p_guess = params params, cov, infodict, mesg, ier = leastsq(residuals, p_guess, args=(xs, ys), full_output=1) try: x = secant_interval(lambda x: sigmoid(params, x) - desired_ic, xmin, xmax) except: print "failed secant interval" print params, xs, ys raise Exception() y = f(x) xs.append(x) ys.append(y) if sd(xs[-3:]) < stop_sd: break if debug: plt.scatter(xs, ys) plt.plot(*pl(lambda x: sigmoid(params, x), np.linspace(xmin, xmax, 1000))) plt.plot(*pl(lambda x: desired_ic, np.linspace(xmin, xmax, 1000))) plt.plot([x, x], [0, 2 * L]) plt.show() # end with one more round of interpolation params, cov, infodict, mesg, ier = leastsq(residuals, p_guess, args=(xs, ys), full_output=1) x = secant_interval(lambda x: sigmoid(params, x) - desired_ic, xmin, xmax) return x, (xs, ys)
def excess_mi_experiment(filename=None):
    """Do artificial motifs with linear BEMs show the same patterns of excess
    MI as biological motifs? (Yes)

    Compares match/mismatch-model motifs against MaxEnt motifs matched for
    mean IC, and produces comparison boxplots (optionally saved to filename).
    """
    n, L, G = 10, 10, 1000
    desired_ic = 10
    replicates = 1000
    # mean IC associated with each mismatch configuration
    ics = np.array(
        [mean_ic_from_eps(eps, n, L) for eps in enumerate_eps(n, L)])

    def mean_ic(N):
        return ics.dot(sella_hirsch_predictions(n, L, G, N))

    # solve for the effective population size that hits the desired mean IC
    Ne = secant_interval(lambda N: mean_ic(N) - desired_ic,
                         0, 2000, tolerance=0.1, verbose=True)  # ~= 1525
    ps = sella_hirsch_predictions(n, L, G, Ne)
    sh_sampler = inverse_cdf_sampler(list(enumerate_eps(n, L)), ps)
    sh_motifs = [sample_motif_from_mismatches(sh_sampler(), L)
                 for _ in trange(replicates)]
    # may undershoot desired IC due to approximation
    sh_mean_ic = mean(map(motif_ic, sh_motifs))
    maxent_motifs = maxent_sample_motifs_with_ic(n, L, sh_mean_ic, replicates)
    plt.suptitle(
        "Motif Statistics for Match/Mismatch Model vs. MaxEnt Ensembles (n=10,L=10,G=1000)"
    )
    all_boxplot_comparisons([sh_motifs, maxent_motifs],
                            labels=["M/MM", "MaxEnt"],
                            plot_titles="IC Gini MI".split(),
                            filename=filename)
def find_beta_for_mean_col_ic(n, desired_ic_per_col, tolerance=10**-10, verbose=False): """find beta such that entropy*exp(-beta*entropy)/Z = des_ent""" if verbose: print "enumerating countses" countses = enumerate_counts(n) if verbose: print "enumerating entropies" entropies = np.array(map(entropy_from_counts, countses)) #cols = np.array(map(countses_to_cols, countses)) if verbose: print "enumerating cols" #cols = np.exp(np.array(map(log_counts_to_cols, countses))) iterator = tqdm(countses) if verbose else countses log_cols = np.array(map(log_counts_to_cols, iterator)) def f(beta): phats = cols * (np.exp(-beta * entropies)) return 2 - entropies.dot(phats) / np.sum(phats) - desired_ic_per_col def f2(beta): log_phats = np_log_normalize(log_cols + -beta * entropies) expected_entropy = np.exp(log_phats).dot(entropies) return 2 - expected_entropy - desired_ic_per_col ub = 1000 while f2(ub) < 0: ub *= 2 print "raising upper bound to:", ub return secant_interval(f2, 0, ub, verbose=verbose, tolerance=tolerance)
def find_beta_for_mean_col_ic_ref(n, desired_ic_per_col, tolerance=10**-10):
    """Reference implementation: find beta such that the mean column IC
    (2 - mean column entropy at beta) equals desired_ic_per_col."""
    def objective(beta):
        ic_from_beta = 2 - mean_col_ent(n, beta)
        return ic_from_beta - desired_ic_per_col

    # hackish upper bound, doubled as needed (upped in order to deal with CRP)
    ub = 1000
    while objective(ub) < 0:
        ub *= 2
    return secant_interval(objective, -10, ub, verbose=False,
                           tolerance=tolerance)
def find_beta_for_mean_col_ic_ref2(n, desired_ic_per_col, tolerance=10**-10):
    """find beta such that entropy*exp(-beta*entropy)/Z = des_ent"""
    counts = enumerate_counts(n)
    entropies = np.array(map(entropy_from_counts, counts))
    cols = np.array(map(counts_to_cols, counts))

    def ic_gap(beta):
        # Boltzmann-weighted expected entropy vs. the target
        weights = cols * np.exp(-beta * entropies)
        expected_entropy = entropies.dot(weights) / np.sum(weights)
        return 2 - expected_entropy - desired_ic_per_col

    upper = 1000
    return secant_interval(ic_gap, -10, upper, verbose=False,
                           tolerance=tolerance)
def log_ZS_sophisticated((matrix, mu, Ne)):
    # NOTE(review): this definition is immediately shadowed by an identical
    # re-definition of log_ZS_sophisticated below; only the later one is live.
    #
    # Approximates log ZS by locating the mode of the integrand
    # dnorm(ep, mat_mu, mat_sigma) * (1 + exp(ep - mu))**-nu, matching a
    # Gaussian to its curvature there, and scaling by the ratio of peak
    # heights (a Laplace-style approximation).
    L = len(matrix)
    nu = Ne - 1
    # mean and sd of total site energy under random sites, from the matrix
    mat_mu = sum(map(mean,matrix))
    mat_sigma = sqrt(sum(map(lambda xs:variance(xs,correct=False), matrix)))
    # derivative of the log integrand; its root is the mode
    dfde = lambda ep: -nu*exp(ep-mu)/(1+exp(ep-mu)) - (ep-mat_mu)/mat_sigma**2
    ep_min = sum(map(min, matrix))
    ep_max = sum(map(max, matrix))
    try:
        mode = secant_interval(dfde,ep_min - 20, ep_max + 20)
    except:
        # dump the failing inputs before aborting
        print (matrix, mu, Ne)
        # NOTE(review): raising a fresh Exception discards the original
        # traceback; a bare `raise` would preserve it
        raise Exception
    # curvature of the log integrand at the mode -> Gaussian width
    kappa = -nu*(exp(mu-mode)/(1+exp(mu-mode))**2) - 1/mat_sigma**2
    sigma_approx = sqrt(-1/kappa)
    integrand = lambda ep:dnorm(ep, mat_mu, mat_sigma) * (1+exp(ep-mu))**-nu
    gauss_max = dnorm(mode, mode, sigma_approx)
    integrand_max = integrand(mode)
    # ratio of peak heights approximates the integral of the integrand
    mean_ZS = integrand_max / gauss_max
    return L * log(4) + log(mean_ZS)
def log_ZS_sophisticated((matrix, mu, Ne)): L = len(matrix) nu = Ne - 1 mat_mu = sum(map(mean, matrix)) mat_sigma = sqrt(sum(map(lambda xs: variance(xs, correct=False), matrix))) dfde = lambda ep: -nu * exp(ep - mu) / (1 + exp(ep - mu)) - ( ep - mat_mu) / mat_sigma**2 ep_min = sum(map(min, matrix)) ep_max = sum(map(max, matrix)) try: mode = secant_interval(dfde, ep_min - 20, ep_max + 20) except: print(matrix, mu, Ne) raise Exception kappa = -nu * (exp(mu - mode) / (1 + exp(mu - mode))**2) - 1 / mat_sigma**2 sigma_approx = sqrt(-1 / kappa) integrand = lambda ep: dnorm(ep, mat_mu, mat_sigma) * (1 + exp(ep - mu) )**-nu gauss_max = dnorm(mode, mode, sigma_approx) integrand_max = integrand(mode) mean_ZS = integrand_max / gauss_max return L * log(4) + log(mean_ZS)
# NOTE(review): this chunk begins mid-expression -- the opening of the
# enclosing function (apparently an importance-sampling ZS estimator whose
# `def` line lies outside this span) is not visible; the first statements
# below are a truncated fragment.  The definitions here are duplicated,
# with log_ZS_empirical completed, in the following chunk.
        exp(
            log_fhat((matrix, mu, Ne), [site]) + log(1.0 / 4**L) -
            log_psfm_prob(site)) for site in sites)
    ZS = 4**L * mean_ZS
    return log(ZS)


def log_ZS_importance2((matrix, mu, Ne), trials=1000):
    # NOTE(review): computes the tilting parameter lamb but never uses or
    # returns it -- this function is unfinished and returns None.
    y = mu - log(Ne)

    def expectation(lamb):
        # expected energy under the psfm tilted by exp(-lamb * ep)
        psfm = [normalize([exp(-lamb * ep) for ep in row]) for row in matrix]
        return sum(ep * p for row, ps in zip(matrix, psfm)
                   for ep, p in zip(row, ps))

    # solve for the lamb whose tilted expectation matches y
    lamb = secant_interval(lambda x: expectation(x) - y, -10, 10)


def log_ZM_importance((matrix, mu, Ne), N, trials=1000):
    # log ZM for N sites as N times the single-site log ZS estimate
    log_ZS = log_ZS_importance((matrix, mu, Ne), trials=trials)
    return N * log_ZS


def log_ZS_empirical((matrix, mu, Ne), trials=1000):
    # Monte Carlo estimate: average (1 + exp(ep - mu))**-(Ne - 1)
    # over `trials` uniformly random sites.
    L = len(matrix)
    acc = 0
    for i in xrange(trials):
        ep = score_seq(matrix, random_site(L))
        acc += 1.0 / (1 + exp(ep - mu))**(Ne - 1)
    est_mean = acc / trials
    log_Zs = L * log(4) + log(est_mean)
    # NOTE(review): truncated here -- the duplicate definition in the next
    # chunk ends with "return log_Zs"; as cut off, this copy returns None.
    # NOTE(review): this chunk begins mid-function -- the `def` line of the
    # enclosing definition (apparently an importance-sampling ZS estimator)
    # is outside the visible span.  The definitions below duplicate those in
    # the previous chunk; being later in the file, these are the copies that
    # take effect at import time.
    L = len(matrix)
    # uniform proposal psfm over the 4 bases at each of the L positions
    psfm = [[0.25]*4 for _ in range(L)]
    log_psfm = [[log(p) for p in row] for row in psfm]
    log_psfm_prob = lambda site: score_seq(log_psfm, site)
    sites = [sample_from_psfm(psfm) for _ in xrange(trials)]
    # importance-sampling estimate of mean ZS under the uniform proposal
    mean_ZS = mean(exp(log_fhat((matrix, mu, Ne), [site]) + log(1.0/4**L)
                       - log_psfm_prob(site))
                   for site in sites)
    ZS = 4**L * mean_ZS
    return log(ZS)


def log_ZS_importance2((matrix, mu, Ne), trials=1000):
    # NOTE(review): computes the tilting parameter lamb but never uses or
    # returns it -- this function is unfinished and returns None.
    y = mu - log(Ne)

    def expectation(lamb):
        # expected energy under the psfm tilted by exp(-lamb * ep)
        psfm = [normalize([exp(-lamb*ep) for ep in row]) for row in matrix]
        return sum(ep*p for row, ps in zip(matrix, psfm)
                   for ep, p in zip(row, ps))

    lamb = secant_interval(lambda x: expectation(x)-y, -10, 10)


def log_ZM_importance((matrix, mu, Ne), N, trials=1000):
    # log ZM for N sites as N times the single-site log ZS estimate
    log_ZS = log_ZS_importance((matrix, mu, Ne), trials=trials)
    return N * log_ZS


def log_ZS_empirical((matrix, mu, Ne), trials=1000):
    """Monte Carlo estimate of log ZS: average the selection factor
    (1 + exp(ep - mu))**-(Ne - 1) over `trials` uniformly random sites."""
    L = len(matrix)
    acc = 0
    for i in xrange(trials):
        ep = score_seq(matrix, random_site(L))
        acc += 1.0/(1+exp(ep-mu))**(Ne-1)
    est_mean = acc / trials
    log_Zs = L*log(4) + log(est_mean)
    return log_Zs
def predict_modal_energy(site_mu, site_sigma, mu, Ne):
    """Predict the modal binding energy as the root of the derivative of the
    log density: a selection term -nu*exp(ep-mu)/(1+exp(ep-mu)) minus the
    Gaussian-prior term (ep - site_mu)/site_sigma**2, bracketed on [-50, 50].
    """
    nu = Ne - 1

    def d_log_density(ep):
        selection_term = -nu * exp(ep - mu) / (1 + exp(ep - mu))
        prior_term = (ep - site_mu) / site_sigma**2
        return selection_term - prior_term

    return secant_interval(d_log_density, -50, 50)