def make_ecoli_df():
    """Build a per-TF summary table for the Escherichia_coli motif collection.

    For every name in ``Escherichia_coli.tfs`` the site list is fetched with
    ``getattr`` and summarized.

    Returns:
        pandas.DataFrame indexed by TF name with columns ``L`` (length of the
        first site), ``n`` (number of sites), ``sigma`` (mean of ``sd`` over
        the entries of ``make_pssm(sites)``), ``motif_ic`` and
        ``info_density`` (``motif_ic`` divided by ``L``).
    """
    tfs = Escherichia_coli.tfs
    Ls = []
    ns = []
    sigmas = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        # Evaluate the IC once and reuse it for the per-base density.
        ic = motif_ic(sites)
        Ls.append(L)
        ns.append(len(sites))
        sigmas.append(mean(map(sd, make_pssm(sites))))
        motif_ics.append(ic)
        motif_ics_per_base.append(ic / float(L))
    return pd.DataFrame(
        {
            "L": Ls,
            "n": ns,
            "sigma": sigmas,
            "motif_ic": motif_ics,
            "info_density": motif_ics_per_base,
        },
        index=tfs)
def match_ic_mi(N, L, des_ic, des_mi, iterations=50000, take_stock=None, eta=0.01, alpha=1, beta=0): if take_stock is None: take_stock = int((N * L) * log(N * L)) x = random_motif(L, N) xs = [None] * iterations ics = [0.0] * iterations mis = [0.0] * iterations alphas = [0.0] * iterations betas = [0.0] * iterations ic = motif_ic(x) mi = total_motif_mi(x) accepts = 0 for i in xrange(iterations): # if i == iterations/2: # eta *= 0.1 xp = mutate_motif(x) icp = motif_ic(xp) mip = total_motif_mi(xp) log_y = (alpha * ic + beta * mi) log_yp = (alpha * icp + beta * mip) if log(random.random()) < log_yp - log_y: accepts += 1 x = xp ic = icp mi = mip ics[i] = (ic) mis[i] = (mi) xs[i] = (x) #print sum(site.count("A") for site in x) alphas[i] = (alpha) betas[i] = (beta) if i > 0 and i % take_stock == 0: if i < iterations / 10: mean_ic = mean(ics[i - take_stock:i]) mean_mi = mean(mis[i - take_stock:i]) alpha += eta * (des_ic - mean_ic) * exp( -i / (10 * float(iterations))) beta += eta * (des_mi - mean_mi) * exp( -i / (10 * float(iterations))) else: mean_ic = mean(ics[i - take_stock:i]) mean_mi = mean(mis[i - take_stock:i]) alpha = poly1d(polyfit(ics[:i], alphas[:i], 1))(des_ic) beta = poly1d(polyfit(mis[:i], betas[:i], 1))(des_mi) fmt_string = ( "mean ic: % 1.2f, mean mi: % 1.2f, alpha: % 1.2f, beta: % 1.2f" % (mean_ic, mean_mi, alpha, beta)) print i, "AR:", accepts / (i + 1.0), fmt_string return xs, ics, mis, alphas, betas
def spoof_pmotifs(motif, num_motifs=10, trials=1):
    """Sample ``num_motifs`` ``pmotif`` motifs whose IC matches ``motif``.

    The parameter ``p`` is chosen by evaluating the negated mean IC gap on a
    100-point grid over [0, 0.75], smoothing it with ``kde_regress`` and
    locating a root with ``bisect_interval``.

    Args:
        motif: list of sites; only its dimensions and ``motif_ic`` are used.
        num_motifs: number of motifs to return.
        trials: number of ``pmotif`` draws averaged per grid point.

    Returns:
        List of ``num_motifs`` motifs drawn with ``pmotif(n, L, p)``.
    """
    n = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)

    def f(p):
        return -mean(
            motif_ic(pmotif(n, L, p)) - des_ic for _ in range(trials))

    lb = 0
    ub = 0.75
    xs = np.linspace(lb, ub, 100)
    ys = map(f, xs)
    fhat = kde_regress(xs, ys)
    p = bisect_interval(fhat, lb, ub, verbose=False, tolerance=10**-3)
    # The original read "or _ in xrange(...)", which built a one-element
    # list instead of drawing num_motifs samples.
    return [pmotif(n, L, p) for _ in xrange(num_motifs)]
def flux_prob(motif, a, b, trials=10000):
    """Count how single mutations move the motif IC relative to [a, b).

    Each trial applies ``mutate_motif`` to the unchanged input ``motif`` and
    classifies the mutant's ``motif_ic``.

    Args:
        motif: motif to mutate (a fresh mutant is drawn every trial).
        a, b: lower (inclusive) and upper (exclusive) IC bounds.
        trials: number of mutants to draw.

    Returns:
        Tuple ``(lesser, same, greater)`` of counts with IC ``< a``, in
        ``[a, b)`` and ``>= b`` respectively.
    """
    lesser = 0
    same = 0
    greater = 0
    for _ in trange(trials):
        icp = motif_ic(mutate_motif(motif))
        if icp < a:
            lesser += 1
        elif icp < b:  # icp >= a is implied by the branch above
            same += 1
        else:
            greater += 1
    return lesser, same, greater
def spoof_motif_cftp(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-2,verbose=False): n = len(motif) L = len(motif[0]) copies = 10*n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) matrix = sample_matrix(L, sigma) mu = approx_mu(matrix, copies=10*n, G=5*10**6) print "mu:", mu def f(Ne): motifs = [sample_motif_cftp(matrix, mu, Ne, n, verbose=verbose) for i in trange(trials)] return mean(map(motif_ic,motifs)) - bio_ic # lb = 1 # ub = 10 # while f(ub) < 0: # ub *= 2 # print ub x0s = [2,10]#(lb + ub)/2.0 # print "choosing starting seed for Ne" # fs = map(lambda x:abs(f(x)),x0s) # print "starting values:",x0s,fs # x0 = x0s[argmin(fs)] # print "chose:",x0 # Ne = bisect_interval_noisy_ref(f,x0,lb=1,verbose=True) Ne = log_regress_spec2(f,x0s,tol=Ne_tol) print "Ne:",Ne return [sample_motif_cftp(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def interpret_main_experiment(results_dict,named_fs=None): if named_fs is None: named_fs = [(fitness,"fitness"),(lambda org:motif_ic(extract_sites(org)),"Motif IC"), (lambda org:total_motif_mi(extract_sites(org)),"Motif MI")] rec_muts,site_muts = map(lambda x:sorted(set(x)),transpose(results_dict.keys())) fs,names = transpose(named_fs) subplot_dimension = ceil(sqrt(len(fs))) for idx,f in enumerate(fs): mat = np.zeros((len(rec_muts),len(site_muts))) for i,rec_mut in enumerate(sorted(rec_muts)): for j,site_mut in enumerate(sorted(site_muts)): pop,hist = results_dict[(rec_mut,site_mut)] mat[i,j] = mean([f(x) for x,fit in pop]) print i,j,mat[i,j] plt.subplot(subplot_dimension,subplot_dimension,idx) plt.imshow(mat,interpolation='none') plt.xticks(range(len(site_muts)),map(str,site_muts)) plt.yticks(range(len(rec_muts)),map(str,rec_muts)) #plt.yticks(rec_muts) plt.xlabel("site mutation rate") plt.ylabel("rec mutation rate") plt.colorbar() title = names[idx] plt.title(title) plt.show()
def Ne_scan(sigma, L, copies, trials=1, n=100, max_Ne=10, Ne_steps=100):
    """Plot observed and predicted motif statistics across a range of Ne.

    Draws four panels in a 1x4 grid: all curves against Ne, predicted IC
    against mismatches, predicted IC against occupancy, and occupancy
    against mismatches.

    Args:
        sigma: forwarded to ``sample_motif`` and the prediction helpers.  The
            original overwrote this with 1, silently ignoring the argument.
        L, copies: forwarded unchanged to the helpers.
        trials: motifs averaged per Ne value for the observed IC.
        n: forwarded to ``sample_motif``.
        max_Ne, Ne_steps: ``np.linspace(1, max_Ne, Ne_steps)`` is scanned.
    """
    Ne_range = np.linspace(1, max_Ne, Ne_steps)
    plt.subplot(1, 4, 1)
    obs_ics = [
        mean(
            motif_ic(sample_motif(sigma=sigma, Ne=Ne, L=L, n=n,
                                  copies=copies))
            for _ in range(trials))
        for Ne in Ne_range
    ]
    pred_ics = [expected_ic(sigma, Ne, L, copies) for Ne in Ne_range]
    occs = [expected_occupancy(sigma, Ne, L, copies) for Ne in Ne_range]
    mismatches = [
        mismatch_probability(sigma, Ne, L, copies) for Ne in Ne_range
    ]
    for curve in (obs_ics, pred_ics, occs, mismatches):
        plt.plot(Ne_range, curve)
    plt.subplot(1, 4, 2)
    plt.plot(mismatches, pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1, 4, 3)
    plt.plot(occs, pred_ics)
    plt.xlabel("Occupancy")
    plt.ylabel("IC")
    plt.subplot(1, 4, 4)
    plt.plot(mismatches, occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def sample_motif_cftp_param_study():
    """Examine dependence of IC on sigma, Ne.

    Computes the mean IC of ``sample_motif_cftp`` motifs on a 10x10 grid of
    (sigma, Ne), draws it as a filled contour plot, then estimates an Ne for
    each biological motif by inverting a 2-D interpolant of the grid at the
    motif's sigma and IC, and overlays those (sigma, Ne) points.
    """
    grid_points = 10
    sigmas = np.linspace(0.5, 10, grid_points)
    Nes = np.linspace(1, 10, grid_points)
    trials = 3
    n = 20
    L = 10

    def f(sigma, Ne):
        matrix = sample_matrix(L, sigma)
        mu = approx_mu(matrix, 10 * n)
        return motif_ic(sample_motif_cftp(matrix, mu, Ne, n))

    ics = [[mean(f(sigma, Ne) for _ in range(trials)) for sigma in sigmas]
           for Ne in tqdm(Nes, desc="ic grid")]
    plt.contourf(sigmas, Nes, ics)
    plt.colorbar()
    # NOTE(review): bio_motifs is not defined inside this function (its
    # local definition was commented out in the original); it is presumably
    # a module-level list of motifs -- confirm.
    bio_sigmas = [
        sigma_from_matrix(pssm_from_motif(motif, pc=1))
        for motif in bio_motifs
    ]
    bio_ics = [motif_ic(motif) for motif in bio_motifs]
    interp = interp2d(sigmas, Nes, ics)
    bio_Nes = [
        bisect_interval(lambda Ne: interp(show(bio_sigma), Ne) - bio_ic,
                        0, 20)
        for bio_sigma, bio_ic in zip(bio_sigmas, bio_ics)
    ]
    # The original ended in "plt.scatter(sigm)", an undefined name; plot the
    # fitted points on the same (sigma, Ne) axes as the contour.
    plt.scatter(bio_sigmas, bio_Nes)
def interpret_main_experiment(results_dict, named_fs=None): if named_fs is None: named_fs = [(fitness, "fitness"), (lambda org: motif_ic(extract_sites(org)), "Motif IC"), (lambda org: total_motif_mi(extract_sites(org)), "Motif MI")] rec_muts, site_muts = map(lambda x: sorted(set(x)), transpose(results_dict.keys())) fs, names = transpose(named_fs) subplot_dimension = ceil(sqrt(len(fs))) for idx, f in enumerate(fs): mat = np.zeros((len(rec_muts), len(site_muts))) for i, rec_mut in enumerate(sorted(rec_muts)): for j, site_mut in enumerate(sorted(site_muts)): pop, hist = results_dict[(rec_mut, site_mut)] mat[i, j] = mean([f(x) for x, fit in pop]) print i, j, mat[i, j] plt.subplot(subplot_dimension, subplot_dimension, idx) plt.imshow(mat, interpolation='none') plt.xticks(range(len(site_muts)), map(str, site_muts)) plt.yticks(range(len(rec_muts)), map(str, rec_muts)) #plt.yticks(rec_muts) plt.xlabel("site mutation rate") plt.ylabel("rec mutation rate") plt.colorbar() title = names[idx] plt.title(title) plt.show()
def moran_process(N=1000, turns=10000, init=sample_species, mutate=mutate, fitness=fitness, pop=None): if pop is None: pop = [(lambda spec: (spec, fitness(spec)))(sample_species()) for _ in trange(N)] hist = [] for turn in xrange(turns): fits = [f for (s, f) in pop] #print fits birth_idx = inverse_cdf_sample(range(N), fits, normalized=False) death_idx = random.randrange(N) #print birth_idx,death_idx mother, f = pop[birth_idx] daughter = mutate(mother) #print "mutated" pop[death_idx] = (daughter, fitness(daughter)) mean_fits = mean(fits) hist.append((f, mean_fits)) if turn % 10 == 0: mean_dna_ic = mean( [motif_ic(sites, correct=False) for ((sites, eps), _) in pop]) mean_rec_h = mean( [h_np(boltzmann(eps)) for ((dna, eps), _) in pop]) print turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic, "mean_rec_h:", mean_rec_h return pop
def main(prok_motifs, euk_motifs, filename='motif_summary_stats.eps'):
    """Draw a seaborn pair plot of summary statistics for two motif sets.

    The table holds, per motif, its domain label, log10 of the number of
    sites, the site length, ``motif_ic`` and ``motif_gini``.  Eukaryotic
    motifs come first, followed by prokaryotic ones.

    Args:
        prok_motifs: list of prokaryotic motifs.
        euk_motifs: list of eukaryotic motifs.
        filename: forwarded to ``maybesave``.
    """
    sns.set(style="darkgrid", color_codes=True)
    domains = (["Eukaryotic" for _ in euk_motifs] +
               ["Prokaryotic" for _ in prok_motifs])
    motifs = euk_motifs + prok_motifs
    columns = [
        ('N', lambda motif: log(len(motif)) / log(10)),
        ('L (bp)', lambda motif: len(motif[0])),
        ('IC (bits)', motif_ic),
        ('IGC', motif_gini),
    ]
    df = pd.DataFrame()
    df['Domain'] = domains
    for column_name, stat in columns:
        df[column_name] = [stat(motif) for motif in motifs]
    pg = sns.pairplot(df,
                      hue='Domain',
                      markers='s o'.split(),
                      palette='cubehelix')
    # The last row/column of the grid holds IGC; clamp its axis range.
    last = 3
    for row in range(4):
        pg.axes[row][last].set_xlim(-0.01, 0.6)
    for col in range(4):
        pg.axes[last][col].set_ylim(-0.01, 0.6)
    # N is stored as log10, so label its ticks as powers of ten.
    decades = range(1, 5)
    decade_labels = ["$10^%i$" % i for i in range(1, 5)]
    pg.axes[0][0].set_yticks(decades)
    pg.axes[0][0].set_yticklabels(decade_labels)
    pg.axes[3][0].set_xticks(range(1, 5))
    pg.axes[3][0].set_xticklabels(["$10^%i$" % i for i in range(1, 5)])
    maybesave(filename)
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None, Ne_tol=10**-4, double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10 * N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) epsilon = (1 + double_sigma) * sigma # 15 Jan 2016 print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [ sample_motif(epsilon, Ne, L, copies, n, ps=ps) for i in range(trials) ] return mean(map(motif_ic, motifs)) - bio_ic Ne = log_regress_spec2(f, [1, 10], tol=10**-3) return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def Ne_scan(sigma, L, copies, trials=1, n=100, max_Ne=10, Ne_steps=100):
    """Plot observed and predicted motif statistics across a range of Ne.

    Draws four panels in a 1x4 grid: all curves against Ne, predicted IC
    against mismatches, predicted IC against occupancy, and occupancy
    against mismatches.

    Args:
        sigma: forwarded to ``sample_motif`` and the prediction helpers.  The
            original overwrote this with 1, silently ignoring the argument.
        L, copies: forwarded unchanged to the helpers.
        trials: motifs averaged per Ne value for the observed IC.
        n: forwarded to ``sample_motif``.
        max_Ne, Ne_steps: ``np.linspace(1, max_Ne, Ne_steps)`` is scanned.
    """
    Ne_range = np.linspace(1, max_Ne, Ne_steps)
    plt.subplot(1, 4, 1)
    obs_ics = [
        mean(
            motif_ic(sample_motif(sigma=sigma, Ne=Ne, L=L, n=n,
                                  copies=copies))
            for _ in range(trials))
        for Ne in Ne_range
    ]
    pred_ics = [expected_ic(sigma, Ne, L, copies) for Ne in Ne_range]
    occs = [expected_occupancy(sigma, Ne, L, copies) for Ne in Ne_range]
    mismatches = [
        mismatch_probability(sigma, Ne, L, copies) for Ne in Ne_range
    ]
    for curve in (obs_ics, pred_ics, occs, mismatches):
        plt.plot(Ne_range, curve)
    plt.subplot(1, 4, 2)
    plt.plot(mismatches, pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1, 4, 3)
    plt.plot(occs, pred_ics)
    plt.xlabel("Occupancy")
    plt.ylabel("IC")
    plt.subplot(1, 4, 4)
    plt.plot(mismatches, occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def sigma_scan(Ne, L, copies, trials=1, n=100, sigma_steps=100,
               max_sigma=10):
    """Plot observed and predicted motif statistics across a range of sigma.

    Draws four panels in a 1x4 grid: all curves against sigma, predicted IC
    against mismatches, occupancy against predicted IC, and occupancy
    against mismatches.  Reads the module-level name ``G``.

    Args:
        Ne, L, copies: forwarded unchanged to the helpers.
        trials: motifs averaged per sigma value for the observed IC.
        n: forwarded to ``sample_motif``.
        sigma_steps, max_sigma: ``np.linspace(1, max_sigma, sigma_steps)``
            is scanned.
    """
    sigma_range = np.linspace(1, max_sigma, sigma_steps)
    plt.subplot(1, 4, 1)
    obs_ics = [
        mean(
            motif_ic(sample_motif(sigma=sigma, Ne=Ne, L=L, n=n,
                                  copies=copies))
            for _ in range(trials))
        for sigma in sigma_range
    ]
    pred_ics = [expected_ic(sigma, Ne, L, copies) for sigma in sigma_range]
    occs = [
        expected_occupancy(sigma, Ne, L, copies) for sigma in sigma_range
    ]
    mismatches = [
        mismatch_probability(sigma, Ne, L, copies) for sigma in sigma_range
    ]
    mus = [mu_from(G, sigma, L, copies) for sigma in sigma_range]
    approx_mus = [approx_mu(G, sigma, L, copies) for sigma in sigma_range]
    mean_log_Zbs = [log(mean_Zb(sigma, L)) for sigma in sigma_range]
    curves = (obs_ics, pred_ics, occs, mismatches, mus, approx_mus,
              mean_log_Zbs)
    for curve in curves:
        plt.plot(sigma_range, curve)
    plt.subplot(1, 4, 2)
    plt.plot(mismatches, pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1, 4, 3)
    plt.plot(pred_ics, occs)
    plt.xlabel("IC")
    plt.ylabel("Occupancy")
    plt.subplot(1, 4, 4)
    plt.plot(mismatches, occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def biological_experiment(replicates=1000): delta_ic = 0.1 results_dict = defaultdict(lambda:defaultdict(dict)) for tf_idx,tf in enumerate(Escherichia_coli.tfs): print tf,"(%s/%s)" % (tf_idx,len(Escherichia_coli.tfs)) bio_motif = getattr(Escherichia_coli,tf) n,L = motif_dimensions(bio_motif) bio_ic = motif_ic(bio_motif) bio_gini = motif_gini(bio_motif) bio_mi = total_motif_mi(bio_motif) results_dict[tf]["bio"]["motif_ic"] = bio_ic results_dict[tf]["bio"]["motif_gini"] = bio_gini results_dict[tf]["bio"]["total_motif_mi"] = bio_mi beta = find_beta_for_mean_motif_ic(n,L,bio_ic) maxent = maxent_motifs_with_ic(n,L,bio_ic,replicates) #maxent_truncated = maxent_truncated_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta) uniform = uniform_motifs_with_ic(n,L,bio_ic,delta_ic,replicates) #chain_spoofs = chain_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta) #for spoof_name in "maxent maxent_truncated envelope".split(): for spoof_name in "maxent uniform".split(): spoofs = eval(spoof_name) for motif_statname in "motif_ic motif_gini total_motif_mi".split(): motif_stat = eval(motif_statname) results_dict[tf][spoof_name][motif_statname] = map(motif_stat,spoofs) #all_spoofs = [maxent_spoofs,maxent_truncated_spoofs,envelope_spoofs]#,chain_spoofs] # print "IC:",bio_ic,map(lambda xs:val_in_coverage(bio_ic,xs),mmap(motif_ic,all_spoofs)) # print "Gini:",bio_gini,map(lambda xs:val_in_coverage(bio_gini,xs),mmap(motif_gini,all_spoofs)) # print "MI:",bio_mi,map(lambda xs:val_in_coverage(bio_mi,xs),mmap(total_motif_mi,all_spoofs)) return results_dict
def plot_results_dict_gini_qq(results_dict,filename=None): bios = [] maxents = [] uniforms = [] for i,k in enumerate(results_dict): g1,g2,tf = k.split("_") genome = g1 + "_" + g2 bio_motif = extract_tfdf_sites(genome,tf) bio_ic = motif_ic(bio_motif) bio_gini = motif_gini(bio_motif) d = results_dict[k] bios.append(bio_gini) maxents.append(mean(d['maxent']['motif_gini'])) uniforms.append(mean(d['uniform']['motif_gini'])) plt.scatter(bios,maxents,label='ME') plt.scatter(bios,uniforms,label='TURS',color='g') minval = min(bios+maxents+uniforms) maxval = max(bios+maxents+uniforms) plt.plot([minval,maxval],[minval,maxval],linestyle='--') plt.xlabel("Observed Gini Coefficient") plt.ylabel("Mean Sampled Gini Coefficient") plt.legend(loc='upper left') print "bio vs maxent:",pearsonr(bios,maxents) print "bio vs uniform:",pearsonr(bios,uniforms) maybesave(filename)
def gini_of_LGEs_experiment(iterations=50000, Ne=200, sigma=1,
                            lge_replicates=20):
    """Do motifs evolved under LGEs show more or less gini coefficient than
    IC-matched maxent counterparts?

    Runs ``sella_hirsch_mh`` ``lge_replicates`` times on 16x16 motifs, takes
    the last chain element of each run and draws 1000 maxent motifs matched
    to its IC.  Reads the module-level name ``G``.

    Returns:
        Tuple ``(lge_matrices, lge_motifs, maxent_motifss)`` of parallel
        lists, one entry per replicate.
    """
    n = 16
    L = 16
    maxent_replicates = 1000
    runs = []
    for _ in range(lge_replicates):
        matrix, chain = sella_hirsch_mh(Ne=Ne,
                                        n=n,
                                        L=L,
                                        G=G,
                                        sigma=sigma,
                                        iterations=iterations,
                                        init='ringer')
        final_motif = chain[-1]
        matched = maxent_sample_motifs_with_ic(
            n, L, motif_ic(final_motif), replicates=maxent_replicates)
        runs.append((matrix, final_motif, matched))
    lge_matrices = [run[0] for run in runs]
    lge_motifs = [run[1] for run in runs]
    maxent_motifss = [run[2] for run in runs]
    return lge_matrices, lge_motifs, maxent_motifss
def moran_process(N=1000,turns=10000,mean_site_muts=1,mean_rec_muts=1,init=sample_species,mutate=mutate, fitness=fitness,pop=None,print_modulus=100,hist_modulus=10): #ringer = (np.array([1]+[0]*(K-1)),sample_eps()) if pop is None: pop = [(lambda spec:(spec,fitness(spec)))(init()) for _ in trange(N)] # ringer = make_ringer() # pop[0] = (ringer,fitness(ringer)) #pop = [(ringer,fitness(ringer)) for _ in xrange(N)] site_mu = min(1/float(n*L) * mean_site_muts,1) rec_mu = min(1/float(K) * mean_rec_muts,1) hist = [] for turn in xrange(turns): fits = [f for (s,f) in pop] #print fits birth_idx = inverse_cdf_sample(range(N),fits,normalized=False) if birth_idx is None: return pop death_idx = random.randrange(N) #print birth_idx,death_idx mother,f = pop[birth_idx] daughter = mutate(mother,site_mu,rec_mu) #print "mutated" pop[death_idx] = (daughter,fitness(daughter)) mean_fits = mean(fits) #hist.append((f,mean_fits)) if turn % hist_modulus == 0: mean_dna_ic = mean([motif_ic(sites,correct=False) for ((sites,eps),_) in pop]) mean_rec = mean([recognizer_promiscuity(x) for (x,f) in pop]) mean_recced = mean([sites_recognized((dna,rec)) for ((dna,rec),_) in pop]) hist.append((turn,f,mean_fits,mean_dna_ic,mean_rec,mean_recced)) if turn % print_modulus == 0: print turn,"sel_fit:",f,"mean_fit:",mean_fits,"mean_dna_ic:",mean_dna_ic,"mean_rec_prom:",mean_rec return pop,hist
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot the ``crit_L`` curve and the ``log2(G) / 2`` threshold vs sigma.

    Reads the module-level name ``G``.

    Args:
        filename: forwarded to ``maybesave``.
        with_bio: when true, motifs from ``extract_motif_object_from_tfdf``
            are overlaid as (sigma, L) points.
    """
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        # Only L and sigma are plotted; the original also computed IC, Gini
        # and MI statistics here that were never used.
        Ls = [len(motif[0]) for motif in motifs]
        sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    threshold = log(G, 2) / 2
    plt.plot([min_sigma, max_sigma], [threshold, threshold],
             linestyle='--',
             label="Info Theory Threshold")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample motifs from a chain of (sigma, mu, Ne) parameter triples.

    ``evo_ic_sample_motif2`` is run with the dimensions and IC of ``motif``;
    for each triple in the resulting chain one motif is drawn with
    ``sample_motif_cftp`` on a fresh ``sample_matrix(L, sigma)``.

    Args:
        motif: reference motif (list of sites).
        iterations: forwarded to ``evo_ic_sample_motif2``.
        verbose: forwarded to ``evo_ic_sample_motif2`` (the original always
            passed ``False`` regardless of this argument).
        theta: forwarded to ``evo_ic_sample_motif2``.

    Returns:
        Tuple ``(chain, motifs)``.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(N,
                                 L,
                                 des_ic,
                                 iterations=iterations,
                                 verbose=verbose,
                                 theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def train_pairwise_model2(motif, pc=1 / 16.0, decay_timescale=10000, take_stock=1000, eta=0.01, stop_crit=0.01): L = len(motif[0]) N = len(motif) fs = get_pairwise_freqs(motif, pc=pc) ws = [{(b1, b2): 0 for (b1, b2) in dinucs} for _ in range(int(choose(L, 2)))] iteration = 0 while True: cur_motif = [ sample_model(ws, x0=site, iterations=10 * L)[-1] for site in motif ] current_fs = get_pairwise_freqs(cur_motif) sse = 0 for w, f, cur_f in zip(ws, fs, current_fs): for b1, b2 in dinucs: delta = f[b1, b2] - cur_f[b1, b2] sse += delta**2 w[b1, b2] += eta * ( delta) #* exp(-iteration/float(decay_timescale)) #sses[iteration/take_stock] = sse sse_per_col_pair = sse / choose(L, 2) print iteration, sse_per_col_pair, ws[0]['A', 'A'] print "motif_ic:", motif_ic(cur_motif) if iteration > 0 and sse_per_col_pair < stop_crit: print "breaking:", sse, sse_per_col_pair break iteration += 1 return ws
def spoof_motif_ref(motif, num_motifs=10, trials=10, sigma=None, Ne_tol=10**-4): n = len(motif) L = len(motif[0]) copies = 10 * n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [ sample_motif(sigma, Ne, L, copies, n, ps=ps) for i in range(trials) ] return mean(map(motif_ic, motifs)) - bio_ic lb = 1 ub = 2 while f(ub) < 0: ub *= 2 ub *= 2 # once more for good measure x0 = (lb + ub) / 2.0 print "Ne guess:", x0 Nes = [ bisect_interval_noisy(f, x0=x0, tolerance=Ne_tol, lb=1) for i in range(3) ] Ne = mean(Nes) print "Nes:", Nes, Ne return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def main_experiment(motif_obj):
    """compare gini biological motifs to: (1) null ic-matched ensembles,and (2)
    sigma and IC matched evosims. Conduct evosims by matching sigma to pssm
    (fair?) and sweeping Ne in order to match IC.

    NOTE(review): this function looks unfinished.  It computes per-motif
    statistics, prints sigma and draws ``evosim_trials`` matrices, but stores
    and returns nothing.

    Args:
        motif_obj: object exposing ``tfs`` (iterable of attribute names) and
            one motif (list of sites) per name.
    """
    evosim_trials = 10
    for tf in motif_obj.tfs:
        bio_motif = getattr(motif_obj, tf)
        n, L = len(bio_motif), len(bio_motif[0])
        # Reference statistics of the biological motif (not used further in
        # the visible body).
        bio_gini = motif_gini(bio_motif)
        bio_ic = motif_ic(bio_motif)
        bio_mi = total_motif_mi(bio_motif)
        ###############
        ### null ensemble stuff here
        ###############
        pssm = make_pssm(bio_motif)
        sigma = mean(map(sd, pssm))  # revisit this, see Djordjevic's paper
        # determine Ne
        Ne_ic = {}
        lo = 0
        hi = 5
        # NOTE(review): the next line is read as a comment in its entirety;
        # the source layout is ambiguous and ``matrix`` is not yet defined at
        # this point, so confirm against the original file.
        #Ne_ic[lo] = chain = sella_hirsch_mh_gr(matrix, Ne=5, iterations=1000, n=16, x0s=None)
        print "sigma:", sigma
        for trial in trange(evosim_trials):
            # Each trial only draws a matrix; the result is overwritten on
            # the next pass and never used.
            matrix = sample_matrix(L, sigma)
def moran_process(mean_rec_muts, mean_site_muts, N=1000, turns=10000, init=make_ringer2, mutate=mutate, fitness=fitness, pop=None): site_mu = mean_site_muts / float(n * L) bd_mu = mean_rec_muts / float(L) if pop is None: pop = [(lambda spec: (spec, fitness(spec)))(init()) for _ in trange(N)] hist = [] for turn in xrange(turns): fits = [f for (s, f) in pop] birth_idx = inverse_cdf_sample(range(N), fits, normalized=False) death_idx = random.randrange(N) #print birth_idx,death_idx mother, f = pop[birth_idx] daughter = mutate(mother, site_mu, bd_mu) #print "mutated" pop[death_idx] = (daughter, fitness(daughter)) mean_fits = mean(fits) hist.append((f, mean_fits)) if turn % 1000 == 0: mean_dna_ic = mean( [motif_ic(sites, correct=False) for ((bd, sites), _) in pop]) print turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic return pop, hist
def spoof_motifs_maxent(motif, num_motifs, verbose=False): n = len(motif) L = len(motif[0]) des_ic = motif_ic(motif) if verbose: print "n: {} L: {} des_ic: {}".format(n, L, des_ic) return maxent_motifs_with_ic(n, L, des_ic, num_motifs, verbose=verbose)
def best_ic_motif(L, n, trials):
    """Return the highest-IC motif among ``trials`` random motifs.

    Args:
        L, n: forwarded to ``random_motif(L, n)``.
        trials: number of random motifs to draw.

    Returns:
        The motif with the largest ``motif_ic(motif, correct=False)`` seen,
        or ``None`` if no draw had an IC above 0 (including ``trials == 0``).
        The original raised ``UnboundLocalError`` in that case.
    """
    best_ic = 0
    best_motif = None
    for _ in trange(trials):
        motif = random_motif(L, n)
        cur_ic = motif_ic(motif, correct=False)
        if cur_ic > best_ic:
            # The original never updated best_ic, so it returned the last
            # motif with positive IC rather than the best one.
            best_ic = cur_ic
            best_motif = motif
    return best_motif
def analyze_bio_motifs(Nes,trials=20): results = {} for tf_idx,tf in enumerate(Escherichia_coli.tfs): Ne = Nes[tf] bio_motif = getattr(Escherichia_coli,tf) n,L = len(bio_motif),len(bio_motif[0]) bio_matrix = matrix_from_motif(bio_motif) sigma = sigma_from_matrix(bio_matrix) matrix_chains = [sella_hirsch_mh(n=n,L=L,sigma=sigma,Ne=Ne,init='ringer') for i in range(trials)] ics = [mean(map(motif_ic,chain[-1000:])) for (matrix,chain) in matrix_chains] ginis = [mean(map(motif_gini,chain[-1000:])) for (matrix,chain) in matrix_chains] mis = [mean(map(total_motif_mi,chain[-1000:])) for (matrix,chain) in matrix_chains] print "results for:",tf,tf_idx print motif_ic(bio_motif),mean(ics),sd(ics) print motif_gini(bio_motif),mean(ginis),sd(ginis) print total_motif_mi(bio_motif),mean(mis),sd(mis) results[tf] = (mean(ics),sd(ics),mean(ginis),sd(ginis),mean(mis),sd(mis)) return results
def recognizer_non_linearity((sites,recognizer)): L = log(len(idx_of_word),4) motif = [w for w,i in idx_of_word.items() if recognizer[i]] if len(motif) == 0: return -1 else: total_info = 2*L - log2(len(motif)) col_info = motif_ic(motif,correct=False) return total_info - col_info
def spoof_motifs_uniform(motif, num_motifs, epsilon=0.1, verbose=False): n, L = len(motif), len(motif[0]) desired_ic = motif_ic(motif) if verbose: print "starting spoof motifs uniform with:", n, L, desired_ic return uniform_motifs_accept_reject(n, L, desired_ic, num_motifs, epsilon, verbose=verbose)
def interpret_main_experiment(results_dict): taus = sorted(results_dict.keys()) print taus data = [(tau,f,motif_ic(extract_sites(s)),total_motif_mi(extract_sites(s))) for tau in taus for (s,f) in results_dict[tau][0]] cols = transpose(data) names = "tau,f,motif_ic,total_motif_mi".split(",") for (i,name1),(j,name2) in choose2(list(enumerate(names))): xs = cols[i] ys = cols[j] print name1,name2,pearsonr(xs,ys),spearmanr(xs,ys)
def fit_motif(motif):
    """Fit ``(sigma, Ne, copies)`` so that ``expected_ic`` matches the motif.

    Minimizes the squared gap between ``expected_ic(sigma, Ne, L, copies)``
    and ``motif_ic(motif)`` with ``minimize(..., method='TNC')``, starting
    from ``(1, 2, n)`` under the bounds ``sigma in [0, 10]``, ``Ne >= 1``,
    ``copies >= 1``.

    Returns:
        The ``x`` attribute of the optimizer result.
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)

    def residual(params):
        sigma, Ne, copies = params
        return expected_ic(sigma, Ne, L, copies) - bio_ic

    def squared_residual(params):
        return residual(params)**2

    start = (1, 2, n)
    bounds = ((0, 10), (1, None), (1, None))
    result = minimize(squared_residual, start, bounds=bounds, method='TNC')
    return result.x
def main_experiment(samples=30, iterations=10000, delta_ic=0.1): results_dict = {} for tf_idx, tf in enumerate(tfdf.tfs): print "starting on:", tf motif = getattr(tfdf, tf) if motif_ic(motif) < 5: print "excluding", tf, "for low IC" continue bio_ic = motif_ic(motif) n = len(motif) L = len(motif[0]) matrix = matrix_from_motif(motif) sigma = sigma_from_matrix(matrix) mu = approximate_mu(matrix, n, G) Ne = estimate_Ne(matrix, mu, n, bio_ic) spoofs = [] ar = 0 spoof_trials = 0.0 while len(spoofs) < samples: spoof_trials += 1 matrix, chain = sella_hirsch_mh(Ne=Ne, mu=mu, n=1, matrix=sample_matrix(L, sigma), init='ringer', iterations=iterations) spoof_motif = concat( [random.choice(chain[iterations / 2:]) for i in range(n)]) if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic: spoofs.append(spoof_motif) ar += 1 print "spoof acceptance rate:", ar / spoof_trials, len( spoofs), samples, spoof_trials #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]] results_dict[tf] = { fname: map(eval(fname), spoofs) for fname in "motif_ic motif_gini total_motif_mi".split() } print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs)) print bio_ic, mean_ci(results_dict[tf]['motif_ic']) return results_dict
def motif_degradation_experiment(): """what is the effect of repeatedly inferring a motif from selected sites?""" from motifs import Escherichia_coli motif = Escherichia_coli.LexA n = len(motif) matrix = matrix_from_motif(motif) assumed_copies = 10 * n mu = approximate_mu(matrix, assumed_copies, G) for i in range(10): print i, "motif ic:", motif_ic(motif) motif = select_sites_by_occupancy(matrix, mu, n) matrix = matrix_from_motif(motif)
def interpret_main_experiment(results_dict): taus = sorted(results_dict.keys()) print taus data = [(tau, f, motif_ic(extract_sites(s)), total_motif_mi(extract_sites(s))) for tau in taus for (s, f) in results_dict[tau][0]] cols = transpose(data) names = "tau,f,motif_ic,total_motif_mi".split(",") for (i, name1), (j, name2) in choose2(list(enumerate(names))): xs = cols[i] ys = cols[j] print name1, name2, pearsonr(xs, ys), spearmanr(xs, ys)
def plot_mono_vs_di_likelihood(ll_dict=None):
    """Scatter mono- against di-nucleotide log-likelihood for each TF.

    Every point is annotated with the TF name, number of sites, site width
    and motif IC, and a dashed identity line is drawn across the value
    range.

    Args:
        ll_dict: maps TF name to ``(mono, di)``; defaults to
            ``likelihood_dict()``.
    """
    if ll_dict is None:
        ll_dict = likelihood_dict()
    # NOTE(review): the original also built a per-base normalized copy of
    # ll_dict that was never used; the raw values are what gets plotted.
    plt.scatter(*transpose(ll_dict.values()))
    for (tf, (mono, di)) in ll_dict.items():
        sites = getattr(Escherichia_coli, tf)
        text = "%s\n#:%s\nw:%s\nIC:%1.2f" % (tf, len(sites), len(sites[0]),
                                             motif_ic(sites))
        plt.annotate(text, (mono, di))
    min_val = min(concat(ll_dict.values()))
    max_val = max(concat(ll_dict.values()))
    plt.xlabel("Mono LL")
    plt.ylabel("Di LL")
    plt.plot([min_val, max_val], [min_val, max_val], linestyle="--")
def motif_ls_sq_surface_experiment():
    """Compute gradient and Hessian of ``expected_ic`` at the LexA fit.

    Fits ``(sigma, Ne, copies)`` to the LexA motif with ``fit_motif``, then
    evaluates ``compute_grad`` / ``compute_hessian`` of the expected IC at
    that point and the eigen-decomposition of the Hessian.  Nothing is
    returned; the local helper ``orth_comp`` is defined but not called.
    """
    motif = Escherichia_coli.LexA
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    fitted = fit_motif(motif)
    sigma, Ne, copies = fitted
    fit_ic = expected_ic(sigma, Ne, L, copies)

    def f(params):
        s, N_e, c = params
        return expected_ic(s, N_e, L, c)

    point = (sigma, Ne, copies)
    gr = compute_grad(f, point, epsilon=0.0001)
    hess = compute_hessian(f, point)
    lambs, vs = np.linalg.eig(hess)

    def orth_comp(y, z):
        # Solve gr . (x, y, z) = 0 for x.
        a, b, c = gr
        x = -(b * y + c * z) / a
        return np.array([x, y, z])
def interpret_estremo_chain(chain, mu=-10, Ne=5):
    """Plot IC, MI, spoof means, independence and log-fitness along a chain.

    Args:
        chain: sequence of ``(code, motif)`` pairs; it is iterated several
            times.
        mu: offset used in the log-fitness term.
        Ne: ``Ne - 1`` scales each log-fitness term.
    """
    nu = Ne - 1

    def log_f(state):
        code, motif = state
        eps = map(lambda x: -log(x), pw_prob_sites(motif, code))
        return sum(nu * log(1 / (1 + exp(ep - mu))) for ep in eps)

    spoofs = []
    for code, motif in tqdm(chain):
        spoofs.append(spoof_maxent_motifs(motif, 10))
    chain_ics = [motif_ic(motif) for (code, motif) in tqdm(chain)]
    plt.plot(chain_ics)
    chain_mis = [motif_mi(motif) for (code, motif) in tqdm(chain)]
    plt.plot(chain_mis)
    spoof_ics = [mean(map(motif_ic, motifs)) for motifs in tqdm(spoofs)]
    plt.plot(spoof_ics)
    spoof_mis = [mean(map(motif_mi, motifs)) for motifs in tqdm(spoofs)]
    plt.plot(spoof_mis)
    indeps = [indep_measure(code) for (code, motif) in tqdm(chain)]
    plt.plot(indeps)
    plt.plot(map(log_f, chain))
def spoof_motif(motif, T):
    """Predict statistic ``T`` at the Ne whose predicted IC matches ``motif``.

    Ne is found by ``bisect_interval`` on [0.01, 5] so that the predicted
    mean IC equals ``motif_ic(motif)``; sigma is twice the mean ``sd`` of
    the motif's PSSM entries.

    Args:
        motif: reference motif (list of sites).
        T: statistic forwarded to ``predict_stat``.

    Returns:
        ``predict_stat(n, L, sigma, Ne, T)``.
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    sigma = 2 * mean(map(sd, make_pssm(motif)))  # XXX revisit this issue

    def ic_from_Ne(Ne):
        return predict_stat(n,
                            L,
                            sigma,
                            Ne,
                            G=5 * 10**6,
                            T=lambda rho: mean_ic_from_rho(rho, n, L))

    def ic_gap(Ne):
        return ic_from_Ne(Ne) - bio_ic

    Ne = bisect_interval(ic_gap, 0.01, 5)
    return predict_stat(n, L, sigma, Ne, T)
def collapsed_moran_process(N, turns, init=sample_species, mutate=mutate,
                            fitness=fitness, ancestor=None, modulus=100):
    """Simulate a single lineage with fixation-probability acceptance.

    Each turn a mutant of the current ancestor is proposed and replaces it
    with probability ``(1 - f/fp) / (1 - (f/fp)**N)``; proposals of equal
    fitness are skipped.

    Args:
        N: population size used in the fixation probability.
        turns: number of proposals.
        init: zero-argument factory for the starting species (the original
            ignored this and always called ``sample_species()``).
        mutate: maps a species to a mutated species.
        fitness: maps a species to its fitness.
        ancestor: optional starting species.
        modulus: status is printed and recorded every ``modulus`` turns.

    Returns:
        Tuple ``(ancestor, hist)``.
    """
    if ancestor is None:
        ancestor = init()
    f = fitness(ancestor)
    hist = []
    for turn in xrange(turns):
        prop = mutate(ancestor)
        fp = fitness(prop)
        if f == fp:
            # NOTE(review): this also skips the periodic logging below for
            # this turn -- kept as in the original.
            continue
        num = (1 - f / fp)
        denom = (1 - (f / fp)**N)
        transition_prob = num / denom
        if random.random() < transition_prob:
            ancestor = prop
            f = fp
        if turn % modulus == 0:
            # NOTE(review): the printed IC uses correct=False while the
            # recorded one uses the default -- kept as in the original.
            print (turn, f, f, motif_ic(ancestor[0], correct=False), rec_h(ancestor[1]))
            hist.append(
                (turn, f, f, motif_ic(ancestor[0]), rec_h(ancestor[1])))
    return ancestor, hist
def basic_statistics(tfdf=None, filename="basic_motif_statistics.png"):
    """Draw a 2x3 grid of histograms of basic motif statistics.

    Panels, in order: site length, number of sites, IC, IC per base, Gini
    coefficient, and MI per column pair.

    Args:
        tfdf: object exposing ``tfs`` and one motif per name; defaults to
            ``extract_motif_object_from_tfdf()``.
        filename: when truthy, the figure is saved there at 600 dpi and
            closed.
    """
    if tfdf is None:
        tfdf = extract_motif_object_from_tfdf()
    motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    Ls = [len(motif[0]) for motif in motifs]
    ns = [len(motif) for motif in motifs]
    ics = [motif_ic(motif) for motif in motifs]
    ic_density = [ic / L for ic, L in zip(ics, Ls)]
    sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
    ginis = [motif_gini(motif, correct=False) for motif in motifs]
    mi_density = [
        total_motif_mi(motif) / choose(L, 2) for motif, L in zip(motifs, Ls)
    ]
    # (data, x label, rotate ticks before drawing the histogram?)  The IC
    # panel rotates after hist(), the others before, as in the original.
    panels = [
        (Ls, "Length (bp)", True),
        (ns, "Number of sites", True),
        (ics, "IC (bits)", False),
        (ic_density, "IC Density (bits/bp)", True),
        (ginis, "Gini coeff", True),
        (mi_density, "MI Density (bits/comparison)", True),
    ]
    for position, (values, label, rotate_first) in enumerate(panels, 1):
        plt.subplot(2, 3, position)
        if rotate_first:
            plt.xticks(rotation=90)
            plt.hist(values)
        else:
            plt.hist(values)
            plt.xticks(rotation=90)
        plt.xlabel(label)
    plt.tight_layout()
    if filename:
        plt.savefig(filename, dpi=600)
        plt.close()
def test_predict_ic(trials=100): pred_ics = [] obs_ics = [] for trial in trange(trials): sigma = random.random() * 5 + 0.1 L = random.randrange(5, 15) matrix = sample_matrix(L, sigma) mu = random.random() * (-20) Ne = random.random() * 5 + 1 pred_ic = predict_ic(matrix, mu, Ne) obs_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100)) pred_ics.append(pred_ic) obs_ics.append(obs_ic) r, p = scatter(pred_ics, obs_ics) print r, p
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample motifs from a chain of (sigma, mu, Ne) parameter triples.

    ``evo_ic_sample_motif2`` is run with the dimensions and IC of ``motif``;
    for each triple in the resulting chain one motif is drawn with
    ``sample_motif_cftp`` on a fresh ``sample_matrix(L, sigma)``.

    Args:
        motif: reference motif (list of sites).
        iterations: forwarded to ``evo_ic_sample_motif2``.
        verbose: forwarded to ``evo_ic_sample_motif2`` (the original always
            passed ``False`` regardless of this argument).
        theta: forwarded to ``evo_ic_sample_motif2``.

    Returns:
        Tuple ``(chain, motifs)``.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(N,
                                 L,
                                 des_ic,
                                 iterations=iterations,
                                 verbose=verbose,
                                 theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def test():
    """Contour-plot mean motif IC over a grid of mu and Ne.

    Uses a fixed 10-column matrix (``[-1, 0, 0, 0]`` per column), the
    all-"A" ringer site, 10 sites per motif and 10 trials per grid cell,
    with ``mu`` in -9..0 and ``Ne`` in 2..10.
    """
    L = 10
    matrix = [[-1, 0, 0, 0] for _ in range(L)]
    ringer_site = "A" * L
    n = 10
    trials = 10
    mus = range(-9, 1, 1)
    Nes = range(2, 11, 1)
    ics = []
    for Ne in Nes:
        row = []
        for mu in tqdm(mus):
            row.append(
                mean(
                    motif_ic(
                        sample_motif(matrix, show(mu), Ne, ringer_site, n))
                    for i in range(trials)))
        ics.append(row)
    plt.contour(mus, Nes, ics)
    plt.colorbar(label="IC")
    plt.xlabel("Mu")
    plt.ylabel("Nes")
def find_beta(N, L, des_ic, iterations=100, verbose=False): #des_ic_per_col = des_ic / float(L) beta = 1 alpha = 1.0 beta_hist = [] ic_hist = [] for i in range(1,iterations+1): #beta = beta + alpha/i * (des_ic_per_col - motif_ic(rmotif(N, 1, beta))) obs_ic = motif_ic(rmotif(N, L, beta)) beta_hist.append(beta) ic_hist.append(obs_ic) beta = beta + alpha/i * (des_ic - obs_ic) if verbose: print i, beta, obs_ic return beta, beta_hist, ic_hist
def plot_results_dict_gini_vs_ic(results_dict, filename=None):
    """Scatter Gini against IC for each motif and its spoof means.

    Args:
        results_dict: maps keys of the form ``"<g1>_<g2>_<tf>"`` to a dict
            with ``motif_ic`` and ``motif_gini`` sequences under
            ``'maxent'`` and ``'uniform'``.
        filename: forwarded to ``maybesave``.
    """
    for i, k in enumerate(results_dict):
        # Split on the first two underscores only, so a TF name that itself
        # contains "_" no longer breaks the three-way unpacking.
        g1, g2, tf = k.split("_", 2)
        genome = g1 + "_" + g2
        bio_motif = extract_tfdf_sites(genome, tf)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        d = results_dict[k]
        # Multiplying by (i == 0) labels only the first point of each
        # series, so the legend shows each series once.
        plt.scatter(bio_ic, bio_gini, color='b', label="Bio" * (i == 0))
        plt.scatter(mean(d['maxent']['motif_ic']),
                    mean(d['maxent']['motif_gini']),
                    color='g',
                    label='ME' * (i == 0))
        plt.scatter(mean(d['uniform']['motif_ic']),
                    mean(d['uniform']['motif_gini']),
                    color='r',
                    label="TURS" * (i == 0))
    plt.xlabel("IC (bits)")
    plt.ylabel("Gini Coefficient")
    plt.legend()
    maybesave(filename)
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True): N = len(motif) L = len(motif[0]) copies = 10*N if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) epsilon = (1+double_sigma)*sigma # 15 Jan 2016 print "sigma:", sigma bio_ic = motif_ic(motif) def f(Ne): ps = ps_from_copies(sigma, Ne, L, copies) motifs = [sample_motif(epsilon, Ne, L, copies, n,ps=ps) for i in range(trials)] return mean(map(motif_ic,motifs)) - bio_ic Ne = log_regress_spec2(f,[1,10],tol=10**-3) return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def extract_motif_object_from_tfdf(): obj = Organism() genomes = set(tfdf['genome_accession']) setattr(obj,"tfs",[]) for genome in genomes: tfs = set(tfdf[tfdf['genome_accession'] == genome]['TF']) for tf in tfs: print tf,genome if not type(tf) is str: continue tf_name = genome + "_" + tf sites = extract_tfdf_sites(genome,tf) if len(sites) >= 10 and motif_ic(sites) > 5: setattr(obj,tf_name,sites) obj.tfs.append(tf_name) return obj
def rfreq_rseq_experiment(obj,filename="rfreq_vs_rseq_in_sefas_collection.png"):
    """Scatter motif IC against log2(G / number of sites) for every TF in `obj`.

    obj      -- object with a `tfs` list of attribute names, each naming a
                motif stored on the same object
    filename -- passed through to maybesave

    G is fixed at 5.0 * 10**6.  A dashed diagonal from (0, 0) to (20, 20)
    and a dashed vertical line at log2(G / 500) are drawn for reference.
    """
    G = 5.0*10**6
    min_rfreq = log2(G/500)
    motifs = [getattr(obj, tf) for tf in obj.tfs]
    Rfreqs = [log(G/len(m), 2) for m in motifs]
    Rseqs = [motif_ic(m) for m in motifs]
    plt.scatter(Rfreqs, Rseqs)
    plt.xlabel("log(G/n) (bits)")
    plt.ylabel("Motif Information Content (bits)")
    plt.plot([0, 20], [0, 20], linestyle='--', label='Theory')
    plt.plot([min_rfreq, min_rfreq], [0, 30], linestyle='--',
             label='Maximum Plausible Regulon Size')
    plt.title("Motif Information Content vs. Search Difficulty")
    plt.legend(loc='upper left')
    maybesave(filename)
def make_gle_evo_sim_spoofs(bio_motifs, trials_per_motif = 3): start_time = time.time() spoofs = [] failures = 0 for it, motif in enumerate(tqdm(bio_motifs, desc='bio_motifs')): bio_ic = motif_ic(motif) these_spoofs = [spoof_motif_gle(motif,num_motifs=10, Ne_tol=10**-2) for i in range(trials_per_motif)] spoofs.append(these_spoofs) spoof_ics = map(motif_ic, concat(these_spoofs)) lb, ub = mean_ci(spoof_ics) out_of_bounds = (not (lb <= bio_ic <= ub)) failures += out_of_bounds fail_perc = failures/float(it+1) print it,"bio_ic:", bio_ic, "spoof_ci: (%s,%s)" % (lb, ub), "*" * out_of_bounds,"failures:","%1.2f" % fail_perc stop_time = time.time() print "total time:", stop_time - start_time return spoofs
def results_of_analyze_bio_motifs(results): # IC Ls = np.array([len(getattr(Escherichia_coli,tf)[0]) for tf in Escherichia_coli.tfs]) Ls_choose_2 = np.array([choose(L,2) for L in Ls]) bio_ics = np.array([motif_ic(getattr(Escherichia_coli,tf)) for tf in Escherichia_coli.tfs]) sim_ics = np.array([results[tf][0] for tf in Escherichia_coli.tfs]) sim_ic_errs = np.array([1.96*results[tf][1] for tf in Escherichia_coli.tfs]) bio_ics_norm = bio_ics/Ls sim_ics_norm = sim_ics/Ls sim_ic_norm_errs = sim_ic_errs/Ls bio_ginis = np.array([motif_gini(getattr(Escherichia_coli,tf)) for tf in Escherichia_coli.tfs]) sim_ginis = np.array([results[tf][2] for tf in Escherichia_coli.tfs]) sim_gini_errs = np.array([1.96*results[tf][3] for tf in Escherichia_coli.tfs]) bio_mis_norm = np.array([total_motif_mi(getattr(Escherichia_coli,tf))/choose(L,2) for tf,L in zip(Escherichia_coli.tfs,Ls)]) sim_mis_norm = np.array([results[tf][4]/choose(L,2) for tf,L in zip(Escherichia_coli.tfs,Ls)]) sim_mis_norm_errs = np.array([1.96*results[tf][5]/choose(L,2) for tf,L in zip(Escherichia_coli.tfs,Ls)]) plt.subplot(1,4,1) plt.errorbar(bio_ics,sim_ics, yerr=sim_ic_errs,fmt='o') plt.plot([0,20],[0,20]) plt.xlabel("IC") plt.subplot(1,4,2) plt.errorbar(bio_ics_norm,sim_ics_norm, yerr=sim_ic_norm_errs,fmt='o') plt.plot([0,2],[0,2]) plt.xlabel("IC/base") plt.subplot(1,4,3) plt.errorbar(bio_ginis,sim_ginis, yerr=sim_gini_errs,fmt='o') plt.plot([0,1],[0,1]) plt.xlabel("Gini coefficient") plt.subplot(1,4,4) plt.errorbar(bio_mis_norm,sim_mis_norm, yerr=sim_mis_norm_errs,fmt='o') plt.plot([0,0.5],[0,0.5]) plt.xlabel("MI/pair") print "IC:", pearsonr(bio_ics, sim_ics) print "normalized IC:", pearsonr(bio_ics_norm, sim_ics_norm) print "Gini:", pearsonr(bio_ginis, sim_ginis) print "normalized MI:", pearsonr(bio_mis_norm, sim_mis_norm)
def tfdf_experiment(replicates=1000,delta_ic=0.1,tolerance=10**-5): genomes = set(tfdf['genome_accession']) results_dict = defaultdict(lambda:defaultdict(dict)) for genome_idx,genome in enumerate(genomes): print "genome:",genome, genome_idx,len(genomes) tfs = set(tfdf[tfdf['genome_accession'] == genome]['TF']) for tf_idx,tf in enumerate(tfs): if not type(tf) is str: continue print "tf:",tf,tf_idx,len(tfs) print genome,tf bio_motif = extract_tfdf_sites(genome,tf) if len(bio_motif) < 10: print "skipping:" continue tf_name = genome + "_" + tf n,L = motif_dimensions(bio_motif) print "dimensions:",n,L bio_ic = motif_ic(bio_motif) bio_gini = motif_gini(bio_motif) bio_mi = total_motif_mi(bio_motif) results_dict[tf_name]["bio"]["motif_ic"] = bio_ic results_dict[tf_name]["bio"]["motif_gini"] = bio_gini results_dict[tf_name]["bio"]["total_motif_mi"] = bio_mi correction_per_col = 3/(2*log(2)*n) desired_ic = bio_ic + L * correction_per_col t = time.time() beta = find_beta_for_mean_motif_ic(n,L,desired_ic,tolerance=tolerance) beta_time = time.time() - t print "beta, time:",beta, beta_time print "maxent sampling" maxent = maxent_motifs_with_ic(n,L,bio_ic,replicates,beta=beta) print "uniform sampling" uniform = uniform_motifs_with_ic(n,L,bio_ic,replicates,epsilon=delta_ic,beta=beta) #print "envelope sampling" #envelope = envelope_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta) for spoof_name in "maxent uniform".split(): spoofs = eval(spoof_name) for motif_statname in "motif_ic motif_gini total_motif_mi".split(): print "recording results for:",spoof_name,motif_statname motif_stat = eval(motif_statname) results_dict[tf_name][spoof_name][motif_statname] = map(motif_stat,spoofs) return results_dict
def spoof_motif_ar(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4): n = len(motif) L = len(motif[0]) copies = 10*n if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1)) print "sigma:", sigma bio_ic = motif_ic(motif) matrix = sample_matrix(L, sigma) mu = approx_mu(matrix, copies=10*n, G=5*10**6) print "mu:", mu def f(Ne): motifs = [sample_motif_ar(matrix, mu, Ne, n) for i in trange(trials)] return mean(map(motif_ic,motifs)) - bio_ic x0 = 2 print "Ne guess:", x0 Ne = bisect_interval_noisy(f,x0=x0,iterations=100,lb=1, verbose=False,w=0.5) print "Ne:",Ne return [sample_motif_ar(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def Ne_from_motif(bio_motif,interp_rounds,iterations=50000): """Given a motif, return Ne that matches mean IC""" bio_ic = motif_ic(bio_motif) n = len(bio_motif) L = len(bio_motif[0]) matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)] print len(matrix) def f(Ne,iterations=iterations): print "Ne",Ne _,chain = sella_hirsch_mh(matrix=matrix,n=n,Ne=Ne,iterations=iterations,init='ringer') return mean(map(motif_ic,chain[iterations/2:])) - bio_ic # lo,hi = 1,5 # data = [] # for _ in xrange(interp_rounds): # guess = (lo + hi)/2.0 # y = f(guess) # print lo,hi,guess,y # data.append((guess,y)) # if y > 0: # hi = guess # else: # lo = guess # return data Ne_min = 1 Ne_max = 5 while f(Ne_max) < 0: print "increasing Ne max" Ne_max *= 2 xs, ys= transpose([(Ne,f(Ne)) for Ne in np.linspace(Ne_min,Ne_max,interp_rounds)]) # now find an interpolant. We desire smallest sigma of gaussian # interpolant such that function has at most one inflection point interp_sigmas = np.linspace(0.01,1,100) interps = [gaussian_interp(xs,ys,sigma=s) for s in interp_sigmas] for i,(sigma, interp) in enumerate(zip(interp_sigmas,interps)): print i,sigma if num_inflection_points(map(interp,np.linspace(Ne_min,Ne_max,100))) == 1: "found 1 inflection point" break print sigma Ne = bisect_interval(interp,Ne_min,Ne_max) return Ne
def moran_process(N=1000,turns=10000,init=sample_species,mutate=mutate,fitness=fitness,pop=None): if pop is None: pop = [(lambda spec:(spec,fitness(spec)))(sample_species()) for _ in trange(N)] hist = [] for turn in xrange(turns): fits = [f for (s,f) in pop] #print fits birth_idx = inverse_cdf_sample(range(N),fits,normalized=False) death_idx = random.randrange(N) #print birth_idx,death_idx mother,f = pop[birth_idx] daughter = mutate(mother) #print "mutated" pop[death_idx] = (daughter,fitness(daughter)) mean_fits = mean(fits) hist.append((f,mean_fits)) if turn % 10 == 0: mean_dna_ic = mean([motif_ic(sites,correct=False) for ((sites,eps),_) in pop]) mean_rec_h = mean([h_np(boltzmann(eps)) for ((dna,eps),_) in pop]) print turn,"sel_fit:",f,"mean_fit:",mean_fits,"mean_dna_ic:",mean_dna_ic,"mean_rec_h:",mean_rec_h return pop