def make_ecoli_df():
    """Summarise every E. coli TF motif as one row of a pandas DataFrame.

    For each name in ``Escherichia_coli.tfs`` the site collection is fetched
    with ``getattr(Escherichia_coli, tf)`` and these columns are filled:

    * ``L``            -- length of the first site
    * ``n``            -- number of sites
    * ``sigma``        -- ``mean`` of ``sd`` applied to each entry of
                          ``make_pssm(sites)``
    * ``motif_ic``     -- ``motif_ic(sites)``
    * ``info_density`` -- ``motif_ic(sites)`` divided by ``L``

    Returns:
        A ``pd.DataFrame`` indexed by ``Escherichia_coli.tfs``.
    """
    Ls = []
    ns = []
    sigmas = []
    motif_ics = []
    motif_ics_per_base = []
    for tf in Escherichia_coli.tfs:
        sites = getattr(Escherichia_coli, tf)
        L = len(sites[0])
        # Compute the IC once; it feeds both the raw and the per-base column.
        ic = motif_ic(sites)
        Ls.append(L)
        ns.append(len(sites))
        sigmas.append(mean(map(sd, make_pssm(sites))))
        motif_ics.append(ic)
        motif_ics_per_base.append(ic / float(L))
    df = pd.DataFrame(
        {
            "L": Ls,
            "n": ns,
            "sigma": sigmas,
            "motif_ic": motif_ics,
            "info_density": motif_ics_per_base
        },
        index=Escherichia_coli.tfs)
    return df
Пример #2
0
def match_ic_mi(N,
                L,
                des_ic,
                des_mi,
                iterations=50000,
                take_stock=None,
                eta=0.01,
                alpha=1,
                beta=0):
    if take_stock is None:
        take_stock = int((N * L) * log(N * L))
    x = random_motif(L, N)
    xs = [None] * iterations
    ics = [0.0] * iterations
    mis = [0.0] * iterations
    alphas = [0.0] * iterations
    betas = [0.0] * iterations
    ic = motif_ic(x)
    mi = total_motif_mi(x)
    accepts = 0
    for i in xrange(iterations):
        # if i == iterations/2:
        #     eta *= 0.1
        xp = mutate_motif(x)
        icp = motif_ic(xp)
        mip = total_motif_mi(xp)
        log_y = (alpha * ic + beta * mi)
        log_yp = (alpha * icp + beta * mip)
        if log(random.random()) < log_yp - log_y:
            accepts += 1
            x = xp
            ic = icp
            mi = mip
        ics[i] = (ic)
        mis[i] = (mi)
        xs[i] = (x)
        #print sum(site.count("A") for site in x)

        alphas[i] = (alpha)
        betas[i] = (beta)
        if i > 0 and i % take_stock == 0:
            if i < iterations / 10:
                mean_ic = mean(ics[i - take_stock:i])
                mean_mi = mean(mis[i - take_stock:i])
                alpha += eta * (des_ic - mean_ic) * exp(
                    -i / (10 * float(iterations)))
                beta += eta * (des_mi - mean_mi) * exp(
                    -i / (10 * float(iterations)))
            else:
                mean_ic = mean(ics[i - take_stock:i])
                mean_mi = mean(mis[i - take_stock:i])
                alpha = poly1d(polyfit(ics[:i], alphas[:i], 1))(des_ic)
                beta = poly1d(polyfit(mis[:i], betas[:i], 1))(des_mi)
            fmt_string = (
                "mean ic: % 1.2f, mean mi: % 1.2f, alpha: % 1.2f, beta: % 1.2f"
                % (mean_ic, mean_mi, alpha, beta))
            print i, "AR:", accepts / (i + 1.0), fmt_string
    return xs, ics, mis, alphas, betas
Пример #3
0
def spoof_pmotifs(motif, num_motifs=10, trials=1):
    """Sample ``pmotif`` motifs whose IC is tuned to match ``motif``.

    The parameter ``p`` of ``pmotif(n, L, p)`` is chosen by evaluating the
    negated mean IC excess on a 100-point grid over [0, 0.75], smoothing it
    with ``kde_regress`` and locating a root with ``bisect_interval``.

    Args:
        motif: reference motif; its length, site length and ``motif_ic``
            define the target.
        num_motifs: number of motifs to return.
        trials: number of ``pmotif`` samples averaged per grid point.

    Returns:
        A list of ``num_motifs`` motifs drawn from ``pmotif(n, L, p)``.
    """
    n = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)

    def f(p):
        # Negated mean difference between sampled IC and the target IC.
        return -mean(
            motif_ic(pmotif(n, L, p)) - des_ic for i in range(trials))

    lb = 0
    ub = 0.75
    xs = np.linspace(lb, ub, 100)
    ys = map(f, xs)
    fhat = kde_regress(xs, ys)
    p = bisect_interval(fhat, lb, ub, verbose=False, tolerance=10**-3)
    # The original read "or _ in xrange(...)", which built a one-element
    # list instead of num_motifs samples.
    return [pmotif(n, L, p) for _ in xrange(num_motifs)]
Пример #4
0
def flux_prob(motif, a, b, trials=10000):
    """Count how single mutations move the motif's IC relative to [a, b).

    ``mutate_motif(motif)`` is applied ``trials`` times, always to the
    original ``motif``, and each mutant's ``motif_ic`` is classified.

    Args:
        motif: motif to mutate.
        a: lower bound of the interval (inclusive).
        b: upper bound of the interval (exclusive).
        trials: number of independent mutations to try.

    Returns:
        ``(lesser, same, greater)``: raw counts (not probabilities) of
        mutants with IC below ``a``, inside ``[a, b)``, and at or above ``b``.
    """
    lesser = 0
    same = 0
    greater = 0
    for i in trange(trials):
        icp = motif_ic(mutate_motif(motif))
        if icp < a:
            lesser += 1
        elif a <= icp < b:
            same += 1
        else:
            greater += 1
    return lesser, same, greater
def spoof_motif_cftp(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-2,verbose=False):
    n = len(motif)
    L = len(motif[0])
    copies = 10*n
    if sigma is None: sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    matrix = sample_matrix(L, sigma)
    mu = approx_mu(matrix, copies=10*n, G=5*10**6)
    print "mu:", mu
    def f(Ne):
        motifs = [sample_motif_cftp(matrix, mu, Ne, n, verbose=verbose)
                  for i in trange(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    # lb = 1
    # ub = 10
    # while f(ub) < 0:
    #     ub *= 2
    #     print ub
    x0s = [2,10]#(lb + ub)/2.0
    # print "choosing starting seed for Ne"
    # fs = map(lambda x:abs(f(x)),x0s)
    # print "starting values:",x0s,fs
    # x0 = x0s[argmin(fs)]
    # print "chose:",x0
    # Ne = bisect_interval_noisy_ref(f,x0,lb=1,verbose=True)
    Ne = log_regress_spec2(f,x0s,tol=Ne_tol)
    print "Ne:",Ne
    return [sample_motif_cftp(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def interpret_main_experiment(results_dict,named_fs=None):
    if named_fs is None:
        named_fs = [(fitness,"fitness"),(lambda org:motif_ic(extract_sites(org)),"Motif IC"),
                    (lambda org:total_motif_mi(extract_sites(org)),"Motif MI")]
    rec_muts,site_muts = map(lambda x:sorted(set(x)),transpose(results_dict.keys()))
    fs,names = transpose(named_fs)
    subplot_dimension = ceil(sqrt(len(fs)))
    for idx,f in enumerate(fs):
        mat = np.zeros((len(rec_muts),len(site_muts)))
        for i,rec_mut in enumerate(sorted(rec_muts)):
            for j,site_mut in enumerate(sorted(site_muts)):
                pop,hist = results_dict[(rec_mut,site_mut)]
                mat[i,j] = mean([f(x) for x,fit in pop])
                print i,j,mat[i,j]
        plt.subplot(subplot_dimension,subplot_dimension,idx)
        plt.imshow(mat,interpolation='none')
        plt.xticks(range(len(site_muts)),map(str,site_muts))
        plt.yticks(range(len(rec_muts)),map(str,rec_muts))
        #plt.yticks(rec_muts)
        plt.xlabel("site mutation rate")
        plt.ylabel("rec mutation rate")
        plt.colorbar()
        title = names[idx]
        plt.title(title)
    plt.show()
def Ne_scan(sigma, L, copies, trials=1, n=100, max_Ne=10, Ne_steps=100):
    """Plot observed and predicted motif statistics as Ne is swept.

    ``Ne`` runs over ``np.linspace(1, max_Ne, Ne_steps)``.  Panel 1 plots
    the mean sampled IC, ``expected_ic``, ``expected_occupancy`` and
    ``mismatch_probability`` against Ne; panels 2-4 plot those predictions
    against each other.

    Args:
        sigma, L, copies: forwarded to the sampling and prediction helpers.
            (The original overwrote ``sigma`` with 1, silently ignoring the
            caller's value; it is now honoured.)
        trials: motifs sampled per Ne value for the observed IC.
        n: forwarded to ``sample_motif``.
        max_Ne, Ne_steps: define the Ne grid.
    """
    Ne_range = np.linspace(1, max_Ne, Ne_steps)
    plt.subplot(1, 4, 1)
    obs_ics = map(
        lambda Ne: mean(
            motif_ic(sample_motif(sigma=sigma, Ne=Ne, L=L, n=n, copies=copies))
            for _ in range(trials)), Ne_range)
    pred_ics = map(lambda Ne: expected_ic(sigma, Ne, L, copies), Ne_range)
    occs = map(lambda Ne: expected_occupancy(sigma, Ne, L, copies), Ne_range)
    mismatches = map(lambda Ne: mismatch_probability(sigma, Ne, L, copies),
                     Ne_range)
    plt.plot(Ne_range, obs_ics)
    plt.plot(Ne_range, pred_ics)
    plt.plot(Ne_range, occs)
    plt.plot(Ne_range, mismatches)
    plt.subplot(1, 4, 2)
    plt.plot(mismatches, pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1, 4, 3)
    plt.plot(occs, pred_ics)
    plt.xlabel("Occupancy")
    plt.ylabel("IC")
    plt.subplot(1, 4, 4)
    plt.plot(mismatches, occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def sample_motif_cftp_param_study():
    """Examine dependence of IC on sigma, Ne.

    Samples motifs with ``sample_motif_cftp`` on a 10 x 10 grid of
    (sigma, Ne), draws the mean IC as a filled contour plot, then uses a 2-D
    interpolant of that surface to solve for the Ne that reproduces each
    E. coli motif's IC at that motif's sigma.  The resulting
    (sigma, Ne) points are scattered on top of the contour plot.
    """
    grid_points = 10
    sigmas = np.linspace(0.5,10,grid_points)
    Nes = np.linspace(1,10,grid_points)
    trials = 3
    n = 20
    L = 10
    def f(sigma, Ne):
        matrix = sample_matrix(L, sigma)
        mu = approx_mu(matrix, 10*n)
        return motif_ic(sample_motif_cftp(matrix, mu, Ne, n))
    # Rows index Ne, columns index sigma.
    ics = [[(mean(f(sigma, Ne) for _ in range(trials)))
            for sigma in sigmas] for Ne in tqdm(Nes,desc="ic grid")]
    plt.contourf(sigmas, Nes,ics)
    plt.colorbar()
    # This definition was commented out in the original, leaving
    # bio_motifs undefined in this function.
    bio_motifs = [getattr(Escherichia_coli,tf) for tf in Escherichia_coli.tfs]
    bio_sigmas = [sigma_from_matrix(pssm_from_motif(motif,pc=1))
                  for motif in bio_motifs]
    bio_ics = [motif_ic(motif) for motif in bio_motifs]
    interp = interp2d(sigmas,Nes,ics)
    bio_Nes = [bisect_interval(lambda Ne:interp(show(bio_sigma),Ne)-bio_ic,0,20)
               for bio_sigma, bio_ic in zip(bio_sigmas,bio_ics)]
    # NOTE(review): the original ended with the unfinished call
    # `plt.scatter(sigm)`; overlaying the fitted (sigma, Ne) points is the
    # presumed intent -- confirm.
    plt.scatter(bio_sigmas, bio_Nes)
def interpret_main_experiment(results_dict, named_fs=None):
    if named_fs is None:
        named_fs = [(fitness, "fitness"),
                    (lambda org: motif_ic(extract_sites(org)), "Motif IC"),
                    (lambda org: total_motif_mi(extract_sites(org)),
                     "Motif MI")]
    rec_muts, site_muts = map(lambda x: sorted(set(x)),
                              transpose(results_dict.keys()))
    fs, names = transpose(named_fs)
    subplot_dimension = ceil(sqrt(len(fs)))
    for idx, f in enumerate(fs):
        mat = np.zeros((len(rec_muts), len(site_muts)))
        for i, rec_mut in enumerate(sorted(rec_muts)):
            for j, site_mut in enumerate(sorted(site_muts)):
                pop, hist = results_dict[(rec_mut, site_mut)]
                mat[i, j] = mean([f(x) for x, fit in pop])
                print i, j, mat[i, j]
        plt.subplot(subplot_dimension, subplot_dimension, idx)
        plt.imshow(mat, interpolation='none')
        plt.xticks(range(len(site_muts)), map(str, site_muts))
        plt.yticks(range(len(rec_muts)), map(str, rec_muts))
        #plt.yticks(rec_muts)
        plt.xlabel("site mutation rate")
        plt.ylabel("rec mutation rate")
        plt.colorbar()
        title = names[idx]
        plt.title(title)
    plt.show()
Пример #10
0
def moran_process(N=1000,
                  turns=10000,
                  init=sample_species,
                  mutate=mutate,
                  fitness=fitness,
                  pop=None):
    if pop is None:
        pop = [(lambda spec: (spec, fitness(spec)))(sample_species())
               for _ in trange(N)]
    hist = []
    for turn in xrange(turns):
        fits = [f for (s, f) in pop]
        #print fits
        birth_idx = inverse_cdf_sample(range(N), fits, normalized=False)
        death_idx = random.randrange(N)
        #print birth_idx,death_idx
        mother, f = pop[birth_idx]
        daughter = mutate(mother)
        #print "mutated"
        pop[death_idx] = (daughter, fitness(daughter))
        mean_fits = mean(fits)
        hist.append((f, mean_fits))
        if turn % 10 == 0:
            mean_dna_ic = mean(
                [motif_ic(sites, correct=False) for ((sites, eps), _) in pop])
            mean_rec_h = mean(
                [h_np(boltzmann(eps)) for ((dna, eps), _) in pop])
            print turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic, "mean_rec_h:", mean_rec_h
    return pop
Пример #11
0
def main(prok_motifs, euk_motifs, filename='motif_summary_stats.eps'):
    """Draw a seaborn pair plot of summary statistics for two motif sets.

    Per motif the table holds its domain label, log10 of the number of
    sites, site length, ``motif_ic`` and ``motif_gini``.  Axis limits and
    power-of-ten tick labels are then adjusted and the figure is handed to
    ``maybesave(filename)``.

    Args:
        prok_motifs: list of prokaryotic motifs.
        euk_motifs: list of eukaryotic motifs (placed first in the table).
        filename: forwarded to ``maybesave``.
    """
    sns.set(style="darkgrid", color_codes=True)
    motifs = euk_motifs + prok_motifs
    domain_labels = (["Eukaryotic"] * len(euk_motifs) +
                     ["Prokaryotic"] * len(prok_motifs))
    df = pd.DataFrame()
    df['Domain'] = domain_labels
    df['N'] = [log(len(m)) / log(10) for m in motifs]
    df['L (bp)'] = [len(m[0]) for m in motifs]
    df['IC (bits)'] = [motif_ic(m) for m in motifs]
    df['IGC'] = [motif_gini(m) for m in motifs]
    pg = sns.pairplot(df,
                      hue='Domain',
                      markers='s o'.split(),
                      palette='cubehelix')
    # Clamp the last column's x-range and the last row's y-range.
    for k in range(4):
        pg.axes[k][3].set_xlim(-0.01, 0.6)
        pg.axes[3][k].set_ylim(-0.01, 0.6)
    exponents = range(1, 5)
    decade_labels = ["$10^%i$" % e for e in exponents]
    pg.axes[0][0].set_yticks(exponents)
    pg.axes[0][0].set_yticklabels(decade_labels)
    pg.axes[3][0].set_xticks(exponents)
    pg.axes[3][0].set_xticklabels(decade_labels)
    maybesave(filename)
def spoof_motifs(motif,
                 num_motifs=10,
                 trials=1,
                 sigma=None,
                 Ne_tol=10**-4,
                 double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10 * N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print "sigma:", sigma
    bio_ic = motif_ic(motif)

    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [
            sample_motif(epsilon, Ne, L, copies, n, ps=ps)
            for i in range(trials)
        ]
        return mean(map(motif_ic, motifs)) - bio_ic

    Ne = log_regress_spec2(f, [1, 10], tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def Ne_scan(sigma,L,copies,trials=1,n=100,max_Ne=10,Ne_steps=100):
    """Plot observed and predicted motif statistics as Ne is swept.

    ``Ne`` runs over ``np.linspace(1, max_Ne, Ne_steps)``.  Panel 1 plots
    the mean sampled IC, ``expected_ic``, ``expected_occupancy`` and
    ``mismatch_probability`` against Ne; panels 2-4 plot those predictions
    against each other.

    Args:
        sigma, L, copies: forwarded to the sampling and prediction helpers.
            (The original overwrote ``sigma`` with 1, silently ignoring the
            caller's value; it is now honoured.)
        trials: motifs sampled per Ne value for the observed IC.
        n: forwarded to ``sample_motif``.
        max_Ne, Ne_steps: define the Ne grid.
    """
    Ne_range = np.linspace(1,max_Ne,Ne_steps)
    plt.subplot(1,4,1)
    obs_ics = map(lambda Ne:mean(motif_ic(sample_motif(sigma=sigma,Ne=Ne,L=L,n=n,copies=copies))
                                   for _ in range(trials)), Ne_range)
    pred_ics = map(lambda Ne:expected_ic(sigma,Ne,L,copies),Ne_range)
    occs = map(lambda Ne:expected_occupancy(sigma,Ne,L,copies),Ne_range)
    mismatches = map(lambda Ne:mismatch_probability(sigma,Ne,L,copies),Ne_range)
    plt.plot(Ne_range,obs_ics)
    plt.plot(Ne_range,pred_ics)
    plt.plot(Ne_range,occs)
    plt.plot(Ne_range,mismatches)
    plt.subplot(1,4,2)
    plt.plot(mismatches,pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1,4,3)
    plt.plot(occs,pred_ics)
    plt.xlabel("Occupancy")
    plt.ylabel("IC")
    plt.subplot(1,4,4)
    plt.plot(mismatches,occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def sigma_scan(Ne,L,copies,trials=1,n=100,sigma_steps=100,max_sigma=10):
    """Plot observed and predicted motif statistics as sigma is swept.

    ``sigma`` runs over ``np.linspace(1, max_sigma, sigma_steps)``.  Panel 1
    plots, against sigma: mean sampled IC, ``expected_ic``,
    ``expected_occupancy``, ``mismatch_probability``, ``mu_from``,
    ``approx_mu`` and ``log(mean_Zb)``.  Panels 2-4 plot the predictions
    against each other.

    Args:
        Ne, L, copies: forwarded to the sampling and prediction helpers.
        trials: motifs sampled per sigma value for the observed IC.
        n: forwarded to ``sample_motif``.
        sigma_steps, max_sigma: define the sigma grid.

    Reads the module-level name ``G``.
    """
    sigma_range = np.linspace(1,max_sigma,sigma_steps)
    # The original assigned a local `sigma = 1` here that nothing read:
    # every lambda below binds its own `sigma`.
    plt.subplot(1,4,1)
    obs_ics = map(lambda sigma:mean(motif_ic(sample_motif(sigma=sigma,Ne=Ne,L=L,n=n,copies=copies))
                                   for _ in range(trials)), sigma_range)
    pred_ics = map(lambda sigma:expected_ic(sigma,Ne,L,copies),sigma_range)
    occs = map(lambda sigma:expected_occupancy(sigma,Ne,L,copies),sigma_range)
    mismatches = map(lambda sigma:mismatch_probability(sigma,Ne,L,copies),sigma_range)
    mus = map(lambda sigma:mu_from(G,sigma,L,copies),sigma_range)
    approx_mus = map(lambda sigma:approx_mu(G,sigma,L,copies),sigma_range)
    mean_log_Zbs = map(lambda sigma:log(mean_Zb(sigma,L)),sigma_range)
    plt.plot(sigma_range,obs_ics)
    plt.plot(sigma_range,pred_ics)
    plt.plot(sigma_range,occs)
    plt.plot(sigma_range,mismatches)
    plt.plot(sigma_range,mus)
    plt.plot(sigma_range,approx_mus)
    plt.plot(sigma_range,mean_log_Zbs)
    plt.subplot(1,4,2)
    plt.plot(mismatches,pred_ics)
    plt.xlabel("Mismatches")
    plt.ylabel("IC")
    plt.subplot(1,4,3)
    plt.plot(pred_ics,occs)
    plt.xlabel("IC")
    plt.ylabel("Occupancy")
    plt.subplot(1,4,4)
    plt.plot(mismatches,occs)
    plt.xlabel("Mismatches")
    plt.ylabel("Occs")
def biological_experiment(replicates=1000):
    delta_ic = 0.1
    results_dict = defaultdict(lambda:defaultdict(dict))
    for tf_idx,tf in enumerate(Escherichia_coli.tfs):
        print tf,"(%s/%s)" % (tf_idx,len(Escherichia_coli.tfs))
        bio_motif = getattr(Escherichia_coli,tf)
        n,L = motif_dimensions(bio_motif)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        bio_mi = total_motif_mi(bio_motif)
        results_dict[tf]["bio"]["motif_ic"] = bio_ic
        results_dict[tf]["bio"]["motif_gini"] = bio_gini
        results_dict[tf]["bio"]["total_motif_mi"] = bio_mi
        beta = find_beta_for_mean_motif_ic(n,L,bio_ic)
        maxent = maxent_motifs_with_ic(n,L,bio_ic,replicates)
        #maxent_truncated = maxent_truncated_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta)
        uniform = uniform_motifs_with_ic(n,L,bio_ic,delta_ic,replicates)
        #chain_spoofs = chain_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta)
        #for spoof_name in "maxent maxent_truncated envelope".split():
        for spoof_name in "maxent uniform".split():
            spoofs = eval(spoof_name)
            for motif_statname in "motif_ic motif_gini total_motif_mi".split():
                motif_stat = eval(motif_statname)
                results_dict[tf][spoof_name][motif_statname] = map(motif_stat,spoofs)
        #all_spoofs = [maxent_spoofs,maxent_truncated_spoofs,envelope_spoofs]#,chain_spoofs]
        # print "IC:",bio_ic,map(lambda xs:val_in_coverage(bio_ic,xs),mmap(motif_ic,all_spoofs))
        # print "Gini:",bio_gini,map(lambda xs:val_in_coverage(bio_gini,xs),mmap(motif_gini,all_spoofs))
        # print "MI:",bio_mi,map(lambda xs:val_in_coverage(bio_mi,xs),mmap(total_motif_mi,all_spoofs))
    return results_dict
def plot_results_dict_gini_qq(results_dict,filename=None):
    bios = []
    maxents = []
    uniforms = []
    for i,k in enumerate(results_dict):
        g1,g2,tf = k.split("_")
        genome = g1 + "_" + g2
        bio_motif = extract_tfdf_sites(genome,tf)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        d = results_dict[k]
        bios.append(bio_gini)
        maxents.append(mean(d['maxent']['motif_gini']))
        uniforms.append(mean(d['uniform']['motif_gini']))
    plt.scatter(bios,maxents,label='ME')
    plt.scatter(bios,uniforms,label='TURS',color='g')
    minval = min(bios+maxents+uniforms)
    maxval = max(bios+maxents+uniforms)
    plt.plot([minval,maxval],[minval,maxval],linestyle='--')
    plt.xlabel("Observed Gini Coefficient")
    plt.ylabel("Mean Sampled Gini Coefficient")
    plt.legend(loc='upper left')
    print "bio vs maxent:",pearsonr(bios,maxents)
    print "bio vs uniform:",pearsonr(bios,uniforms)
    maybesave(filename)
def gini_of_LGEs_experiment(iterations=50000,
                            Ne=200,
                            sigma=1,
                            lge_replicates=20):
    """Collect LGE-evolved motifs together with IC-matched maxent ensembles.

    (Original question: do motifs evolved under LGEs show more or less Gini
    coefficient than IC-matched maxent counterparts?)

    For each replicate, ``sella_hirsch_mh`` is run with n = L = 16 and the
    last element of its chain is taken as the evolved motif; 1000 maxent
    motifs with the same IC are then drawn with
    ``maxent_sample_motifs_with_ic``.

    Args:
        iterations, Ne, sigma: forwarded to ``sella_hirsch_mh``.
        lge_replicates: number of independent runs.

    Returns:
        ``(lge_matrices, lge_motifs, maxent_motifss)``, three parallel lists.

    Reads the module-level name ``G``.
    """
    n = 16
    L = 16
    maxent_replicates = 1000
    lge_matrices, lge_motifs, maxent_motifss = [], [], []
    for _ in range(lge_replicates):
        matrix, chain = sella_hirsch_mh(
            Ne=Ne, n=n, L=L, G=G, sigma=sigma,
            iterations=iterations, init='ringer')
        final_motif = chain[-1]
        matched = maxent_sample_motifs_with_ic(
            n, L, motif_ic(final_motif), replicates=maxent_replicates)
        lge_matrices.append(matrix)
        lge_motifs.append(final_motif)
        maxent_motifss.append(matched)
    return lge_matrices, lge_motifs, maxent_motifss
Пример #18
0
def moran_process(N=1000,turns=10000,mean_site_muts=1,mean_rec_muts=1,init=sample_species,mutate=mutate,
                  fitness=fitness,pop=None,print_modulus=100,hist_modulus=10):
    #ringer = (np.array([1]+[0]*(K-1)),sample_eps())
    if pop is None:
        pop = [(lambda spec:(spec,fitness(spec)))(init())
               for _ in trange(N)]
    # ringer = make_ringer()
    # pop[0] = (ringer,fitness(ringer))
    #pop = [(ringer,fitness(ringer)) for _ in xrange(N)]
    site_mu = min(1/float(n*L) * mean_site_muts,1)
    rec_mu = min(1/float(K) * mean_rec_muts,1)
    hist = []
    for turn in xrange(turns):
        fits = [f for (s,f) in pop]
        #print fits
        birth_idx = inverse_cdf_sample(range(N),fits,normalized=False)
        if birth_idx is None:
            return pop
        death_idx = random.randrange(N)
        #print birth_idx,death_idx
        mother,f = pop[birth_idx]
        daughter = mutate(mother,site_mu,rec_mu)
        #print "mutated"
        pop[death_idx] = (daughter,fitness(daughter))
        mean_fits = mean(fits)
        #hist.append((f,mean_fits))
        if turn % hist_modulus == 0:
            mean_dna_ic = mean([motif_ic(sites,correct=False) for ((sites,eps),_) in pop])
            mean_rec = mean([recognizer_promiscuity(x) for (x,f) in pop])
            mean_recced = mean([sites_recognized((dna,rec)) for ((dna,rec),_) in pop])
            hist.append((turn,f,mean_fits,mean_dna_ic,mean_rec,mean_recced))
            if turn % print_modulus == 0:
                print turn,"sel_fit:",f,"mean_fit:",mean_fits,"mean_dna_ic:",mean_dna_ic,"mean_rec_prom:",mean_rec
    return pop,hist
Пример #19
0
def L_vs_sigma_plot(filename=None, with_bio=False):
    """Plot motif length L against sigma with two reference curves.

    Draws ``crit_L`` over sigma in [0.1, 10] ("Binding Transition") and a
    dashed horizontal line at ``log(G, 2) / 2`` ("Info Theory Threshold").
    With ``with_bio`` the motifs from ``extract_motif_object_from_tfdf()``
    are overlaid as (sigma, L) points, sigma being the mean of ``sd`` over
    ``make_pssm(motif)``.

    Args:
        filename: forwarded to ``maybesave``.
        with_bio: whether to overlay the biological motifs.

    Reads the module-level name ``G``.
    """
    if with_bio:
        tfdf = extract_motif_object_from_tfdf()
        motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
        # Only L and sigma are plotted; the original also computed several
        # other per-motif statistics here that were never used.
        Ls = [len(motif[0]) for motif in motifs]
        sigmas = [mean(map(sd, make_pssm(motif))) for motif in motifs]
    min_sigma = 0.1
    max_sigma = 10
    plt.xlim(0, max_sigma)
    plt.ylim(0, 60)
    plt.plot(*pl(crit_L, np.linspace(min_sigma, max_sigma, 1000)),
             label="Binding Transition")
    plt.plot([min_sigma, max_sigma],
             [log(G, 2) / 2, log(G, 2) / 2],
             linestyle='--',
             label="Info Theory Threshold")
    if with_bio:
        plt.scatter(sigmas, Ls, label="Biological Motifs")
    plt.xlabel("sigma")
    plt.ylabel("L")
    plt.legend()
    maybesave(filename)
Пример #20
0
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample one motif per state of an ``evo_ic_sample_motif2`` chain.

    The chain is run for the dimensions and IC of ``motif``; each chain
    element is unpacked as ``(sigma, mu, Ne)`` and used to draw a motif via
    ``sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)``.

    Args:
        motif: reference motif.
        iterations: forwarded to ``evo_ic_sample_motif2``.
        verbose: forwarded to ``evo_ic_sample_motif2`` (the original
            hard-coded ``False`` and ignored this argument).
        theta: forwarded to ``evo_ic_sample_motif2``.

    Returns:
        ``(chain, motifs)``.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(N, L, des_ic, iterations=iterations, verbose=verbose, theta=theta)
    motifs = [sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N) for (sigma, mu, Ne) in tqdm(chain)]
    return chain, motifs
Пример #21
0
def train_pairwise_model2(motif,
                          pc=1 / 16.0,
                          decay_timescale=10000,
                          take_stock=1000,
                          eta=0.01,
                          stop_crit=0.01):
    L = len(motif[0])
    N = len(motif)
    fs = get_pairwise_freqs(motif, pc=pc)
    ws = [{(b1, b2): 0
           for (b1, b2) in dinucs} for _ in range(int(choose(L, 2)))]
    iteration = 0
    while True:
        cur_motif = [
            sample_model(ws, x0=site, iterations=10 * L)[-1] for site in motif
        ]
        current_fs = get_pairwise_freqs(cur_motif)
        sse = 0
        for w, f, cur_f in zip(ws, fs, current_fs):
            for b1, b2 in dinucs:
                delta = f[b1, b2] - cur_f[b1, b2]
                sse += delta**2
                w[b1, b2] += eta * (
                    delta)  #* exp(-iteration/float(decay_timescale))
            #sses[iteration/take_stock] = sse
        sse_per_col_pair = sse / choose(L, 2)
        print iteration, sse_per_col_pair, ws[0]['A', 'A']
        print "motif_ic:", motif_ic(cur_motif)
        if iteration > 0 and sse_per_col_pair < stop_crit:
            print "breaking:", sse, sse_per_col_pair
            break
        iteration += 1
    return ws
def spoof_motif_ref(motif,
                    num_motifs=10,
                    trials=10,
                    sigma=None,
                    Ne_tol=10**-4):
    n = len(motif)
    L = len(motif[0])
    copies = 10 * n
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)

    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [
            sample_motif(sigma, Ne, L, copies, n, ps=ps) for i in range(trials)
        ]
        return mean(map(motif_ic, motifs)) - bio_ic

    lb = 1
    ub = 2
    while f(ub) < 0:
        ub *= 2
    ub *= 2  # once more for good measure
    x0 = (lb + ub) / 2.0
    print "Ne guess:", x0
    Nes = [
        bisect_interval_noisy(f, x0=x0, tolerance=Ne_tol, lb=1)
        for i in range(3)
    ]
    Ne = mean(Nes)
    print "Nes:", Nes, Ne
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def main_experiment(motif_obj):
    """compare gini biological motifs to:
    (1) null ic-matched ensembles,and
    (2) sigma and IC matched evosims.
    Conduct evosims by matching sigma to pssm (fair?) and sweeping Ne in order to match IC.
    """
    evosim_trials = 10
    for tf in motif_obj.tfs:
        bio_motif = getattr(motif_obj, tf)
        n, L = len(bio_motif), len(bio_motif[0])
        bio_gini = motif_gini(bio_motif)
        bio_ic = motif_ic(bio_motif)
        bio_mi = total_motif_mi(bio_motif)
        ###############
        ### null ensemble stuff here
        ###############
        pssm = make_pssm(bio_motif)
        sigma = mean(map(sd, pssm))  # revisit this, see Djordjevic's paper
        # determine Ne
        Ne_ic = {}
        lo = 0
        hi = 5
        #Ne_ic[lo] =
        chain = sella_hirsch_mh_gr(matrix,
                                   Ne=5,
                                   iterations=1000,
                                   n=16,
                                   x0s=None)
        print "sigma:", sigma
        for trial in trange(evosim_trials):
            matrix = sample_matrix(L, sigma)
def moran_process(mean_rec_muts,
                  mean_site_muts,
                  N=1000,
                  turns=10000,
                  init=make_ringer2,
                  mutate=mutate,
                  fitness=fitness,
                  pop=None):
    site_mu = mean_site_muts / float(n * L)
    bd_mu = mean_rec_muts / float(L)
    if pop is None:
        pop = [(lambda spec: (spec, fitness(spec)))(init()) for _ in trange(N)]
    hist = []
    for turn in xrange(turns):
        fits = [f for (s, f) in pop]
        birth_idx = inverse_cdf_sample(range(N), fits, normalized=False)
        death_idx = random.randrange(N)
        #print birth_idx,death_idx
        mother, f = pop[birth_idx]
        daughter = mutate(mother, site_mu, bd_mu)
        #print "mutated"
        pop[death_idx] = (daughter, fitness(daughter))
        mean_fits = mean(fits)
        hist.append((f, mean_fits))
        if turn % 1000 == 0:
            mean_dna_ic = mean(
                [motif_ic(sites, correct=False) for ((bd, sites), _) in pop])
            print turn, "sel_fit:", f, "mean_fit:", mean_fits, "mean_dna_ic:", mean_dna_ic
    return pop, hist
def spoof_motifs_maxent(motif, num_motifs, verbose=False):
    n = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    if verbose:
        print "n: {} L: {} des_ic: {}".format(n, L, des_ic)
    return maxent_motifs_with_ic(n, L, des_ic, num_motifs, verbose=verbose)
def best_ic_motif(L,n,trials):
    """Return the highest-IC motif among ``trials`` random motifs.

    Args:
        L, n: forwarded to ``random_motif(L, n)``.
        trials: number of random motifs to draw.

    Returns:
        The motif with the largest ``motif_ic(motif, correct=False)``, or
        ``None`` when ``trials`` is 0.

    The original never updated ``best_ic``, so it returned the last motif
    with IC above 0 rather than the best one, and raised
    ``UnboundLocalError`` when no motif qualified.
    """
    best_ic = 0
    best_motif = None
    for i in trange(trials):
        motif = random_motif(L,n)
        cur_ic = motif_ic(motif,correct=False)
        if best_motif is None or cur_ic > best_ic:
            best_ic = cur_ic
            best_motif = motif
    return best_motif
def analyze_bio_motifs(Nes,trials=20):
    results = {}
    for tf_idx,tf in enumerate(Escherichia_coli.tfs):
        Ne = Nes[tf]
        bio_motif = getattr(Escherichia_coli,tf)
        n,L = len(bio_motif),len(bio_motif[0])
        bio_matrix = matrix_from_motif(bio_motif)
        sigma = sigma_from_matrix(bio_matrix)
        matrix_chains = [sella_hirsch_mh(n=n,L=L,sigma=sigma,Ne=Ne,init='ringer') for i in range(trials)]
        ics = [mean(map(motif_ic,chain[-1000:])) for (matrix,chain) in matrix_chains]
        ginis = [mean(map(motif_gini,chain[-1000:])) for (matrix,chain) in matrix_chains]
        mis = [mean(map(total_motif_mi,chain[-1000:])) for (matrix,chain) in matrix_chains]
        print "results for:",tf,tf_idx
        print motif_ic(bio_motif),mean(ics),sd(ics)
        print motif_gini(bio_motif),mean(ginis),sd(ginis)
        print total_motif_mi(bio_motif),mean(mis),sd(mis)
        results[tf] = (mean(ics),sd(ics),mean(ginis),sd(ginis),mean(mis),sd(mis))
    return results
Пример #28
0
def recognizer_non_linearity((sites,recognizer)):
    L = log(len(idx_of_word),4)
    motif = [w for w,i in idx_of_word.items() if recognizer[i]]
    if len(motif) == 0:
        return -1
    else:
        total_info = 2*L - log2(len(motif))
        col_info = motif_ic(motif,correct=False)
        return total_info - col_info
Пример #29
0
def spoof_motifs_uniform(motif, num_motifs, epsilon=0.1, verbose=False):
    n, L = len(motif), len(motif[0])
    desired_ic = motif_ic(motif)
    if verbose: print "starting spoof motifs uniform with:", n, L, desired_ic
    return uniform_motifs_accept_reject(n,
                                        L,
                                        desired_ic,
                                        num_motifs,
                                        epsilon,
                                        verbose=verbose)
def interpret_main_experiment(results_dict):
    taus = sorted(results_dict.keys())
    print taus
    data = [(tau,f,motif_ic(extract_sites(s)),total_motif_mi(extract_sites(s)))
            for tau in taus for (s,f) in results_dict[tau][0]]
    cols = transpose(data)
    names = "tau,f,motif_ic,total_motif_mi".split(",")
    for (i,name1),(j,name2) in choose2(list(enumerate(names))):
        xs = cols[i]
        ys = cols[j]
        print name1,name2,pearsonr(xs,ys),spearmanr(xs,ys)
def fit_motif(motif):
    """Fit (sigma, Ne, copies) so that ``expected_ic`` matches the motif IC.

    Minimises the squared difference between
    ``expected_ic(sigma, Ne, L, copies)`` and ``motif_ic(motif)`` with
    ``minimize`` (method ``'TNC'``), starting from ``(1, 2, n)`` and bounded
    to sigma in [0, 10], Ne >= 1, copies >= 1.

    Returns:
        The ``.x`` attribute of the ``minimize`` result.
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)

    def squared_error(params):
        sigma, Ne, copies = params
        return (expected_ic(sigma,Ne,L,copies)-bio_ic)**2

    start = (1,2,n)
    param_bounds = ((0,10),(1,None),(1,None))
    result = minimize(squared_error,start,bounds=param_bounds,method='TNC')
    return result.x
Пример #32
0
def main_experiment(samples=30, iterations=10000, delta_ic=0.1):
    results_dict = {}
    for tf_idx, tf in enumerate(tfdf.tfs):
        print "starting on:", tf
        motif = getattr(tfdf, tf)
        if motif_ic(motif) < 5:
            print "excluding", tf, "for low IC"
            continue
        bio_ic = motif_ic(motif)
        n = len(motif)
        L = len(motif[0])
        matrix = matrix_from_motif(motif)
        sigma = sigma_from_matrix(matrix)
        mu = approximate_mu(matrix, n, G)
        Ne = estimate_Ne(matrix, mu, n, bio_ic)
        spoofs = []
        ar = 0
        spoof_trials = 0.0
        while len(spoofs) < samples:
            spoof_trials += 1
            matrix, chain = sella_hirsch_mh(Ne=Ne,
                                            mu=mu,
                                            n=1,
                                            matrix=sample_matrix(L, sigma),
                                            init='ringer',
                                            iterations=iterations)
            spoof_motif = concat(
                [random.choice(chain[iterations / 2:]) for i in range(n)])
            if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic:
                spoofs.append(spoof_motif)
                ar += 1
            print "spoof acceptance rate:", ar / spoof_trials, len(
                spoofs), samples, spoof_trials
        #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]]
        results_dict[tf] = {
            fname: map(eval(fname), spoofs)
            for fname in "motif_ic motif_gini total_motif_mi".split()
        }
        print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs))
        print bio_ic, mean_ci(results_dict[tf]['motif_ic'])
    return results_dict
Пример #33
0
def motif_degradation_experiment():
    """what is the effect of repeatedly inferring a motif from selected sites?"""
    from motifs import Escherichia_coli
    motif = Escherichia_coli.LexA
    n = len(motif)
    matrix = matrix_from_motif(motif)
    assumed_copies = 10 * n
    mu = approximate_mu(matrix, assumed_copies, G)
    for i in range(10):
        print i, "motif ic:", motif_ic(motif)
        motif = select_sites_by_occupancy(matrix, mu, n)
        matrix = matrix_from_motif(motif)
Пример #34
0
def interpret_main_experiment(results_dict):
    taus = sorted(results_dict.keys())
    print taus
    data = [(tau, f, motif_ic(extract_sites(s)),
             total_motif_mi(extract_sites(s))) for tau in taus
            for (s, f) in results_dict[tau][0]]
    cols = transpose(data)
    names = "tau,f,motif_ic,total_motif_mi".split(",")
    for (i, name1), (j, name2) in choose2(list(enumerate(names))):
        xs = cols[i]
        ys = cols[j]
        print name1, name2, pearsonr(xs, ys), spearmanr(xs, ys)
Пример #35
0
def plot_mono_vs_di_likelihood(ll_dict = None):
    """Scatter-plot mono vs. di log-likelihoods for each TF, with annotations.

    ll_dict maps a TF name to a (mono, di) pair; when omitted it is obtained
    from likelihood_dict().  Each point is annotated with the TF name, the
    number of sites, the site width and the motif IC, and a dashed y = x
    reference line spanning the observed value range is drawn.

    NOTE(review): the values are plotted un-normalized.  If per-base
    log-likelihoods are wanted instead, divide each value by
    len(sites) * len(sites[0]) before plotting.
    """
    if ll_dict is None:
        ll_dict = likelihood_dict()
    plt.scatter(*transpose(ll_dict.values()))
    for tf, (mono, di) in ll_dict.items():
        sites = getattr(Escherichia_coli, tf)
        text = "%s\n#:%s\nw:%s\nIC:%1.2f" % (tf, len(sites), len(sites[0]), motif_ic(sites))
        plt.annotate(text, (mono, di))
    # Range of all plotted values, used as the endpoints of the y = x line.
    min_val = min(concat(ll_dict.values()))
    max_val = max(concat(ll_dict.values()))
    plt.xlabel("Mono LL")
    plt.ylabel("Di LL")
    plt.plot([min_val, max_val], [min_val, max_val], linestyle="--")
def motif_ls_sq_surface_experiment():
    """Evaluate the gradient and Hessian of expected IC at the fitted LexA parameters.

    Fits (sigma, Ne, copies) to the Escherichia_coli LexA motif with
    fit_motif, then calls compute_grad and compute_hessian on expected_ic at
    that point and eigen-decomposes the Hessian with np.linalg.eig.

    NOTE(review): nothing is returned, printed or plotted, and bio_ic,
    fit_ic, lambs, vs and orth_comp are never used after being defined --
    the body looks unfinished; confirm the intended output.
    """
    motif = Escherichia_coli.LexA
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    sigma,Ne,copies = fit_motif(motif)
    fit_ic = expected_ic(sigma,Ne,L,copies)
    # expected IC as a function of a single (sigma, Ne, copies) tuple, L held fixed
    f = lambda (sigma,Ne,copies):expected_ic(sigma,Ne,L,copies)
    gr = compute_grad(f,(sigma,Ne,copies),epsilon=0.0001)
    hess = compute_hessian(f,(sigma,Ne,copies))
    lambs, vs = np.linalg.eig(hess)
    def orth_comp(y,z):
        """Return [x, y, z] with x chosen so the vector is orthogonal to gr."""
        a,b,c = gr
        # solve a*x + b*y + c*z = 0 for x
        x = -(b*y+c*z)/a
        return np.array([x,y,z])
def interpret_estremo_chain(chain, mu=-10, Ne=5):
    """Plot several per-state diagnostics for a chain of (code, motif) pairs.

    Six series are drawn on the current axes, in this order: motif IC, motif
    MI, mean IC of ten spoof_maxent_motifs spoofs per state, mean MI of the
    same spoofs, indep_measure of the code, and log_f of each state.
    ``chain`` is traversed several times, so it must be re-iterable.
    """
    nu = Ne - 1

    def log_f(state):
        # nu times the summed log of 1 / (1 + exp(ep - mu)) over the
        # energies ep = -log(p), where p comes from pw_prob_sites.
        code, motif = state
        eps = [-log(p) for p in pw_prob_sites(motif, code)]
        return sum(nu * log(1 / (1 + exp(ep - mu))) for ep in eps)

    spoofs = [spoof_maxent_motifs(motif, 10) for code, motif in tqdm(chain)]

    chain_ics = [motif_ic(motif) for (code, motif) in tqdm(chain)]
    plt.plot(chain_ics)
    chain_mis = [motif_mi(motif) for (code, motif) in tqdm(chain)]
    plt.plot(chain_mis)
    spoof_ics = [mean(map(motif_ic, motifs)) for motifs in tqdm(spoofs)]
    plt.plot(spoof_ics)
    spoof_mis = [mean(map(motif_mi, motifs)) for motifs in tqdm(spoofs)]
    plt.plot(spoof_mis)
    independence = [indep_measure(code) for (code, motif) in tqdm(chain)]
    plt.plot(independence)
    plt.plot([log_f(state) for state in chain])
Пример #38
0
def spoof_motif(motif, T):
    """Predict statistic ``T`` for ``motif`` at an Ne fitted to the motif's IC.

    Ne is found with bisect_interval on [0.01, 5] as the value at which
    predict_stat, evaluated with the mean-IC statistic
    (mean_ic_from_rho(rho, n, L)), equals the observed motif IC.  The
    function then returns predict_stat for the caller-supplied ``T`` at
    that Ne.

    Parameters:
        motif: list of equal-length sites.
        T: statistic passed to predict_stat as its ``T`` argument.
    """
    n = len(motif)
    L = len(motif[0])
    bio_ic = motif_ic(motif)
    sigma = 2 * mean(map(sd, make_pssm(motif)))  # XXX REVISIT THIS ISSUE
    G = 5 * 10**6

    def mean_ic(rho):
        return mean_ic_from_rho(rho, n, L)

    def ic_from_Ne(Ne):
        return predict_stat(n, L, sigma, Ne, G=G, T=mean_ic)

    Ne = bisect_interval(lambda Ne: ic_from_Ne(Ne) - bio_ic, 0.01, 5)
    # Pass T (and G) by keyword: the fitting call above uses both G= and T=
    # keywords, so a fifth positional argument could bind to G rather than T.
    # Using the same G as the fit keeps the prediction consistent with Ne.
    return predict_stat(n, L, sigma, Ne, G=G, T=T)
Пример #39
0
def collapsed_moran_process(N,turns,init=sample_species,mutate=mutate,fitness=fitness,ancestor=None,modulus=100):
    """Simulate a single-lineage ("collapsed") Moran process for ``turns`` steps.

    Each turn proposes ``mutate(ancestor)`` and accepts it with probability
    (1 - f/fp) / (1 - (f/fp)**N), where f and fp are the fitnesses of the
    current ancestor and of the proposal.

    Parameters:
        N: population size used in the acceptance probability.
        turns: number of proposals to make.
        init: zero-argument callable producing the starting species when
            ``ancestor`` is None.
        mutate, fitness: proposal and fitness functions.
        ancestor: optional starting species.
        modulus: record/print state every ``modulus`` turns.

    Returns:
        (ancestor, hist): the final species and the list of recorded tuples
        (turn, f, f, motif_ic(ancestor[0]), rec_h(ancestor[1])).
    """
    if ancestor is None:
        # Honour the ``init`` argument (previously sample_species() was
        # called unconditionally, so a custom ``init`` had no effect).
        ancestor = init()
    f = fitness(ancestor)
    hist = []
    for turn in xrange(turns):
        prop = mutate(ancestor)
        fp = fitness(prop)
        if f == fp:
            # Equal fitness makes the acceptance formula 0/0; the proposal is
            # skipped.  NOTE(review): this also skips the periodic
            # print/record below when turn % modulus == 0.
            continue
        num = (1-f/fp)
        denom = (1-(f/fp)**N)
        transition_prob = num/denom
        if random.random() < transition_prob:
            ancestor = prop
            f = fp
        if turn % modulus == 0:
            # NOTE(review): the printed IC uses correct=False while the
            # recorded IC uses motif_ic's default; confirm which is intended.
            print (turn,f,f,motif_ic(ancestor[0],correct=False),rec_h(ancestor[1]))
            hist.append((turn,f,f,motif_ic(ancestor[0]),rec_h(ancestor[1])))
    return ancestor,hist
def basic_statistics(tfdf=None,filename="basic_motif_statistics.png"):
    """Draw a 2x3 grid of histograms summarising a motif collection.

    Panels, in order: site length, number of sites, IC, IC per base, Gini
    coefficient (uncorrected) and MI per column pair.  ``tfdf`` is an object
    with a ``tfs`` list and one attribute per TF; when omitted it is built
    with extract_motif_object_from_tfdf().  If ``filename`` is truthy the
    figure is saved there at 600 dpi; the figure is closed afterwards.
    """
    if tfdf is None:
        tfdf = extract_motif_object_from_tfdf()
    motifs = [getattr(tfdf,name) for name in tfdf.tfs]
    Ls = [len(m[0]) for m in motifs]
    ns = [len(m) for m in motifs]
    ics = [motif_ic(m) for m in motifs]
    ic_density = [ic/width for ic,width in zip(ics,Ls)]
    sigmas = [mean(map(sd,make_pssm(m))) for m in motifs]
    ginis = [motif_gini(m,correct=False) for m in motifs]
    mi_density = [total_motif_mi(m)/choose(width,2) for m,width in zip(motifs,Ls)]

    def draw_panel(position, values, label, rotate_first=True):
        # One histogram panel; rotate_first controls whether the tick
        # rotation is applied before or after the histogram is drawn.
        plt.subplot(2,3,position)
        if rotate_first:
            plt.xticks(rotation=90)
            plt.hist(values)
        else:
            plt.hist(values)
            plt.xticks(rotation=90)
        plt.xlabel(label)

    draw_panel(1, Ls, "Length (bp)")
    draw_panel(2, ns, "Number of sites")
    draw_panel(3, ics, "IC (bits)", rotate_first=False)
    draw_panel(4, ic_density, "IC Density (bits/bp)")
    draw_panel(5, ginis, "Gini coeff")
    draw_panel(6, mi_density, "MI Density (bits/comparison)")
    plt.tight_layout()
    if filename:
        plt.savefig(filename,dpi=600)
    plt.close()
Пример #41
0
def test_predict_ic(trials=100):
    pred_ics = []
    obs_ics = []
    for trial in trange(trials):
        sigma = random.random() * 5 + 0.1
        L = random.randrange(5, 15)
        matrix = sample_matrix(L, sigma)
        mu = random.random() * (-20)
        Ne = random.random() * 5 + 1
        pred_ic = predict_ic(matrix, mu, Ne)
        obs_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100))
        pred_ics.append(pred_ic)
        obs_ics.append(obs_ic)
    r, p = scatter(pred_ics, obs_ics)
    print r, p
Пример #42
0
def sample_motifs_evo_ic(motif, iterations=1000, verbose=False, theta=None):
    """Sample a parameter chain matched to the motif's IC, then one motif per state.

    Runs evo_ic_sample_motif2 with the motif's dimensions and IC, and for
    every (sigma, mu, Ne) state in the resulting chain draws a motif of N
    sites with sample_motif_cftp on a freshly sampled matrix.

    Parameters:
        motif: list of equal-length sites.
        iterations: forwarded to evo_ic_sample_motif2.
        verbose: forwarded to evo_ic_sample_motif2.
        theta: forwarded to evo_ic_sample_motif2.

    Returns:
        (chain, motifs): the parameter chain and the list of sampled motifs.
    """
    N = len(motif)
    L = len(motif[0])
    des_ic = motif_ic(motif)
    chain = evo_ic_sample_motif2(N,
                                 L,
                                 des_ic,
                                 iterations=iterations,
                                 # forward the caller's flag; it used to be
                                 # hard-coded to False and thus ignored
                                 verbose=verbose,
                                 theta=theta)
    motifs = [
        sample_motif_cftp(sample_matrix(L, sigma), mu, Ne, N)
        for (sigma, mu, Ne) in tqdm(chain)
    ]
    return chain, motifs
def test():
    """Contour-plot mean motif IC over a grid of mu and Ne values.

    Uses a fixed 10-column matrix whose rows are [-1, 0, 0, 0], an all-"A"
    ringer site, 10 sites per motif and 10 trials per grid point, with mu in
    -9..0 and Ne in 2..10.
    """
    L = 10
    n = 10
    trials = 10
    matrix = [[-1,0,0,0] for _ in range(L)]
    ringer_site = "A"*L
    mus = range(-9,1,1)
    Nes = range(2,11,1)
    ics = []
    for Ne in Nes:
        row = []
        for mu in tqdm(mus):
            row.append(mean(motif_ic(sample_motif(matrix, show(mu), Ne, ringer_site, n))
                            for i in range(trials)))
        ics.append(row)
    plt.contour(mus,Nes,ics)
    plt.colorbar(label="IC")
    plt.xlabel("Mu")
    plt.ylabel("Nes")
def find_beta(N, L, des_ic, iterations=100, verbose=False):
    #des_ic_per_col = des_ic / float(L)
    beta = 1
    alpha = 1.0
    beta_hist = []
    ic_hist = []
    for i in range(1,iterations+1):
        #beta = beta + alpha/i * (des_ic_per_col - motif_ic(rmotif(N, 1, beta)))
        obs_ic = motif_ic(rmotif(N, L, beta))
        beta_hist.append(beta)
        ic_hist.append(obs_ic)
        beta = beta + alpha/i * (des_ic - obs_ic)
        if verbose:
            print i, beta, obs_ic
    return beta, beta_hist, ic_hist
def plot_results_dict_gini_vs_ic(results_dict,filename=None):
    """Scatter Gini coefficient against IC for biological and spoofed motifs.

    For every key of ``results_dict`` three points are drawn: the biological
    motif (blue), the mean of the 'maxent' spoofs (green) and the mean of the
    'uniform' spoofs (red).  Legend labels are attached to the first key's
    points only.  The figure is handed to maybesave(filename).

    Keys are expected to have the form "<g1>_<g2>_<tf>", where "<g1>_<g2>"
    is the genome identifier passed to extract_tfdf_sites.
    """
    for i,k in enumerate(results_dict):
        # Split on the first two underscores only, so a TF name that itself
        # contains "_" stays intact instead of raising an unpacking error.
        g1,g2,tf = k.split("_", 2)
        genome = g1 + "_" + g2
        bio_motif = extract_tfdf_sites(genome,tf)
        bio_ic = motif_ic(bio_motif)
        bio_gini = motif_gini(bio_motif)
        d = results_dict[k]
        first = (i == 0)  # label only once so the legend has a single entry per series
        plt.scatter(bio_ic,bio_gini,color='b',label="Bio" if first else "")
        plt.scatter(mean(d['maxent']['motif_ic']),mean(d['maxent']['motif_gini']),
                    color='g',label='ME' if first else "")
        plt.scatter(mean(d['uniform']['motif_ic']),mean(d['uniform']['motif_gini']),
                    color='r',label="TURS" if first else "")
    plt.xlabel("IC (bits)")
    plt.ylabel("Gini Coefficient")
    plt.legend()
    maybesave(filename)
Пример #46
0
def test_predict_ic(trials=100):
    pred_ics = []
    obs_ics = []
    for trial in trange(trials):
        sigma = random.random() * 5 + 0.1
        L = random.randrange(5, 15)
        matrix = sample_matrix(L, sigma)
        mu = random.random() * (-20)
        Ne = random.random() * 5 + 1
        pred_ic = predict_ic(matrix, mu, Ne)
        obs_ic = motif_ic(sample_motif_cftp(matrix, mu, Ne, n=100))
        pred_ics.append(pred_ic)
        obs_ics.append(obs_ic)
    r, p = scatter(pred_ics, obs_ics)
    print r, p
def spoof_motifs(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10*N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    epsilon = (1+double_sigma)*sigma # 15 Jan 2016
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    def f(Ne):
        ps = ps_from_copies(sigma, Ne, L, copies)
        motifs = [sample_motif(epsilon, Ne, L, copies, n,ps=ps)
                  for i in range(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    Ne = log_regress_spec2(f,[1,10],tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, n) for _ in range(num_motifs)]
def extract_motif_object_from_tfdf():
    obj = Organism()
    genomes = set(tfdf['genome_accession'])
    setattr(obj,"tfs",[])
    for genome in genomes:
        tfs = set(tfdf[tfdf['genome_accession'] == genome]['TF'])
        for tf in tfs:
            print tf,genome
            if not type(tf) is str:
                continue
            tf_name = genome + "_" + tf
            sites = extract_tfdf_sites(genome,tf)
            if len(sites) >= 10 and motif_ic(sites) > 5:
                setattr(obj,tf_name,sites)
                obj.tfs.append(tf_name)
    return obj
def rfreq_rseq_experiment(obj,filename="rfreq_vs_rseq_in_sefas_collection.png"):
    """Scatter motif IC against log2(G / n) for every motif held by ``obj``.

    G is fixed at 5.0e6 and n is the number of sites in each motif.  A dashed
    y = x line from 0 to 20 is labelled 'Theory', and a dashed vertical line
    at log2(G / 500) is labelled 'Maximum Plausible Regulon Size'.  The
    figure is handed to maybesave(filename).
    """
    G = 5.0*10**6
    min_rfreq = log2(G/500)
    motifs = [getattr(obj,tf) for tf in obj.tfs]
    Rfreqs = [log(G/len(motif),2) for motif in motifs]
    Rseqs = [motif_ic(motif) for motif in motifs]
    plt.scatter(Rfreqs,Rseqs)
    plt.xlabel("log(G/n) (bits)")
    plt.ylabel("Motif Information Content (bits)")
    diagonal = [0,20]
    plt.plot(diagonal,diagonal,linestyle='--',label='Theory')
    plt.plot([min_rfreq,min_rfreq],[0,30],linestyle='--',label='Maximum Plausible Regulon Size')
    plt.title("Motif Information Content vs. Search Difficulty")
    plt.legend(loc='upper left')
    maybesave(filename)
def make_gle_evo_sim_spoofs(bio_motifs, trials_per_motif = 3):
    start_time = time.time()
    spoofs = []
    failures = 0
    for it, motif in enumerate(tqdm(bio_motifs, desc='bio_motifs')):
        bio_ic = motif_ic(motif)
        these_spoofs = [spoof_motif_gle(motif,num_motifs=10, Ne_tol=10**-2)
                        for i in range(trials_per_motif)]
        spoofs.append(these_spoofs)
        spoof_ics = map(motif_ic, concat(these_spoofs))
        lb, ub = mean_ci(spoof_ics)
        out_of_bounds = (not (lb <= bio_ic <= ub))
        failures += out_of_bounds
        fail_perc = failures/float(it+1)
        print it,"bio_ic:", bio_ic, "spoof_ci: (%s,%s)" % (lb, ub), "*" * out_of_bounds,"failures:","%1.2f" % fail_perc
    stop_time = time.time()
    print "total time:", stop_time  - start_time
    return spoofs
def results_of_analyze_bio_motifs(results):
    # IC
    Ls = np.array([len(getattr(Escherichia_coli,tf)[0]) for tf in Escherichia_coli.tfs])
    Ls_choose_2 = np.array([choose(L,2) for L in Ls])
    bio_ics = np.array([motif_ic(getattr(Escherichia_coli,tf)) for tf in Escherichia_coli.tfs])
    sim_ics = np.array([results[tf][0] for tf in Escherichia_coli.tfs])
    sim_ic_errs = np.array([1.96*results[tf][1] for tf in Escherichia_coli.tfs])
    bio_ics_norm = bio_ics/Ls
    sim_ics_norm = sim_ics/Ls
    sim_ic_norm_errs = sim_ic_errs/Ls
    bio_ginis = np.array([motif_gini(getattr(Escherichia_coli,tf)) for tf in Escherichia_coli.tfs])
    sim_ginis = np.array([results[tf][2] for tf in Escherichia_coli.tfs])
    sim_gini_errs = np.array([1.96*results[tf][3] for tf in Escherichia_coli.tfs])
    bio_mis_norm = np.array([total_motif_mi(getattr(Escherichia_coli,tf))/choose(L,2)
                    for tf,L in zip(Escherichia_coli.tfs,Ls)])
    sim_mis_norm = np.array([results[tf][4]/choose(L,2) for tf,L in zip(Escherichia_coli.tfs,Ls)])
    sim_mis_norm_errs = np.array([1.96*results[tf][5]/choose(L,2) for tf,L in zip(Escherichia_coli.tfs,Ls)])
    plt.subplot(1,4,1)
    plt.errorbar(bio_ics,sim_ics,
                  yerr=sim_ic_errs,fmt='o')
    plt.plot([0,20],[0,20])
    plt.xlabel("IC")
    
    plt.subplot(1,4,2)
    plt.errorbar(bio_ics_norm,sim_ics_norm,
                  yerr=sim_ic_norm_errs,fmt='o')
    plt.plot([0,2],[0,2])
    plt.xlabel("IC/base")
    
    plt.subplot(1,4,3)
    plt.errorbar(bio_ginis,sim_ginis,
                yerr=sim_gini_errs,fmt='o')
    plt.plot([0,1],[0,1])
    plt.xlabel("Gini coefficient")
    
    plt.subplot(1,4,4)
    plt.errorbar(bio_mis_norm,sim_mis_norm,
                 yerr=sim_mis_norm_errs,fmt='o')
    plt.plot([0,0.5],[0,0.5])
    plt.xlabel("MI/pair")
    print "IC:", pearsonr(bio_ics, sim_ics)
    print "normalized IC:", pearsonr(bio_ics_norm, sim_ics_norm)
    print "Gini:", pearsonr(bio_ginis, sim_ginis)
    print "normalized MI:", pearsonr(bio_mis_norm, sim_mis_norm)
def tfdf_experiment(replicates=1000,delta_ic=0.1,tolerance=10**-5):
    genomes = set(tfdf['genome_accession'])
    results_dict = defaultdict(lambda:defaultdict(dict))
    for genome_idx,genome in enumerate(genomes):
        print "genome:",genome, genome_idx,len(genomes)
        tfs = set(tfdf[tfdf['genome_accession'] == genome]['TF'])
        for tf_idx,tf in enumerate(tfs):
            if not type(tf) is str:
                continue
            print "tf:",tf,tf_idx,len(tfs)
            print genome,tf
            bio_motif = extract_tfdf_sites(genome,tf)
            if len(bio_motif) < 10:
                print "skipping:"
                continue
            tf_name = genome + "_" + tf
            n,L = motif_dimensions(bio_motif)
            print "dimensions:",n,L
            bio_ic = motif_ic(bio_motif)
            bio_gini = motif_gini(bio_motif)
            bio_mi = total_motif_mi(bio_motif)
            results_dict[tf_name]["bio"]["motif_ic"] = bio_ic
            results_dict[tf_name]["bio"]["motif_gini"] = bio_gini
            results_dict[tf_name]["bio"]["total_motif_mi"] = bio_mi
            correction_per_col = 3/(2*log(2)*n)
            desired_ic = bio_ic + L * correction_per_col
            t = time.time()
            beta = find_beta_for_mean_motif_ic(n,L,desired_ic,tolerance=tolerance)
            beta_time = time.time() - t
            print "beta, time:",beta, beta_time
            print "maxent sampling"
            maxent = maxent_motifs_with_ic(n,L,bio_ic,replicates,beta=beta)
            print "uniform sampling"
            uniform = uniform_motifs_with_ic(n,L,bio_ic,replicates,epsilon=delta_ic,beta=beta)
            #print "envelope sampling"
            #envelope = envelope_sample_motifs_with_ic(n,L,bio_ic,delta_ic,replicates,beta=beta)
            for spoof_name in "maxent uniform".split():
                spoofs = eval(spoof_name)
                for motif_statname in "motif_ic motif_gini total_motif_mi".split():
                    print "recording results for:",spoof_name,motif_statname
                    motif_stat = eval(motif_statname)
                    results_dict[tf_name][spoof_name][motif_statname] = map(motif_stat,spoofs)
    return results_dict
def spoof_motif_ar(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4):
    n = len(motif)
    L = len(motif[0])
    copies = 10*n
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    print "sigma:", sigma
    bio_ic = motif_ic(motif)
    matrix = sample_matrix(L, sigma)
    mu = approx_mu(matrix, copies=10*n, G=5*10**6)
    print "mu:", mu
    def f(Ne):
        motifs = [sample_motif_ar(matrix, mu, Ne, n)
                  for i in trange(trials)]
        return mean(map(motif_ic,motifs)) - bio_ic
    x0 = 2
    print "Ne guess:", x0
    Ne = bisect_interval_noisy(f,x0=x0,iterations=100,lb=1, verbose=False,w=0.5)
    print "Ne:",Ne
    return [sample_motif_ar(matrix, mu, Ne, n) for _ in trange(num_motifs)]
def Ne_from_motif(bio_motif,interp_rounds,iterations=50000):
    """Given a motif, return Ne that matches mean IC"""
    bio_ic = motif_ic(bio_motif)
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in  make_pssm(bio_motif)]
    print len(matrix)
    def f(Ne,iterations=iterations):
        print "Ne",Ne
        _,chain = sella_hirsch_mh(matrix=matrix,n=n,Ne=Ne,iterations=iterations,init='ringer')
        return mean(map(motif_ic,chain[iterations/2:])) - bio_ic
    # lo,hi = 1,5
    # data = []
    # for _ in xrange(interp_rounds):
    #     guess = (lo + hi)/2.0
    #     y = f(guess)
    #     print lo,hi,guess,y
    #     data.append((guess,y))
    #     if y > 0:
    #         hi = guess
    #     else:
    #         lo = guess
    # return data
    Ne_min = 1
    Ne_max = 5
    while f(Ne_max) < 0:
        print "increasing Ne max"
        Ne_max *= 2
    xs, ys= transpose([(Ne,f(Ne)) for Ne in np.linspace(Ne_min,Ne_max,interp_rounds)])
    # now find an interpolant.  We desire smallest sigma of gaussian
    # interpolant such that function has at most one inflection point
    interp_sigmas = np.linspace(0.01,1,100)
    interps = [gaussian_interp(xs,ys,sigma=s) for s in interp_sigmas]
    for i,(sigma, interp) in enumerate(zip(interp_sigmas,interps)):
        print i,sigma
        if num_inflection_points(map(interp,np.linspace(Ne_min,Ne_max,100))) == 1:
            "found 1 inflection point"
            break
    print sigma
    Ne = bisect_interval(interp,Ne_min,Ne_max)
    return Ne
def moran_process(N=1000,turns=10000,init=sample_species,mutate=mutate,fitness=fitness,pop=None):
    if pop is None:
        pop = [(lambda spec:(spec,fitness(spec)))(sample_species())
               for _ in trange(N)]
    hist = []
    for turn in xrange(turns):
        fits = [f for (s,f) in pop]
        #print fits
        birth_idx = inverse_cdf_sample(range(N),fits,normalized=False)
        death_idx = random.randrange(N)
        #print birth_idx,death_idx
        mother,f = pop[birth_idx]
        daughter = mutate(mother)
        #print "mutated"
        pop[death_idx] = (daughter,fitness(daughter))
        mean_fits = mean(fits)
        hist.append((f,mean_fits))
        if turn % 10 == 0:
            mean_dna_ic = mean([motif_ic(sites,correct=False) for ((sites,eps),_) in pop])
            mean_rec_h = mean([h_np(boltzmann(eps)) for ((dna,eps),_) in pop])
            print turn,"sel_fit:",f,"mean_fit:",mean_fits,"mean_dna_ic:",mean_dna_ic,"mean_rec_h:",mean_rec_h
    return pop