示例#1
0
def excess_mi_experiment(filename=None):
    """Compare match/mismatch-model motifs with MaxEnt motifs of equal mean IC.

    Question addressed: do artificial motifs with linear BEMs show the same
    patterns of excess MI as biological motifs? (Yes)

    The routine
      1. tabulates mean_ic_from_eps for every element of enumerate_eps(n, L),
      2. uses secant_interval on [0, 2000] to find the N at which the
         sella_hirsch_predictions-weighted IC equals desired_ic,
      3. samples `replicates` motifs from those predictions,
      4. samples the same number of motifs with maxent_sample_motifs_with_ic
         at the mean IC actually observed in step 3, and
      5. hands both ensembles to all_boxplot_comparisons.

    filename is forwarded unchanged to all_boxplot_comparisons.
    """
    n = 10
    L = 10
    G = 1000
    desired_ic = 10
    replicates = 1000
    ic_per_eps = np.array(
        [mean_ic_from_eps(eps, n, L) for eps in enumerate_eps(n, L)])

    def ic_gap(N):
        # predicted mean IC at N, measured relative to the target
        predicted = sella_hirsch_predictions(n, L, G, N)
        return ic_per_eps.dot(predicted) - desired_ic

    Ne = secant_interval(ic_gap, 0, 2000, tolerance=0.1,
                         verbose=True)  # ~= 1525
    ps = sella_hirsch_predictions(n, L, G, Ne)
    sh_sampler = inverse_cdf_sampler(list(enumerate_eps(n, L)), ps)

    sh_motifs = []
    for _ in trange(replicates):
        sh_motifs.append(sample_motif_from_mismatches(sh_sampler(), L))

    # may undershoot desired due to approximation
    sh_mean_ic = mean([motif_ic(motif) for motif in sh_motifs])
    maxent_motifs = maxent_sample_motifs_with_ic(n, L, sh_mean_ic, replicates)

    plt.suptitle(
        "Motif Statistics for Match/Mismatch Model vs. MaxEnt Ensembles (n=10,L=10,G=1000)"
    )
    all_boxplot_comparisons([sh_motifs, maxent_motifs],
                            labels=["M/MM", "MaxEnt"],
                            plot_titles="IC Gini MI".split(),
                            filename=filename)
示例#2
0
def uniform_motifs_accept_reject(n,
                                 L,
                                 desired_ic,
                                 num_motifs,
                                 epsilon=0.1,
                                 beta=None,
                                 verbose=False):
    """Collect num_motifs draws from uniform_motif_accept_reject.

    beta (fitted here when not supplied), the count probabilities and the
    count sampler are built once and passed to every draw, so the per-motif
    call does not have to recompute them.

    Returns a list with one entry per draw.
    """
    if beta is None:
        # beta is fitted against desired_ic raised by L * 3 / (2 ln(2) n)
        per_column_shift = 3 / (2 * log(2) * n)
        target_ic = desired_ic + L * per_column_shift
        beta = find_beta_for_mean_motif_ic(n, L, target_ic, verbose=verbose)

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)
    shared = dict(epsilon=epsilon,
                  beta=beta,
                  ps=ps,
                  count_sampler=count_sampler,
                  verbose=verbose)

    motifs = []
    for _ in trange(num_motifs):
        motifs.append(uniform_motif_accept_reject(n, L, desired_ic, **shared))
    return motifs
示例#3
0
def uniform_motif_accept_reject(n,
                                L,
                                desired_ic,
                                epsilon=0.1,
                                beta=None,
                                ps=None,
                                count_sampler=None,
                                verbose=False):
    """Draw a single motif by accept/reject.

    Candidate motifs are assembled from L columns, each column sampled from a
    count drawn with count_sampler.  Candidates are retained only when
    inrange(candidate, desired_ic, epsilon) holds, and a retained candidate is
    then accepted with probability

        1 / exp(beta * motif_ic(candidate) - beta * (desired_ic - epsilon)).

    Presumably this cancels an exp(beta * IC) weighting of the proposal so
    that accepted motifs are uniform within the IC band -- confirm against
    count_ps_from_beta.

    Args:
        n, L: forwarded to the helpers; L is also the number of columns drawn.
        desired_ic: centre of the accepted IC band.
        epsilon: half-width of the accepted IC band.
        beta: fitted with find_beta_for_mean_motif_ic when None.
        ps: count probabilities; count_ps_from_beta(n, beta) when None.
        count_sampler: zero-argument callable returning a count; built from
            ps when None.
        verbose: when true, print diagnostics (including the call banner).

    Returns:
        A list of strings, one per row of transpose(cols).
    """

    def report(*fields):
        # A single pre-joined argument prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print(" ".join(str(field) for field in fields))

    def assemble(cols):
        # turn a list of columns into a list of site strings
        return ["".join(site) for site in transpose(cols)]

    # The banner used to print on every call regardless of `verbose`; it is a
    # diagnostic like the others, so it now respects the flag.
    if verbose:
        report("uniform motif accept reject:", n, L, desired_ic, beta)

    correction_per_col = 3 / (2 * log(2) * n)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    # if we reach the upper limit, things break down
    # NOTE(review): exact float equality; confirm whether >= was intended.
    if desired_ic_for_beta == 2 * L:
        return assemble(
            [sample_col_from_count((0, 0, 0, n)) for _ in range(L)])

    if beta is None:
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
        if verbose:
            report("beta:", beta)
    if ps is None:
        ps = count_ps_from_beta(n, beta)
    if count_sampler is None:
        count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def rQ_raw():
        # all counts are drawn before any column is sampled
        counts = [count_sampler() for _ in range(L)]
        return assemble([sample_col_from_count(count) for count in counts])

    def rQ():
        return sample_until(lambda M: inrange(M, desired_ic, epsilon),
                            rQ_raw,
                            1,
                            progress_bar=False)[0]

    Imin = desired_ic - epsilon
    Imax = desired_ic + epsilon
    log_M = -beta * Imin
    if verbose:
        report("Imin, Imax, log_M:", Imin, Imax, log_M)

    def acceptance_probability(motif):
        return 1.0 / exp(beta * motif_ic(motif) + log_M)

    trials = 0
    while True:
        trials += 1
        motif = rQ()
        r = random.random()
        accept = acceptance_probability(motif)
        if r < accept:
            return motif
        if verbose and trials % 100 == 0:
            report(trials, accept)
def maxent_motifs_with_ic(n,
                          L,
                          desired_ic,
                          num_motifs,
                          tolerance=10**-10,
                          beta=None,
                          verbose=False):
    """Sample num_motifs motifs column-by-column from a beta-weighted count model.

    When beta is not supplied it is fitted with find_beta_for_mean_motif_ic
    against desired_ic raised by L * 3 / (2 ln(2) n).  Each motif is built
    from L counts drawn with the count sampler, one column per count.

    Returns a list of motifs; each motif is a list of strings.
    """
    if beta is None:
        per_column_shift = 3 / (2 * log(2) * n)
        target_ic = desired_ic + L * per_column_shift
        beta = find_beta_for_mean_motif_ic(n,
                                           L,
                                           target_ic,
                                           tolerance=tolerance,
                                           verbose=verbose)
        if verbose:
            # single-argument form: same output from the print statement
            # and the print function
            print("%s %s" % ("beta:", beta))

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def draw_motif():
        # all counts first, then all columns
        drawn_counts = [count_sampler() for _ in range(L)]
        columns = [sample_col_from_count(c) for c in drawn_counts]
        return ["".join(site) for site in transpose(columns)]

    motifs = []
    for _ in trange(num_motifs):
        motifs.append(draw_motif())
    return motifs
示例#5
0
def uniform_motif_with_ic_imh_ref(n,
                                  L,
                                  desired_ic,
                                  epsilon=0.1,
                                  iterations=None,
                                  verbose=False,
                                  num_chains=8):
    """Run mh with a proposal that ignores the current state.

    The target log-density is 0 when |motif_ic(motif) - desired_ic| < epsilon
    and -1e100 otherwise.  Proposals are built from L columns sampled via a
    beta-weighted count sampler.

    With a truthy `iterations`, one chain of that length is run and whatever
    mh returns is returned.  Otherwise num_chains chains are grown in batches
    (100 iterations, then 200, 400, ...) until gelman_rubin on the per-chain
    motif_ic values reports R_hat < 1.1, and the list of chains is returned.

    `verbose` is accepted but not used.
    """
    per_column_shift = 3 / (2 * log(2) * n)
    target_ic = desired_ic + L * per_column_shift
    beta = find_beta_for_mean_motif_ic(n, L, target_ic)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # the argument is ignored: every proposal is a fresh draw
        drawn_counts = [count_sampler() for _ in range(L)]
        columns = [sample_col_from_count(c) for c in drawn_counts]
        return ["".join(site) for site in transpose(columns)]

    def log_dQ(motif_p, motif):
        return beta * motif_ic(motif_p)

    def log_f(motif):
        if abs(motif_ic(motif) - desired_ic) < epsilon:
            return 0
        return -10.0**100

    def in_band(x):
        return log_f(x) > -1

    def propose():
        return Q(None)

    mh_settings = dict(proposal=Q, dprop=log_dQ, use_log=True, verbose=False)

    if iterations:
        start = sample_until(in_band, propose, 1)[0]
        return mh(log_f, x0=start, iterations=iterations, **mh_settings)

    # use gelman rubin criterion
    x0s = sample_until(in_band, propose, num_chains)
    batch = 100
    chains = [[] for _ in range(num_chains)]
    while True:
        for chain, x0 in zip(chains, x0s):
            chain.extend(
                mh(log_f, x0=x0, iterations=batch, **mh_settings))
        R_hat, _ = gelman_rubin(mmap(motif_ic, chains))
        if R_hat < 1.1:
            return chains
        # not converged: restart each chain from its last state, doubling
        # the batch size
        x0s = [chain[-1] for chain in chains]
        batch *= 2
示例#6
0
def uniform_motif_with_ic_imh(n,
                              L,
                              desired_ic,
                              epsilon=0.1,
                              iterations=None,
                              verbose=False,
                              beta=None,
                              num_chains=8):
    """Run one mh chain with a proposal that ignores the current state.

    The target log-density is 0 when |motif_ic(motif) - desired_ic| < epsilon
    and -1e100 otherwise.  Proposals are built from L columns sampled via a
    beta-weighted count sampler.

    Args:
        n, L: forwarded to the helpers; L is also the number of columns drawn.
        desired_ic: centre of the accepted IC band.
        epsilon: half-width of the accepted IC band.
        iterations: chain length.  When falsy (the default) it is chosen
            automatically: pilot runs of 100, 200, 400, ... iterations are
            made until mh(..., return_ar=True) returns a non-zero value `ar`,
            and the chain length is int(1.0 / ar * 10).  Previously this
            argument was overwritten and therefore ignored.
        verbose: accepted but not used.
        beta: fitted with find_beta_for_mean_motif_ic when None.
        num_chains: accepted but not used.

    Returns:
        Whatever mh returns for the final run.
    """
    if beta is None:
        correction_per_col = 3 / (2 * log(2) * n)
        desired_ic_for_beta = desired_ic + L * correction_per_col
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Q(motif):
        # the argument is ignored: every proposal is a fresh draw
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def log_dQ(motif_p, motif):
        return beta * motif_ic(motif_p)

    def log_f(motif):
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]

    def run_mh(num_iterations, **extra):
        return mh(log_f,
                  proposal=Q,
                  dprop=log_dQ,
                  x0=x0,
                  iterations=num_iterations,
                  use_log=True,
                  verbose=False,
                  **extra)

    if not iterations:
        # first, determine probability of landing in range
        ar = 0
        pilot_iterations = 100
        while ar == 0:
            ar = run_mh(pilot_iterations, return_ar=True)
            pilot_iterations *= 2
        iterations = int(1.0 / ar * 10)

    return run_mh(iterations)
示例#7
0
def uniform_motif_imh_tv(n, L, desired_ic, beta=None, epsilon=None, tv=0.01):
    """run uniform imh to within total variation bound tv

    Proposals are motifs built from L beta-weighted count draws, redrawn
    until |motif_ic(m) - desired_ic| < epsilon.  The chain length is the
    smallest k with (1 - alpha)**k <= tv, where alpha = exp(-2*beta*epsilon).

    Args:
        n, L: forwarded to the helpers; L is also the number of columns drawn.
        desired_ic: centre of the accepted IC band.
        beta: fitted with find_beta_for_mean_motif_ic when None.
        epsilon: half-width of the IC band; 1 / (2 * beta) when None.
        tv: bound used to size the chain.

    Returns:
        Whatever mh returns for the run.
    """
    if beta is None:
        # the beta-fitting target is only needed when beta must be fitted
        correction_per_col = 3 / (2 * log(2) * n)
        desired_ic_for_beta = desired_ic + L * correction_per_col
        beta = find_beta_for_mean_motif_ic(n, L, desired_ic_for_beta)
    if epsilon is None:
        epsilon = 1.0 / (2 * beta)
        # single-argument form: same output from the print statement and
        # the print function
        print("%s %s" % ("maximally efficient epsilon:", epsilon))
    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)

    def Qp(motif):
        # unconstrained proposal; the argument is ignored
        counts = [count_sampler() for _ in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        return ["".join(site) for site in transpose(cols)]

    def Q(motif):
        # proposal restricted to the IC band; the argument is ignored
        return sample_until(lambda m: abs(motif_ic(m) - desired_ic) < epsilon,
                            lambda: Qp(None), 1)[0]

    def log_dQ(motif_p, motif):
        return beta * motif_ic(motif_p)

    def log_f(motif):
        in_range = abs(motif_ic(motif) - desired_ic) < epsilon
        return 0 if in_range else -10.0**100

    alpha = exp(-2 * beta * epsilon)
    iterations = int(ceil(log(tv) / log(1 - alpha)))
    print("%s %s" % ("iterations:", iterations))
    # starting state drawn from the banded proposal
    x0 = sample_until(lambda x: log_f(x) > -1, lambda: Q(None), 1)[0]
    chain = mh(log_f,
               proposal=Q,
               dprop=log_dQ,
               x0=x0,
               iterations=iterations,
               use_log=True,
               verbose=False)
    return chain
def maxent_motif_with_ic(n,
                         L,
                         desired_ic,
                         tolerance=10**-10,
                         beta=None,
                         verbose=False):
    """sample motif from max ent distribution with mean desired_ic

    When beta is not given, it is fitted against desired_ic shifted upwards
    by L * 3 / (2 ln(2) n), so that when motif_ic is called with 1st order
    correction we get the desired ic.

    Returns the motif as a list of strings.
    """
    if beta is None:
        if verbose:
            print("finding beta")
        per_column_shift = 3 / (2 * log(2) * n)
        target_ic = desired_ic + L * per_column_shift
        beta = find_beta_for_mean_motif_ic(n,
                                           L,
                                           target_ic,
                                           tolerance=tolerance,
                                           verbose=verbose)

    ps = count_ps_from_beta(n, beta)
    count_sampler = inverse_cdf_sampler(enumerate_counts(n), ps)
    # all counts are drawn before any column is sampled
    drawn_counts = [count_sampler() for _ in range(L)]
    columns = [sample_col_from_count(c) for c in drawn_counts]
    return ["".join(site) for site in transpose(columns)]