예제 #1
0
def motif_degradation_experiment():
    """what is the effect of repeatedly inferring a motif from selected sites?"""
    from motifs import Escherichia_coli
    motif = Escherichia_coli.LexA
    n = len(motif)
    matrix = matrix_from_motif(motif)
    assumed_copies = 10 * n
    mu = approximate_mu(matrix, assumed_copies, G)
    for i in range(10):
        print i, "motif ic:", motif_ic(motif)
        motif = select_sites_by_occupancy(matrix, mu, n)
        matrix = matrix_from_motif(motif)
예제 #2
0
def restriction_of_range_half_site_experiment(motif):
    """Correlate each site's first-half score with its second-half score.

    Asks whether the energy of the first half-site is negatively
    correlated with the energy of the second half-site.

    Args:
        motif: sequence of sites; the width is read from the first site.
            For an odd width the second half is one position longer.

    Returns:
        The result of ``pearsonr`` applied to the two per-site score lists.
    """
    width = len(motif[0])
    # Floor division keeps the split index an int under Python 3 and
    # ``from __future__ import division``; for ints it equals Python 2 ``/``.
    half = width // 2
    mat = matrix_from_motif(motif)
    first_half_eps = [score_seq(mat[:half], site[:half]) for site in motif]
    second_half_eps = [score_seq(mat[half:], site[half:]) for site in motif]
    return pearsonr(first_half_eps, second_half_eps)
예제 #3
0
def restriction_of_range_motif_spoof_experiment(motifs):
    """Score real and spoofed sites of every motif against the motif's own matrix.

    For each motif (iterated through ``tqdm``) a matrix is built with
    ``matrix_from_motif``.  The motif's sites and the sites returned by
    ``spoof_psfm(motif, pc=0)`` are then both scored against that matrix.

    Returns:
        (all_eps, all_spoof_eps): two parallel lists holding, per motif,
        the list of real-site scores and the list of spoof-site scores.
    """
    real_scores = []
    spoof_scores = []
    for motif in tqdm(motifs):
        mat = matrix_from_motif(motif)
        real_scores.append([score_seq(mat, site) for site in motif])
        spoof_sites = spoof_psfm(motif, pc=0)
        spoof_scores.append([score_seq(mat, site) for site in spoof_sites])
    return real_scores, spoof_scores
예제 #4
0
def spoof_motif(motif, Ne=None, iterations=10000):
    matrix = matrix_from_motif(motif)
    L = len(motif[0])
    n = len(motif)
    sigma = sigma_from_matrix(matrix)
    spoof_matrix = sample_matrix(L, sigma)
    bio_ic = motif_ic(motif)
    # this method of reading site_mu, site_sigma off of motif is slightly suspect...
    site_mu = site_mu_from_matrix(matrix_from_motif(motif))
    site_sigma = site_sigma_from_matrix(matrix_from_motif(motif))
    # now need to find mu, nu
    n = len(motif)
    assumed_copies = 10 * n
    mu = approximate_mu(matrix, assumed_copies, G)
    spoof_mu = approximate_mu(spoof_matrix, assumed_copies, G)
    if Ne is None:
        Ne = estimate_Ne(spoof_matrix, spoof_mu, n, bio_ic)
        print "chose Ne:", Ne
    spoof_matrix, chain = sella_hirsch_mh(Ne=Ne,
                                          matrix=spoof_matrix,
                                          mu=mu,
                                          n=n)
    return spoof_matrix, chain, Ne
예제 #5
0
def restriction_of_range_loo_experiment(motif):
    """can energy of a given position be predicted from energy of remaining bases?"""
    L = len(motif[0])
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat,site) for site in motif]
    mean_ep = mean(eps)
    results = []
    for j in range(L):
        print j
        loo_mat = mat[:j] + mat[j+1:]
        for site in motif:
            loo_ep = score_seq(loo_mat,site[:j] + site[j+1:])
            pred_ep = mean_ep - loo_ep
            obs_ep = score_seq([mat[j]],[site[j]])
            results.append((pred_ep, obs_ep))
    return results
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,Ne_tol=10**-4,double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10*N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif,pc=1))
    epsilon = (1+double_sigma)*sigma # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1/(1+exp(ep-mu)) for ep in eps])
    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ
    Ne = log_regress_spec2(f,[1,10],tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
def analyze_bio_motifs(Nes,trials=20):
    results = {}
    for tf_idx,tf in enumerate(Escherichia_coli.tfs):
        Ne = Nes[tf]
        bio_motif = getattr(Escherichia_coli,tf)
        n,L = len(bio_motif),len(bio_motif[0])
        bio_matrix = matrix_from_motif(bio_motif)
        sigma = sigma_from_matrix(bio_matrix)
        matrix_chains = [sella_hirsch_mh(n=n,L=L,sigma=sigma,Ne=Ne,init='ringer') for i in range(trials)]
        ics = [mean(map(motif_ic,chain[-1000:])) for (matrix,chain) in matrix_chains]
        ginis = [mean(map(motif_gini,chain[-1000:])) for (matrix,chain) in matrix_chains]
        mis = [mean(map(total_motif_mi,chain[-1000:])) for (matrix,chain) in matrix_chains]
        print "results for:",tf,tf_idx
        print motif_ic(bio_motif),mean(ics),sd(ics)
        print motif_gini(bio_motif),mean(ginis),sd(ginis)
        print total_motif_mi(bio_motif),mean(mis),sd(mis)
        results[tf] = (mean(ics),sd(ics),mean(ginis),sd(ginis),mean(mis),sd(mis))
    return results
예제 #8
0
def main_experiment(samples=30, iterations=10000, delta_ic=0.1):
    results_dict = {}
    for tf_idx, tf in enumerate(tfdf.tfs):
        print "starting on:", tf
        motif = getattr(tfdf, tf)
        if motif_ic(motif) < 5:
            print "excluding", tf, "for low IC"
            continue
        bio_ic = motif_ic(motif)
        n = len(motif)
        L = len(motif[0])
        matrix = matrix_from_motif(motif)
        sigma = sigma_from_matrix(matrix)
        mu = approximate_mu(matrix, n, G)
        Ne = estimate_Ne(matrix, mu, n, bio_ic)
        spoofs = []
        ar = 0
        spoof_trials = 0.0
        while len(spoofs) < samples:
            spoof_trials += 1
            matrix, chain = sella_hirsch_mh(Ne=Ne,
                                            mu=mu,
                                            n=1,
                                            matrix=sample_matrix(L, sigma),
                                            init='ringer',
                                            iterations=iterations)
            spoof_motif = concat(
                [random.choice(chain[iterations / 2:]) for i in range(n)])
            if abs(motif_ic(spoof_motif) - bio_ic) < delta_ic:
                spoofs.append(spoof_motif)
                ar += 1
            print "spoof acceptance rate:", ar / spoof_trials, len(
                spoofs), samples, spoof_trials
        #spoofs = [chain[-1] for (spoof_matrix,chain,Ne) in [spoof_motif(motif,Ne) for i in range(samples)]]
        results_dict[tf] = {
            fname: map(eval(fname), spoofs)
            for fname in "motif_ic motif_gini total_motif_mi".split()
        }
        print "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs))
        print bio_ic, mean_ci(results_dict[tf]['motif_ic'])
    return results_dict
def spoof_motifs_occ(motif,
                     num_motifs=10,
                     trials=1,
                     sigma=None,
                     Ne_tol=10**-4,
                     double_sigma=True):
    N = len(motif)
    L = len(motif[0])
    copies = 10 * N
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print "sigma:", sigma
    #bio_ic = motif_ic(motif)
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1 / (1 + exp(ep - mu)) for ep in eps])

    def f(Ne):
        return expected_occupancy(epsilon, Ne, L, copies) - bio_occ

    Ne = log_regress_spec2(f, [1, 10], tol=10**-3)
    return [sample_motif(sigma, Ne, L, copies, N) for _ in range(num_motifs)]
예제 #10
0
def score_sd(motif):
    """Return ``sd`` of the motif's site scores against its own matrix.

    The matrix comes from ``matrix_from_motif(motif)`` and every site in
    ``motif`` is scored against it with ``score_seq``.
    """
    own_matrix = matrix_from_motif(motif)
    return sd([score_seq(own_matrix, site) for site in motif])