def motif_degradation_experiment():
    """What is the effect of repeatedly inferring a motif from selected sites?

    Starts from ``Escherichia_coli.LexA`` and runs ten rounds. Each round
    prints the round index and ``motif_ic`` of the current motif, then
    replaces the motif with the output of ``select_sites_by_occupancy`` and
    rebuilds the matrix from it. Nothing is returned.
    """
    from motifs import Escherichia_coli

    current_motif = Escherichia_coli.LexA
    num_sites = len(current_motif)
    current_matrix = matrix_from_motif(current_motif)
    # mu is computed once from the starting matrix and reused in every round.
    mu = approximate_mu(current_matrix, 10 * num_sites, G)
    for round_idx in range(10):
        # One pre-formatted string: same output as `print a, b, c`.
        print("%s %s %s" % (round_idx, "motif ic:", motif_ic(current_motif)))
        current_motif = select_sites_by_occupancy(current_matrix, mu, num_sites)
        current_matrix = matrix_from_motif(current_motif)
def restriction_of_range_half_site_experiment(motif):
    """Is energy of the first half-site negatively correlated with the second?

    Every site is cut at ``len(motif[0]) // 2``. The left part is scored with
    the first rows of the matrix built from ``motif`` and the right part with
    the remaining rows, both via ``score_seq``.

    Args:
        motif: sequence of sites; the first site's length fixes the cut.

    Returns:
        The result of ``pearsonr`` applied to the per-site left and right
        scores.
    """
    site_len = len(motif[0])
    # Floor division keeps the cut an int (it is used as a slice index) even
    # when `/` means true division.
    half = site_len // 2
    mat = matrix_from_motif(motif)
    left_rows = mat[:half]
    right_rows = mat[half:]
    eps1 = [score_seq(left_rows, site[:half]) for site in motif]
    eps2 = [score_seq(right_rows, site[half:]) for site in motif]
    return pearsonr(eps1, eps2)
def restriction_of_range_motif_spoof_experiment(motifs):
    """Score each motif's sites and its spoofed sites against the motif's matrix.

    For every motif (iterated through ``tqdm``) a matrix is built with
    ``matrix_from_motif``. The motif's own sites and the sites returned by
    ``spoof_psfm(motif, pc=0)`` are each scored against that matrix with
    ``score_seq``.

    Returns:
        ``(all_eps, all_spoof_eps)``: two lists with one entry per motif, each
        entry being the list of scores for that motif's real / spoofed sites.
    """
    all_eps, all_spoof_eps = [], []
    for motif in tqdm(motifs):
        mat = matrix_from_motif(motif)
        real_scores = [score_seq(mat, site) for site in motif]
        spoof_scores = [score_seq(mat, spoof) for spoof in spoof_psfm(motif, pc=0)]
        all_eps.append(real_scores)
        all_spoof_eps.append(spoof_scores)
    return all_eps, all_spoof_eps
def spoof_motif(motif, Ne=None, iterations=10000):
    """Run ``sella_hirsch_mh`` from a random matrix shaped like ``motif``.

    A starting matrix is drawn with ``sample_matrix`` using the motif's site
    length and the sigma read from the motif's matrix. It is then passed to
    ``sella_hirsch_mh`` together with ``Ne``, the number of sites, and the mu
    obtained from ``approximate_mu`` for the motif's matrix at
    ``10 * len(motif)`` assumed copies.

    Args:
        motif: sequence of equal-length sites.
        Ne: value handed to ``sella_hirsch_mh``. When ``None`` it is obtained
            from ``estimate_Ne`` (using the spoof matrix, its own mu, the site
            count and ``motif_ic(motif)``) and printed.
        iterations: NOTE(review): accepted but not forwarded to
            ``sella_hirsch_mh``, so the chain length is whatever that function
            uses when none is given. Confirm whether it should be passed on.

    Returns:
        ``(spoof_matrix, chain, Ne)`` where the first two come from
        ``sella_hirsch_mh``.
    """
    matrix = matrix_from_motif(motif)
    L = len(motif[0])
    n = len(motif)
    spoof_matrix = sample_matrix(L, sigma_from_matrix(matrix))
    assumed_copies = 10 * n
    mu = approximate_mu(matrix, assumed_copies, G)
    if Ne is None:
        # bio_ic and spoof_mu are only needed to estimate Ne.
        bio_ic = motif_ic(motif)
        spoof_mu = approximate_mu(spoof_matrix, assumed_copies, G)
        Ne = estimate_Ne(spoof_matrix, spoof_mu, n, bio_ic)
        print("%s %s" % ("chose Ne:", Ne))
    # NOTE(review): the chain uses mu fitted to the motif's matrix, not the
    # spoof matrix's mu used for estimating Ne above. Confirm this is intended.
    spoof_matrix, chain = sella_hirsch_mh(Ne=Ne, matrix=spoof_matrix, mu=mu, n=n)
    return spoof_matrix, chain, Ne
def restriction_of_range_loo_experiment(motif):
    """Can energy of a given position be predicted from energy of remaining bases?

    For each column ``j`` (printed as it is processed) and each site, two
    numbers are paired:

    * predicted: mean full-site score over the motif, minus the site's score
      with column ``j`` removed from both matrix and site;
    * observed: the site's score at column ``j`` alone.

    Returns:
        A flat list of ``(predicted, observed)`` tuples, ordered by column and
        then by site.
    """
    site_len = len(motif[0])
    mat = matrix_from_motif(motif)
    mean_ep = mean([score_seq(mat, site) for site in motif])
    results = []
    for j in range(site_len):
        print(j)
        rows_without_j = mat[:j] + mat[j + 1:]
        for site in motif:
            site_without_j = site[:j] + site[j + 1:]
            predicted = mean_ep - score_seq(rows_without_j, site_without_j)
            observed = score_seq([mat[j]], [site[j]])
            results.append((predicted, observed))
    return results
def spoof_motifs_occ(motif, num_motifs=10, trials=1, sigma=None,
                     Ne_tol=10**-4, double_sigma=True):
    """Sample motifs using an Ne chosen to match the motif's mean occupancy.

    NOTE: a second, identical definition of ``spoof_motifs_occ`` appears later
    in this file; being later, it is the one bound to the name after import.

    The mean of ``1 / (1 + exp(ep - mu))`` over the motif's own site scores is
    taken as the target occupancy, with ``mu`` from ``gle_approx_mu`` at
    ``10 * len(motif)`` copies. ``log_regress_spec2`` is then given the
    difference between ``expected_occupancy`` and that target, presumably to
    find the Ne where the difference vanishes.

    Args:
        motif: sequence of equal-length sites.
        num_motifs: how many motifs to sample.
        trials: not used by the body.
        sigma: if ``None``, read from ``pssm_from_motif(motif, pc=1)``.
        Ne_tol: not used by the body; the search tolerance is fixed at 10**-3.
        double_sigma: added to 1 to form the factor applied to sigma for
            ``expected_occupancy`` (sampling still uses the plain sigma).

    Returns:
        A list of ``num_motifs`` results of ``sample_motif``.
    """
    num_sites = len(motif)
    site_len = len(motif[0])
    copies = 10 * num_sites
    if sigma is None:
        sigma = sigma_from_matrix(pssm_from_motif(motif, pc=1))
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print("%s %s" % ("sigma:", sigma))
    mat = matrix_from_motif(motif)
    site_scores = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    bio_occ = mean([1 / (1 + exp(score - mu)) for score in site_scores])

    def occupancy_gap(Ne):
        return expected_occupancy(epsilon, Ne, site_len, copies) - bio_occ

    Ne = log_regress_spec2(occupancy_gap, [1, 10], tol=10**-3)
    sampled = []
    for _ in range(num_motifs):
        sampled.append(sample_motif(sigma, Ne, site_len, copies, num_sites))
    return sampled
def analyze_bio_motifs(Nes, trials=20):
    """Compare each E. coli motif's statistics with those of simulated chains.

    For every name in ``Escherichia_coli.tfs``, ``sella_hirsch_mh`` is run
    ``trials`` times with the motif's site count, site length, sigma and
    ``Nes[tf]``. For each run, ``motif_ic``, ``motif_gini`` and
    ``total_motif_mi`` are averaged over the last 1000 chain entries. The
    motif's own value is printed next to the mean and sd across runs.

    Args:
        Nes: mapping from tf name to the Ne passed to ``sella_hirsch_mh``.
        trials: number of chains per tf.

    Returns:
        Dict mapping tf name to
        ``(mean_ic, sd_ic, mean_gini, sd_gini, mean_mi, sd_mi)``.
    """
    results = {}
    for tf_idx, tf in enumerate(Escherichia_coli.tfs):
        Ne = Nes[tf]
        bio_motif = getattr(Escherichia_coli, tf)
        n = len(bio_motif)
        L = len(bio_motif[0])
        sigma = sigma_from_matrix(matrix_from_motif(bio_motif))
        runs = [sella_hirsch_mh(n=n, L=L, sigma=sigma, Ne=Ne, init='ringer')
                for _ in range(trials)]
        # Only the tail of each chain is summarised; the returned matrix is unused.
        tails = [chain[-1000:] for (_, chain) in runs]
        ics = [mean([motif_ic(m) for m in tail]) for tail in tails]
        ginis = [mean([motif_gini(m) for m in tail]) for tail in tails]
        mis = [mean([total_motif_mi(m) for m in tail]) for tail in tails]
        ic_mean, ic_sd = mean(ics), sd(ics)
        gini_mean, gini_sd = mean(ginis), sd(ginis)
        mi_mean, mi_sd = mean(mis), sd(mis)
        print("%s %s %s" % ("results for:", tf, tf_idx))
        print("%s %s %s" % (motif_ic(bio_motif), ic_mean, ic_sd))
        print("%s %s %s" % (motif_gini(bio_motif), gini_mean, gini_sd))
        print("%s %s %s" % (total_motif_mi(bio_motif), mi_mean, mi_sd))
        results[tf] = (ic_mean, ic_sd, gini_mean, gini_sd, mi_mean, mi_sd)
    return results
def main_experiment(samples=30, iterations=10000, delta_ic=0.1):
    """Build IC-matched spoof motifs for every tf in ``tfdf`` and summarise them.

    Motifs whose ``motif_ic`` is below 5 are skipped. For the rest, single-site
    chains are run with ``sella_hirsch_mh`` from a freshly sampled matrix. A
    spoof motif is assembled from as many random draws from the second half of
    the chain as the real motif has sites. It is kept only if its
    ``motif_ic`` is within ``delta_ic`` of the real motif's. This repeats until
    ``samples`` spoofs are collected.

    Args:
        samples: number of accepted spoof motifs required per tf.
        iterations: chain length passed to ``sella_hirsch_mh``; the first
            ``iterations // 2`` chain entries are never sampled.
        delta_ic: maximum absolute IC difference for accepting a spoof.

    Returns:
        Dict mapping tf name to a dict with keys ``"motif_ic"``,
        ``"motif_gini"`` and ``"total_motif_mi"``, each holding the list of
        that statistic over the accepted spoofs.
    """
    results_dict = {}
    for tf_idx, tf in enumerate(tfdf.tfs):
        print("%s %s" % ("starting on:", tf))
        motif = getattr(tfdf, tf)
        bio_ic = motif_ic(motif)
        if bio_ic < 5:
            print("%s %s %s" % ("excluding", tf, "for low IC"))
            continue
        n = len(motif)
        L = len(motif[0])
        matrix = matrix_from_motif(motif)
        sigma = sigma_from_matrix(matrix)
        mu = approximate_mu(matrix, n, G)
        Ne = estimate_Ne(matrix, mu, n, bio_ic)
        spoofs = []
        # Kept as a float so the acceptance rate below is a true ratio.
        spoof_trials = 0.0
        # Floor division: the value is a slice index and must stay an int.
        burn_in = iterations // 2
        while len(spoofs) < samples:
            spoof_trials += 1
            _, chain = sella_hirsch_mh(
                Ne=Ne, mu=mu, n=1, matrix=sample_matrix(L, sigma),
                init='ringer', iterations=iterations)
            # Slice once, not once per sampled site.
            post_burn_in = chain[burn_in:]
            candidate = concat(
                [random.choice(post_burn_in) for _ in range(n)])
            if abs(motif_ic(candidate) - bio_ic) < delta_ic:
                spoofs.append(candidate)
            # The number accepted so far is simply len(spoofs).
            print("%s %s %s %s %s" % (
                "spoof acceptance rate:", len(spoofs) / spoof_trials,
                len(spoofs), samples, spoof_trials))
        # Direct function references, not eval() on the names.
        stat_fns = (
            ("motif_ic", motif_ic),
            ("motif_gini", motif_gini),
            ("total_motif_mi", total_motif_mi),
        )
        results_dict[tf] = {
            fname: [fn(spoof) for spoof in spoofs]
            for fname, fn in stat_fns
        }
        print("%s %s %s" % (
            "finished:", tf, "(%s/%s)" % (tf_idx, len(tfdf.tfs))))
        print("%s %s" % (bio_ic, mean_ci(results_dict[tf]['motif_ic'])))
    return results_dict
def spoof_motifs_occ(motif,
                     num_motifs=10,
                     trials=1,
                     sigma=None,
                     Ne_tol=10**-4,
                     double_sigma=True):
    """Sample motifs using an Ne chosen to match the motif's mean occupancy.

    NOTE: this redefines an identical ``spoof_motifs_occ`` that appears
    earlier in this file.

    The target occupancy is the mean of ``1 / (1 + exp(ep - mu))`` over the
    scores of the motif's own sites, where ``mu`` comes from ``gle_approx_mu``
    with ``10 * len(motif)`` copies. ``log_regress_spec2`` receives the gap
    between ``expected_occupancy`` and that target, presumably to locate the
    Ne at which the gap is zero.

    Args:
        motif: sequence of equal-length sites.
        num_motifs: how many motifs to sample.
        trials: not used by the body.
        sigma: if ``None``, read from ``pssm_from_motif(motif, pc=1)``.
        Ne_tol: not used by the body; the search tolerance is fixed at 10**-3.
        double_sigma: added to 1 to form the factor applied to sigma for
            ``expected_occupancy`` (sampling still uses the plain sigma).

    Returns:
        A list of ``num_motifs`` results of ``sample_motif``.
    """
    N, L = len(motif), len(motif[0])
    copies = 10 * N
    if sigma is None:
        pssm = pssm_from_motif(motif, pc=1)
        sigma = sigma_from_matrix(pssm)
    epsilon = (1 + double_sigma) * sigma  # 15 Jan 2016
    print("%s %s" % ("sigma:", sigma))
    mat = matrix_from_motif(motif)
    eps = [score_seq(mat, site) for site in motif]
    mu = gle_approx_mu(mat, copies)
    occupancies = [1 / (1 + exp(ep - mu)) for ep in eps]
    bio_occ = mean(occupancies)

    def occupancy_gap(Ne):
        predicted = expected_occupancy(epsilon, Ne, L, copies)
        return predicted - bio_occ

    Ne = log_regress_spec2(occupancy_gap, [1, 10], tol=10**-3)
    spoofs = []
    for _ in range(num_motifs):
        spoofs.append(sample_motif(sigma, Ne, L, copies, N))
    return spoofs
def score_sd(motif):
    """Return ``sd`` of the scores the motif's own matrix gives to its sites."""
    mat = matrix_from_motif(motif)
    return sd([score_seq(mat, site) for site in motif])