def rolling_window_correlation_exp():
    """What is the radius of the rolling average window that maximizes
    the correlation between TRAP probabilities, chIP-seq read intensities?

    Slides the AraC PSSM over the E. coli genome, turns the resulting
    TRAP energies into Boltzmann-weighted probabilities (normalised by
    the sum over all positions), and computes the Spearman correlation
    between the first 10**5 probabilities and the circular rolling
    average of the first 10**5 entries of chip_seq_data, for every
    window radius in range(250, 750, 25).

    Takes no arguments and returns None.  The correlation-vs-radius
    curve is plotted and saved to
    "correlation_vs_rolling_average_radius.png".
    """
    from itertools import islice
    from textwrap import wrap

    n_positions = 10**5  # only this leading stretch enters the correlation

    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp
    # The normalising sum still runs over every position...
    z = sum(exp(-beta * ep) for ep in traps)
    # ...but only the first n_positions probabilities are ever used, so
    # the rest are not computed.
    probs = [exp(-beta * ep) / z for ep in islice(traps, n_positions)]
    corrs = [(k,
              spearmanr(probs,
                        circular_rolling_average(chip_seq_data[:n_positions],
                                                 k))[0])
             for k in verbose_gen(range(250, 750, 25))]
    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    # Leave headroom above the axes for the wrapped, multi-line title.
    plt.subplots_adjust(top=0.8)
    title = ("Spearman Correlation vs. Radius of Rolling Average for TRAP "
             "probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of "
             "E. coli genome")
    plt.title("\n".join(wrap(title)))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # NOTE(review): this trailing note was an unterminated string literal
    # in the visible source; it is closed here after the visible text.
    # Confirm whether the original sentence continued.
    """Conclusion: correlation is maximized for window radius 400-500"""
def unflip_motif(motif):
    """Given a collection of possibly reverse complemented sites, unflip them.

    Every site is scored against a leave-one-out PSSM built from all the
    other sites of the input motif, once as given and once after passing
    it through wc (defined elsewhere in the file).  When the wc form
    scores strictly higher, it replaces the site in the returned copy.
    The site and both scores are printed for every site.

    Returns a slice copy of motif with the replacements applied; motif
    itself is not assigned to.
    """
    from sufficache import PSSM

    unflipped = motif[:]
    for idx, site in enumerate(motif):
        # The leave-one-out model is always built from the original
        # sites, never from entries already replaced in the copy.
        others = [other for (pos, other) in enumerate(motif) if pos != idx]
        loo_model = PSSM(others)
        forward = loo_model.score(site, both_strands=False)
        backward = loo_model.score(wc(site), both_strands=False)
        # Single-argument parenthesised prints emit the same text as the
        # Python 2 print statement.
        print(site)
        print("%s %s" % (forward, backward))
        if backward > forward:
            unflipped[idx] = wc(site)
    return unflipped
def hill_coefficient_exp(tf_name, approx=False):
    """What is the effective hill coefficient of a binding site?

    Looks up the motif named tf_name on Escherichia_coli, builds a PSSM
    from it and, for each site, evaluates fermi_dirac at the site's TRAP
    energy for every value in the module-level mus.  The resulting curve
    is plotted against the copy-number data and passed to
    fit_hill_function.

    tf_name -- attribute name of the motif on Escherichia_coli; also the
               key into copy_dict.
    approx  -- when true, use copy_dict[tf_name][1] (approximate copies)
               as the x values instead of copy_dict[tf_name][0].

    Prints site, site.operon, n and x_k for each site, and returns the
    pair (ns, x_ks) of per-site fit results in motif order.
    """
    motif = getattr(Escherichia_coli, tf_name)
    pssm = PSSM(motif)
    real_copies, approx_copies = copy_dict[tf_name][0], copy_dict[tf_name][1]
    if approx:
        copies = approx_copies
    else:
        copies = real_copies

    ns = []
    x_ks = []
    for site in motif:
        site_energy = pssm.trap(site)
        fd_values = [fermi_dirac(site_energy, mu) for mu in mus]
        plt.plot(copies, fd_values)
        x_k, n = fit_hill_function(copies, fd_values)
        print("%s %s %s %s" % (site, site.operon, n, x_k))
        x_ks.append(x_k)
        ns.append(n)
    # plt.semilogx() / plt.show() were left disabled by the author.
    return ns, x_ks
def rolling_window_correlation_exp():
    """What is the radius of the rolling average window that maximizes
    the correlation between TRAP probabilities, chIP-seq read intensities?

    Slides the AraC PSSM over the E. coli genome, turns the resulting
    TRAP energies into Boltzmann-weighted probabilities (normalised by
    the sum over all positions), and computes the Spearman correlation
    between the first 10**5 probabilities and the circular rolling
    average of the first 10**5 entries of chip_seq_data, for every
    window radius in range(250, 750, 25).

    Takes no arguments and returns None.  The correlation-vs-radius
    curve is plotted and saved to
    "correlation_vs_rolling_average_radius.png".
    """
    from itertools import islice
    from textwrap import wrap

    n_positions = 10**5  # only this leading stretch enters the correlation

    pssm = PSSM(arac_motif)
    genome = get_ecoli_genome()
    traps = pssm.slide_trap(genome)
    beta = 1.61  # approximately = 1/(kBT) in kcal/mol @ room temp
    # The normalising sum still runs over every position...
    z = sum(exp(-beta * ep) for ep in traps)
    # ...but only the first n_positions probabilities are ever used, so
    # the rest are not computed.
    probs = [exp(-beta * ep) / z for ep in islice(traps, n_positions)]
    corrs = [(k,
              spearmanr(probs,
                        circular_rolling_average(chip_seq_data[:n_positions],
                                                 k))[0])
             for k in verbose_gen(range(250, 750, 25))]
    plt.plot(*transpose(corrs))
    plt.xlabel("Rolling Window Radius (bp)")
    plt.ylabel(r"Spearman $\rho$")
    # Leave headroom above the axes for the wrapped, multi-line title.
    plt.subplots_adjust(top=0.8)
    title = ("Spearman Correlation vs. Radius of Rolling Average for TRAP "
             "probabilities, ChIP-seq reads (AraC a) on bp 1-10**5 of "
             "E. coli genome")
    plt.title("\n".join(wrap(title)))
    plt.savefig("correlation_vs_rolling_average_radius.png", dpi=400)
    # NOTE(review): this trailing note was an unterminated string literal
    # in the visible source; it is closed here after the visible text.
    # Confirm whether the original sentence continued.
    """Conclusion: correlation is maximized for window radius 400-500"""
return "".join([line.strip() for line in f.readlines()[1:]]) genome = get_mycobacterium_genome() #Site1 in smollett_data does not appear in Myco genome. (We use #strain H37Rv; they use strain 1424 which is derived from the former.) #For this reason, we need to revise the genome in order to stitch the site in. start_coordinate = (3811492 #start position of region listed in Table 1 +158) # position of strongest site, relative to # start position site1 = smollett_sites[1] revised_genome = subst(genome,site1,start_coordinate) model = PSSM(smollett_sites.values()) traps = model.slide_trap(revised_genome) exact_copies = [sum(fd_probs(traps,mu,beta)) for mu in verbose_gen(mus)] z = sum(exp(-beta*ep) for ep in traps) approx_copies = [approximate_copy_number_from_mu(traps,mu,z) for mu in verbose_gen(mus)] absolute_ns_energy = -8 #kBT = -5 kca/mol width = len(site1) ep_ns = 2*width + absolute_ns_energy #Assume binding energy is -2kbt/match offset = lambda ep:log(exp(-beta*ep) + exp(-beta*ep_ns))/-beta ns_traps = map(offset,traps) coordinates = [smollett_data[i][0] for i in range(1,25+1)] scores = [smollett_data[i][1] for i in range(1,25+1)] regions = [genome[start_pos:end_pos+18] for (start_pos,end_pos) in coordinates]
"""This script generates a binding landscape for each TF""" import sys sys.path.append("src/sufficache") from sufficache import PSSM sys.path.append("data/motifs") from motifs import * from chem_pot_utils import get_ecoli_genome from array import array if __name__ == "__main__": "usage: generate_binding_landscapys.py tf_name [control]" tf_name = sys.argv[1] control = len(sys.argv) == 3 and sys.argv[2] == "control" genome = get_ecoli_genome() if not control else random_site(len(get_ecoli_genome())) print "Generating %s landscape for %s " % ("Control" * control,tf_name) tf = getattr(Escherichia_coli,tf_name) pssm = PSSM(tf) binding_energies = pssm.slide_trap(genome) arr = array('f') arr.extend(binding_energies) if not control: fname = "results/binding_landscapes/%s_genome_binding_landscape.dat" % tf_name else: fname = "results/binding_landscapes/%s_control_binding_landscape.dat" % tf_name with open(fname,'w') as f: arr.tofile(f)