def find_flanking_regions(start, stop, strand, site, fna_filename, offset=100):
    """Return the sequences flanking the site located at genome[start:stop].

    Args:
        start: index of the first base of the site in the loaded genome.
        stop: index one past the last base of the site.
        strand: -1 selects the reverse-strand handling; any other value is
            treated as the forward strand.
        site: the site sequence, written in its own orientation.
        fna_filename: file name handed to genome_from_fna to load the genome.
        offset: maximum number of bases taken on each side (default 100,
            the value that used to be hard-coded).

    Returns:
        (ufr, dfr): the upstream and downstream flanks.  When strand == -1
        the two flanks are swapped and each is passed through wc, so they
        are expressed relative to the site's own orientation.

    Raises:
        AssertionError: if the flanks and the site do not reassemble into a
            substring of the genome.
    """
    genome = genome_from_fna(fna_filename)
    # Clamp at 0: a negative slice bound is counted from the END of the
    # sequence, so a site closer than `offset` to the start would otherwise
    # get an empty (or wrong) upstream flank instead of a truncated one.
    upstream_begin = max(0, start - offset)
    ufr = genome[upstream_begin:start]
    dfr = genome[stop:stop + offset]  # slicing past the end just truncates
    if strand == -1:
        # Swap and transform the flanks so that "upstream"/"downstream" are
        # relative to the site as written.
        ufr, dfr = wc(dfr), wc(ufr)
        assert wc(ufr + site + dfr) in genome, \
            "reverse-strand site and flanks not found in genome"
    else:
        assert ufr + site + dfr in genome, \
            "forward-strand site and flanks not found in genome"
    return ufr, dfr
def cumsum_test():
    """Plot cumulative read density with positions ranked by ArcA motif score.

    Read density from both strands is summed, positions are ordered from
    highest to lowest by (a) the combined log score and (b) the normalised
    exponentiated score, and the cumulative density under each ordering is
    plotted.  Five shuffled copies of the density are plotted in red as
    controls.  Relies on the module-level names G and genome; shows the
    figure and returns nothing.
    """
    reads = get_arca_reads(1000000)
    strand_density = density_from_reads(reads, G)
    arca_pssm = make_pssm(Escherichia_coli.ArcA)
    combined_density = strand_density[0] + strand_density[1]

    print("fwd_scores")
    forward = score_genome_np(arca_pssm, genome)
    print("rev_scores")
    reverse = score_genome_np(arca_pssm, wc(genome))

    # Combine both strands in log space, then normalise to probabilities.
    log_scores = np.log(np.exp(forward) + np.exp(reverse))
    weights = np.exp(log_scores)
    boltzmann = weights / np.sum(weights)

    print("sorting scores")
    by_score = sorted_indices(log_scores)[::-1]  # greatest to least
    print("sorting probs")
    by_prob = sorted_indices(boltzmann)[::-1]  # greatest to least

    plt.plot(cumsum(rslice(combined_density, by_score)), label="scores")
    plt.plot(cumsum(rslice(combined_density, by_prob)), label="boltzmann probs")

    # Controls: the same density values in random order.
    shuffled = list(combined_density)
    n_controls = 5
    for trial in range(n_controls):
        print(trial)
        random.shuffle(shuffled)
        plt.plot(cumsum(shuffled), color='r')

    plt.legend(loc=0)
    # NOTE(review): axis limits of (0, 1) presumably assume normalised
    # inputs/outputs from the helpers above -- confirm.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()
def update_scores_np(fwd_scores, rev_scores, fwd_i, fwd_j, dw, w, genome):
    """Return score arrays updated for a change of dw in one matrix entry.

    Args:
        fwd_scores: per-position scores for the forward orientation.
        rev_scores: per-position scores for the reverse orientation.
        fwd_i: offset within the window of the changed entry.
        fwd_j: value looked up in the inverse of base_index to find the
            affected base (presumably a column index -- confirm).
        dw: amount added wherever the affected base is present.
        w: window width; the reverse offset is w - fwd_i - 1.
        genome: sequence of single-character bases.

    Returns:
        (new_fwd_scores, new_rev_scores).  The inputs are not modified in
        place; new objects are produced by the additions.
    """
    # Invert base_index so the base can be looked up from fwd_j.
    inverse_index = {v: k for (k, v) in base_index.items()}
    rel_fwd_base = inverse_index[fwd_j]
    rel_rev_base = wc(rel_fwd_base)
    rev_i = w - fwd_i - 1

    # Build the character array once; np.roll returns a shifted copy, so the
    # same array can safely feed both computations.
    genome_arr = np.array(list(genome))

    # After rolling by -k, index p holds genome[(p + k) % len(genome)], so
    # each mask marks the positions whose base at offset k is the affected
    # one (wrapping around the end of the sequence).
    fwd_dscores = (np.roll(genome_arr, -fwd_i) == rel_fwd_base) * dw
    rev_dscores = (np.roll(genome_arr, -rev_i) == rel_rev_base) * dw
    return fwd_scores + fwd_dscores, rev_scores + rev_dscores
def arca_motif_comparison():
    """Overlay read density, known-site positions and motif scores on one plot.

    Plots the two per-strand read-density tracks, an indicator track that is
    1 at every position where a site from Escherichia_coli.ArcA (or its wc
    counterpart) matches the genome, and the combined log score track.
    Relies on the module-level names G and genome; returns nothing.
    """
    reads = get_arca_reads()
    strand_density = density_from_reads(reads, G)
    arca_pssm = make_pssm(Escherichia_coli.ArcA)
    plt.plot(strand_density[0])
    plt.plot(strand_density[1])

    forward, reverse = score_genome_np(arca_pssm, genome)
    combined_scores = np.log(np.exp(forward) + np.exp(reverse))

    # Every site together with its wc counterpart, flattened by concat.
    patterns = concat([(site, wc(site)) for site in Escherichia_coli.ArcA])

    # Indicator track: 1 at the start index of every regex match.
    hit_track = np.zeros(G)
    for pattern in patterns:
        for hit in re.finditer(pattern, genome):
            hit_track[hit.start(0)] = 1

    plt.plot(hit_track)
    plt.plot(combined_scores)
def find_site(site, fna_filename, return_all=False):
    """Locate a site on either strand of the genome stored in fna_filename.

    The genome is searched for `site` (strand +1) and for `wc(site)`
    (strand -1); both searches report coordinates in the loaded genome.

    Args:
        site: sequence to search for; it is compiled as a regular expression.
        fna_filename: file name handed to genome_from_fna to load the genome.
        return_all: when true, always return the full list of matches.

    Returns:
        With return_all false: head(matches) -- a (start, stop, strand)
        tuple -- when the site occurs at exactly one location, otherwise
        (None, None, None).
        With return_all true: the list of all (start, stop, strand) tuples,
        forward matches first (empty when nothing matched).
    """
    genome = genome_from_fna(fna_filename)
    fwd_regexp = re.compile(site)
    rev_regexp = re.compile(wc(site))
    fwd_matches = [m.span() + (+1,) for m in fwd_regexp.finditer(genome)]
    rev_matches = [m.span() + (-1,) for m in rev_regexp.finditer(genome)]
    matches = fwd_matches + rev_matches

    # A site that equals wc(site) matches both patterns at the same span.
    # Count distinct genomic locations so that a single occurrence of such
    # a site is still recognised as unique instead of "multiple".
    locations = set((start, stop) for (start, stop, _strand) in matches)

    if len(locations) == 1:
        print("found unique match for %s in %s" % (site, fna_filename))
        return matches if return_all else head(matches)
    elif len(locations) > 1:
        print("found multiple matches for %s in %s" % (site, fna_filename))
        return matches if return_all else (None, None, None)
    else:
        print("couldn't find' match for %s in %s" % (site, fna_filename))
        return [] if return_all else (None, None, None)
def model_f(dinuc):
    """Look up the twist/tilt/roll/rise parameters for a dinucleotide.

    The module-level `model` mapping is consulted: model["oligo"] is a
    whitespace-separated list of names, and model["twist"], model["tilt"]
    and model["roll"] are indexed by position in that list.  If `dinuc` is
    not listed, wc(dinuc) is looked up instead and the sign of tilt is
    flipped.  model["rise"] is copied through unindexed.

    Progress messages are printed along the way.

    Returns:
        dict with keys "twist", "tilt", "roll" and "rise".

    Raises:
        ValueError: if neither dinuc nor wc(dinuc) is listed.
    """
    print(dinuc)
    known = model["oligo"].split()
    if dinuc not in known:
        print("elsing")
        counterpart = wc(dinuc)
        print(counterpart)
        idx = known.index(counterpart)
        params = dict((name, model[name][idx]) for name in ("twist", "roll"))
        print("d")
        # Tilt changes sign when the lookup went through wc.
        params["tilt"] = -model["tilt"][idx]
    else:
        idx = known.index(dinuc)
        params = dict(
            (name, model[name][idx]) for name in ("twist", "tilt", "roll")
        )
    params["rise"] = model["rise"]
    print("returning d")
    return params
def find_site_ref(site, fna_filename):
    """Locate a site on either strand by also scanning wc(genome).

    Forward matches come from searching the genome for `site`.  Reverse
    matches come from searching wc(genome) for the same pattern; their
    spans are converted back to coordinates of the loaded genome, so both
    kinds of match are reported in the same coordinate system.

    Args:
        site: sequence to search for; it is compiled as a regular expression.
        fna_filename: file name handed to genome_from_fna to load the genome.

    Returns:
        head(matches) -- a (start, stop, strand) tuple -- when at least one
        match exists (forward matches are listed first), otherwise
        (None, None, None).  The full match list is printed as well.
    """
    genome = genome_from_fna(fna_filename)
    regexp = re.compile(site)
    genome_len = len(genome)
    fwd_matches = [m.span() + (+1,) for m in regexp.finditer(genome)]
    # A span [s, e) found in wc(genome) corresponds to [len - e, len - s)
    # in the genome itself (wc is taken to be the reverse complement, so
    # positions are mirrored).  Reporting the raw span would give indices
    # counted from the wrong end.
    rev_matches = [
        (genome_len - m.end(), genome_len - m.start(), -1)
        for m in regexp.finditer(wc(genome))
    ]
    matches = fwd_matches + rev_matches
    print(matches)
    if len(matches) == 1:
        print("found unique match for %s in %s" % (site, fna_filename))
        return head(matches)
    elif len(matches) > 1:
        print("found multiple matches for %s in %s" % (site, fna_filename))
        return head(matches)
    else:
        print("couldn't find' match for %s in %s" % (site, fna_filename))
        return (None, None, None)