def GC_analyzer(vis_flag, Seq, supth, infth): # fn="" # fasta=Fasta(fn) size = 20 if vis_flag: Seq_Analyzer(Seq).GC_window_analyzer_visual(size, supth, infth) _GC_ls, _tmp = Seq_Analyzer(Seq).GC_window_analyzer(size, supth, infth) print min(_GC_ls), max(_GC_ls), numpy.mean(_GC_ls), numpy.std(_GC_ls)
def search_tilling_primer(seg, ref, Tm, minLen, maxLen, disp, name, seq, n, length):
    """Pick the best forward/reverse primer pair for one tiling-PCR segment.

    seg:    segment sequence to search for end primers.
    ref:    reference sequence used for off-target screening.
    Tm:     target melting temperature handed to the primer finder.
    minLen/maxLen: allowed primer lengths.
    disp:   displacement window; primers are searched within 2*disp of the ends.
    name/n: used to build primer names "<name>_<n>_F" / "<name>_<n>_R".
    seq:    parent sequence used to report 1-based primer positions.
    length: desired product length used to score reverse-primer placement.

    Returns (f_primer, r_primer) where each is
    [name, sequence, Tm, 1-based position in seq, GC ratio]
    (candidate tuples are indexed as [seq, Tm, pos, GC] — matches how
    fp[0..3]/rp[0..3] are consumed below), or ([], []) on failure.
    """
    f_primer, r_primer = Seq_Analyzer(seg).Find_Primer_at_Ends_Tm(
        L_res=2 * disp, R_res=2 * disp, Tm=Tm, minLen=minLen, maxLen=maxLen)
    # --- choose forward primer: penalize distance from `disp` and GC != 50% ---
    score = 10.
    best_p = [-1, -1]
    for i in range(len(f_primer)):
        fp = f_primer[i]
        if off_target_binding(seq=fp[0], ref=ref, rc=False):
            continue
        pos_sc = 2 * ((fp[2] - disp) / disp) ** 2
        GC_sc = (fp[3] - .5) ** 2
        _sc = pos_sc + GC_sc
        if _sc < score:
            score = _sc
            best_p[0] = i
    if best_p[0] == -1:
        # BUG FIX: message said "Foward"
        print("%s_%d: Forward Primer Not FOUND at Tm=%d!\n" % (name, n, Tm))
        return [], []
    # BUG FIX: original used `fp = f_primer[i]` (the last candidate iterated),
    # not the best-scoring forward primer, so reverse-primer scoring and the
    # minimum-product check ran against the wrong forward primer.
    fp = f_primer[best_p[0]]
    # --- choose reverse primer: penalize deviation from target product length ---
    score = 10.
    for i in range(len(r_primer)):
        rp = r_primer[i]
        if off_target_binding(seq=rp[0], ref=ref, rc=True):
            continue
        if rp[2] < fp[2] + 100:  # minimal product > 100 bp
            continue
        pos_sc = ((fp[2] - rp[2] - length) / length) ** 2
        GC_sc = (rp[3] - .5) ** 2
        _sc = pos_sc + GC_sc
        if _sc < score:
            score = _sc
            best_p[1] = i
    if best_p[1] == -1:
        # BUG FIX: message said "FOUND" on the failure path
        print("%s_%d: Reverse Primer Not FOUND at Tm=%d!\n" % (name, n, Tm))
        return [], []
    rp = r_primer[best_p[1]]
    f_primer_name = "%s_%d_F" % (name, n)
    print(f_primer_name)
    r_primer_name = "%s_%d_R" % (name, n)
    # 1-based positions in the parent sequence (reverse primer located via
    # its reverse complement)
    f_primer_pos = seq.find(fp[0]) + 1
    r_primer_pos = seq.find(Seq_Analyzer(rp[0]).rcSeq()) + 1
    f_primer = [f_primer_name, fp[0], str(fp[1]), f_primer_pos, str(fp[3])]
    r_primer = [r_primer_name, rp[0], str(rp[1]), r_primer_pos, str(rp[3])]
    return f_primer, r_primer
def analyze_re_site(seq, re_ls, re_ls_II):
    """Export list of RE restriction sites.

    REQUIRE load_enzymes to provide RE lists: each entry is
    (name, recognition_site) with the site given in upper case.
    The plasmid is treated as circular: for each enzyme the sequence is
    extended by one site-length so sites spanning the origin are found.
    Note that TypeIIs enzymes cut outside the binding site; for those both
    the site and its reverse complement are searched.

    Returns (re_name_ls, re_site_ls): names of REs, and for each a list of
    0-based start positions in ascending order.

    BUG FIX: the original extended `seq` *in place* inside the loop
    (`seq = seq + seq[:len(re[1])]`), so every subsequent enzyme searched a
    progressively longer sequence with spurious duplicated regions, and the
    TypeIIs loop got whatever mutated sequence the first loop left behind.
    Each enzyme now searches its own circular extension of the original.
    """
    re_name_ls = []
    re_site_ls = []
    seq = seq.upper()  # hoisted: enzyme sites are compared in upper case
    for enzyme in re_ls:
        re_name_ls.append(enzyme[0])
        site = enzyme[1]
        circ = seq + seq[:len(site)]  # circular plasmid: wrap by one site-length
        re_site_ls.append(
            [i for i in range(len(circ)) if circ.startswith(site, i)])
    for enzyme in re_ls_II:
        re_name_ls.append(enzyme[0])
        site_f = enzyme[1]
        site_r = Seq_Analyzer(site_f).rcSeq()
        circ = seq + seq[:len(site_f)]
        site_ls_1 = [i for i in range(len(circ)) if circ.startswith(site_f, i)]
        site_ls_2 = [i for i in range(len(circ)) if circ.startswith(site_r, i)]
        re_site_ls.append(merge_ascend_ls(site_ls_1, site_ls_2))
    return re_name_ls, re_site_ls
def linkerPCR_primers():
    """Make linker-PCR primer pairs for every yeast chr01 chunk.

    For each chunk: the forward primer is the chunk's last 57 bp fused to
    the forward adapter, the reverse primer is the reverse complement of
    the chunk's first 58 bp fused to the reverse adapter.  Also writes one
    fasta per chunk containing chunk + vector backbone, and a CSV of all
    primer pairs.
    """
    chunks = "yeast_chr01_chunks.FA"
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/150218_Leslie/"
    bacbone = "pZX4_lin.fa"
    output = "liner_primers.csv"
    vector_fa = Fasta(dirn + bacbone)
    chunk_fa = Fasta(dirn + chunks)
    csv_out = open(dirn + output, "w")
    left, right = 58, 57
    reverse = "ggccggccccagcttttgttc"
    forward = "cggccggccctatagtgagtcg"
    csv_out.write("Name, Forward Primer, Reverse Primer\n")
    for idx, chunk_seq in enumerate(chunk_fa.Seqs):
        chunk_name = chunk_fa.Names[idx]
        fwd = chunk_seq[-right:] + forward
        rev = Seq_Analyzer(chunk_seq[:left]).rcSeq() + reverse
        # per-chunk fasta: chunk joined to the linearized vector
        combo_fp = open(dirn + "pZX4_" + chunk_name[:19] + ".fasta", "w")
        combo_fp.write(">%s\n" % chunk_name)
        combo_fp.write(chunk_seq + vector_fa.Seqs[0])
        combo_fp.close()
        csv_out.write("%s, %s, %s\n" % (chunk_name[:19], fwd, rev))
    csv_out.close()
def batch_PCR_Primers_at_End():
    """Batch-design end primers for every fasta record under `folder`.

    For each sequence, Find_Primer_at_Ends is asked for forward/reverse
    primers over the Tm ladder 52..58 C; every Tm bin that yielded a pair
    is written as one CSV row to `savefile`.
    """
    folder = "/workspace/Python/170116_SynIV/minichunks/"
    savefile = "/workspace/Python/170116_SynIV/primers.csv"
    L_res = 2
    R_res = 2
    Tm_ls = range(52, 59)
    minLen = 20
    maxLen = 50
    primer_ls = []
    _len = len(Tm_ls)
    for fn in os.listdir(folder):
        if "fasta" not in fn:  # guard clause instead of nested if
            continue
        fa = Fasta(folder + fn)
        for n in range(len(fa.Seqs)):
            seq = fa.Seqs[n]
            name = fa.Names[n]
            F_primer, R_primer, Tm_bin = Seq_Analyzer(seq).\
                Find_Primer_at_Ends(L_res, R_res, Tm_ls, minLen, maxLen)
            primer_ls.append([name, F_primer, R_primer, Tm_bin])
    # BUG FIX: output handle was opened and never closed; `with` guarantees
    # the buffered rows reach disk.
    with open(savefile, "w+") as output:
        for primer_sub_ls in primer_ls:
            for n in range(_len):
                if not primer_sub_ls[3][n]:  # Tm bin produced no primer pair
                    continue
                # [:-1] trims the record name's last char — presumably a
                # trailing newline from the fasta header; TODO confirm
                output.write(str(primer_sub_ls[0][:-1]) + ",")
                for i in [1, 2]:  # 1 = forward primer, 2 = reverse primer
                    for j in range(4):
                        output.write(str(primer_sub_ls[i][n][j]) + ",")
                output.write(str(primer_sub_ls[2][n][4]) + ",")
                output.write("\n")
def GC_tmp():
    """Scan each record of one fasta file for GC-content outliers.

    Runs a 20 bp sliding-window GC analysis (thresholds 0.70 / 0.40) on
    every sequence and renders the visual plot for any record that has at
    least one outlier window.
    """
    dirn = "/Users/xuz02/Downloads/"
    fn = "dra_mt.fa"
    # for fn in os.listdir(dirn):
    # if fn[-3:].upper() != "TXT" :
    # continue
    fasta = Fasta(dirn + fn)
    n = len(fasta.Seqs)
    for i in range(n):
        seq = fasta.Seqs[i]
        name = fasta.Names[i]
        # redundant init — both names are reassigned on the next statement
        GC_content, outlets = [], []
        GC_content, outlets = Seq_Analyzer(seq).GC_window_analyzer(
            20, 0.70, 0.40)
        if len(outlets) > 0:
            # NOTE(review): the visual pass uses a lower floor (0.30) than
            # the scan above (0.40) — confirm this asymmetry is intended.
            Seq_Analyzer(seq).GC_window_analyzer_visual(20, 0.70, 0.30, name)
def chr04_pcrtag_stat():
    """Stat the pcrtags over the minichunks.

    Reads (name, sequence) primer pairs from a flat CSV, normalizes each
    primer to the forward strand ("synF" names are taken as-is, others are
    reverse-complemented), then records every megachunk that contains the
    primer together with its 0-based position.
    """
    import re
    dirn = "/workspace/Python/161212_megachunk_csPCR/"
    primers = "PCRtags_syn.csv"
    mega = "synIV_mega.fa"
    output = "chr04_pcrtag_stat.csv"
    mega_f = Fasta(dirn + mega)
    primers_fp = open(dirn + primers, "r")
    op = open(dirn + output, "w")
    csv = primers_fp.read()
    primers_fp.close()  # BUG FIX: input handle was never closed
    # flat token stream alternating name, sequence
    primer_ls = re.split("\r|,", csv)
    size = len(primer_ls) // 2
    stats = []
    for i in range(size):
        p_name = primer_ls[2 * i]
        p_raw = primer_ls[2 * i + 1]
        if "synF" in p_name:
            p_seq = p_raw.upper()
        elif "\n" not in p_raw:
            p_seq = Seq_Analyzer(p_raw).rcSeq().upper()
        else:
            # BUG FIX: original fell through here and silently reused the
            # previous iteration's p_seq (or crashed on the first row)
            continue
        for j in range(len(mega_f.Names)):
            sq = mega_f.Seqs[j].upper()
            if p_seq in sq:
                _pos = sq.find(p_seq)
                # BUG FIX: row previously stored the primer name twice;
                # store name + matched sequence — presumably the intent
                stats.append([mega_f.Names[j], p_name, p_seq, _pos])
        print("%s done!" % p_name)
    for item in stats:
        op.write("%s, %s, %s, %d\n" % (item[0], item[1], item[2], item[3]))
    op.close()
    mega_f.close()  # NOTE(review): assumes Fasta exposes close() — confirm
def minichunk_for_Twist():
    """Write GC and repeat statistics for each Twist-order sequence.

    For every record: 100 bp window GC stats (min/max/mean/std) and a
    repetitive-sequence report, appended to minichunk_stat.csv as
    semicolon-separated fields (5 GC fields per row; repeat records are
    terminated by a ";;" sentinel).
    """
    dirn = "/Users/Zhuwei/Google_Drive/Project Data/ORDERS/Twist/"
    stat_name = "minichunk_stat.csv"
    GC_stat = []
    rep_stat = []
    stat_fn = open(dirn + stat_name, "w")
    fasta = Fasta("/Users/Zhuwei/Documents/Order_Twist_140607.fasta")
    for _n in range(fasta.size):
        # NOTE(review): assumes the single-argument GC_window_analyzer(100)
        # returns a flat list of per-window GC ratios; the thresholded form
        # used elsewhere returns a (values, outliers) pair — confirm.
        _GC_ls = Seq_Analyzer(fasta.Seqs[_n]).GC_window_analyzer(100)
        print("pass GC\n")
        GC_stat.extend([fasta.Names[_n], min(_GC_ls), max(_GC_ls),
                        numpy.mean(_GC_ls), numpy.std(_GC_ls)])
        _rep_ls = Seq_Analyzer(fasta.Seqs[_n]).Repetitive_Seq_analyzer(3, 4, 2)
        print("pass rep\n")
        rep_stat.append(fasta.Names[_n])
        rep_stat.extend(_rep_ls)
        rep_stat.append(";;")  # sentinel: end of one record's repeat list
    # (removed a large commented-out copy of the loop above that scanned
    # every fasta in dirn — dead code)
    for _x in range(len(GC_stat)):
        stat_fn.write(str(GC_stat[_x]) + ";")
        if _x % 5 == 4:  # 5 fields per record: name, min, max, mean, std
            stat_fn.write("\n")
    for _x in rep_stat:
        if _x != ";;":
            stat_fn.write(str(_x) + ";")
        else:
            stat_fn.write(";\n")
    stat_fn.close()  # BUG FIX: handle was never closed
def findSeq(self):
    """Find every occurrence of the query typed in findSeqLine within the
    currently loaded sequence, highlight the hits in the sequence view and
    list their positions in the selection box."""
    query = str(self.findSeqLine.text())
    print(query)
    if not str(self.currentSeq):
        return  # nothing loaded — nothing to search
    self.seqFindLst = Seq_Analyzer(self.currentSeq).Degen_searcher(query)
    position_text = "".join(str(hit) + " ; " for hit in self.seqFindLst)
    self.SeqEdit.hlightRegion(self.seqFindLst, len(query))
    self.SelectEdit.setText(position_text)
def Chara_analyzer():
    """Count nucleotide / dinucleotide word occurrences per chromosome.

    For every record in the S288C reference, counts each word in `word`
    (a label mapped to one or more concrete sites) via Degen_searcher and
    writes "name,label,count" rows to a CSV.
    """
    # NOTE(review): the original built these paths with backslash string
    # continuations that likely embedded stray indentation into the path;
    # written here as single clean literals — confirm against the real dirs
    # (note the inconsistent "JIngchuan"/"Jingchuan" casing, kept as-is).
    fasta = Fasta(
        "/Users/Zhuwei/Google_Drive/workspace/Python/141008_JIngchuan/"
        "S288C_reference_genome_R64-1-1_20110203"
        "/S288C_reference_sequence_R64-1-1_20110203.fsa")
    output = ("/Users/Zhuwei/Google_Drive/workspace/Python/141008_Jingchuan/"
              "chara_stat_4.csv")
    fp = open(output, 'w')
    N = len(fasta.Names)
    # label -> list of concrete sites contributing to that label's count
    word = {
        "C": ["C"],
        "G": ["G"],
        "A": ["A"],
        "T": ["T"],
        "CpG": ["CG"],
        "CpT": ["CT"],
        "ApG": ["AG"],
        "CpC": ["CC"],
        "GpG": ["GG"],
        "CpA": ["CA"],
        "TpG": ['TG'],
        "CCGG": ["CCGG"]
    }
    chara_sites = []
    for i in range(N):
        seq = Seq_Analyzer(fasta.Seqs[i])
        name = fasta.Names[i]
        for idx in word:
            count = 0
            for site in word[idx]:
                site_lst = seq.Degen_searcher(site)
                count += len(site_lst)
                # BUG FIX: append moved inside this loop — the original
                # appended after it, recording only the last site of each
                # word list
                chara_sites.append([name, site, len(site_lst), site_lst])
            fp.write(name + "," + idx + "," + str(count) + "\n")
    fp.close()  # BUG FIX: handle was never closed
def off_target_binding(seq, ref, rc=False):
    """Return True if primer `seq` can bind `ref` anywhere beyond its one
    intended site.

    seq: primer sequence (5'->3').
    ref: reference sequence to screen against.
    rc:  True for a reverse primer (its intended site in ref is the
         primer's reverse complement), False for a forward primer.

    Strategy: any hit on the unexpected strand is off-target; on the
    expected strand, the first (intended) occurrence is excised and any
    remaining exact hit counts as off-target.  Finally easy_blast screens
    both orientations for inexact binding.
    """
    seq_r = Seq_Analyzer(seq).rcSeq()
    if rc:
        if seq in ref:  # reverse primer must not match the + strand directly
            return True
        pos = ref.find(seq_r)
        # BUG FIX: original sliced unconditionally; with pos == -1 (intended
        # site absent) ref[:pos] + ref[pos + len(seq):] corrupted ref before
        # the checks below
        if pos != -1:
            ref = ref[:pos] + ref[pos + len(seq):]
        if seq_r in ref:  # a second expected-strand hit remains => off-target
            return True
    else:
        if seq_r in ref:  # forward primer must not match the - strand
            return True
        pos = ref.find(seq)
        if pos != -1:
            ref = ref[:pos] + ref[pos + len(seq):]
        if seq in ref:
            return True
    # inexact-binding screen in both orientations
    if easy_blast(seq=seq, ref=ref):
        return True
    if easy_blast(seq=seq_r, ref=ref):
        return True
    return False
def minichunk_csPCR_primers_two_pairs(dirn,
                                      minichunk,
                                      chunk,
                                      Tm,
                                      ref="",
                                      Lim=150,
                                      pcr=300):
    """ Function to make junction primers for csPCR.

    NAME OF THE MINICHUNKS = CHUNKNAME.01-0X
    Two pairs of csPCR primers are generated per chunk: one spanning the
    junction between minichunks #1 and #2 ("junc12") and one spanning the
    junction between minichunks #2 and #3 ("junc23").

    dirn:      working dir
    minichunk: name of the multi fasta file with all the minichunks
    chunk:     multi fasta of the assembled chunks (one per minichunk trio)
    ref:       reference genome fasta (first record is used)
    Tm:        expected primer Tm
    Lim:       search limit passed to the primer finders
    pcr:       upper limit of the PCR product length at each junction side

    Writes csPCR_primers1.csv in dirn; returns None.
    """
    # upper limit of the length of the PCR product at each site
    half_PCR_length = pcr
    m_fasta = Fasta(dirn + minichunk)
    ref_fasta = Fasta(dirn + ref)
    c_fasta = Fasta(dirn + chunk)
    output_fn = "csPCR_primers1.csv"
    output = open(dirn + output_fn, "w")
    # bucket minichunks by the trailing digit of their name (.01/.02/.03)
    first_minichunk = []
    second_minichunk = []
    third_minichunk = []
    num_minichunks = len(m_fasta.Seqs)
    for i in range(num_minichunks):
        print m_fasta.Names[i][-1]  # debug: which bucket this record goes to
        if m_fasta.Names[i][-1] == "1":
            first_minichunk.append([m_fasta.Seqs[i], m_fasta.Names[i]])
        elif m_fasta.Names[i][-1] == "2":
            second_minichunk.append([m_fasta.Seqs[i], m_fasta.Names[i]])
        elif m_fasta.Names[i][-1] == "3":
            third_minichunk.append([m_fasta.Seqs[i], m_fasta.Names[i]])
        else:
            continue
    # each chunk needs exactly one of each minichunk to form both junctions
    if (len(first_minichunk) != len(second_minichunk)
            or len(first_minichunk) != len(third_minichunk)):
        print("Not enough minichunks to make 2 pairs of csPCR primers!\n")
        return
    # forward primers at minichunk #1 (tail region, pointing into junction 1-2)
    # primer record layout: [name, seq, pos, len, GC, flag_GC, flag_multi, Tm]
    # — presumably, based on the CSV header below; confirm against
    # Find_Forward_Primer's return
    forward_12 = []
    for minichunk in first_minichunk:
        seq = minichunk[0][-half_PCR_length:]
        primer = []
        primer_name = minichunk[1][:-3] + "_F_junc12"
        primer.append(primer_name)
        primer.extend(
            Seq_Analyzer(seq).Find_Forward_Primer(Tm,
                                                  Lim,
                                                  crossref=ref_fasta.Seqs[0]))
        if len(primer) == 1:  # finder returned nothing — pad with placeholders
            print("%s FAILED!\n" % (primer_name))
            primer.extend(["", -1, -1, -1, -1, -1])
        forward_12.append(primer)
    # minichunk #2 supplies the reverse primer of junction 1-2 (head region)
    # and the forward primer of junction 2-3 (tail region)
    reverse_12 = []
    forward_23 = []
    for minichunk in second_minichunk:
        seqF = minichunk[0][-half_PCR_length:]
        seqR = minichunk[0][:half_PCR_length]
        primerF, primerR = [], []
        primer_name_F = minichunk[1][:-3] + "_F_junc23"
        primer_name_R = minichunk[1][:-3] + "_R_junc12"
        primerF.append(primer_name_F)
        primerR.append(primer_name_R)
        primerF.extend(
            Seq_Analyzer(seqF).Find_Forward_Primer(Tm,
                                                   Lim,
                                                   crossref=ref_fasta.Seqs[0]))
        primerR.extend(
            Seq_Analyzer(seqR).Find_Reverse_Primer(Tm,
                                                   Lim,
                                                   crossref=ref_fasta.Seqs[0]))
        if len(primerF) == 1:
            print("%s FAILED!\n" % (primer_name_F))
            primerF.extend(["", -1, -1, -1, -1, -1])
        if len(primerR) == 1:
            print("%s FAILED!\n" % (primer_name_R))
            primerR.extend(["", -1, -1, -1, -1, -1])
        reverse_12.append(primerR)
        forward_23.append(primerF)
    # reverse primers at minichunk #3 (head region, closing junction 2-3)
    reverse_23 = []
    for minichunk in third_minichunk:
        seq = minichunk[0][:half_PCR_length]
        primer = []
        primer_name = minichunk[1][:-3] + "_R_junc23"
        primer.append(primer_name)
        primer.extend(
            Seq_Analyzer(seq).Find_Reverse_Primer(Tm,
                                                  Lim,
                                                  crossref=ref_fasta.Seqs[0]))
        if len(primer) == 1:
            print("%s FAILED!\n" % (primer_name))
            primer.extend(["", -1, -1, -1, -1, -1])
        reverse_23.append(primer)
    # Verify the PCR products
    size = len(forward_12)
    if size != len(c_fasta.Seqs):
        print("# of reference chunks(%d) != # of primer request (%d)" %
              (len(c_fasta.Seqs), size))
        return
    refseq = ref_fasta.Seqs[0]
    output.write("Name, Sequence, 5' at minichunk, length, GC ratio\
, flag_GC, flag_multi, Tm, 5' at chunk, 5' at genome, PCR at chunk\
, PCR at genome\n")
    for n in range(size):
        pf12 = forward_12[n]
        pr12 = reverse_12[n]
        pf23 = forward_23[n]
        pr23 = reverse_23[n]
        chunkseq = c_fasta.Seqs[n]
        # 1-based 5' positions on the assembled chunk; reverse primers are
        # located via their reverse complement, so the 5' end sits at
        # find() + primer length
        loc_f12 = chunkseq.find(pf12[1]) + 1
        loc_f23 = chunkseq.find(pf23[1]) + 1
        loc_r12 = chunkseq.find(Seq_Analyzer(pr12[1]).rcSeq()) + len(pr12[1])
        loc_r23 = chunkseq.find(Seq_Analyzer(pr23[1]).rcSeq()) + len(pr23[1])
        # same positions on the reference genome
        gen_f12 = refseq.find(pf12[1]) + 1
        gen_f23 = refseq.find(pf23[1]) + 1
        gen_r12 = refseq.find(Seq_Analyzer(pr12[1]).rcSeq()) + len(pr12[1])
        gen_r23 = refseq.find(Seq_Analyzer(pr23[1]).rcSeq()) + len(pr23[1])
        # predicted product sizes on chunk and on genome
        len_pcr_12 = loc_r12 - loc_f12
        len_pcr_23 = loc_r23 - loc_f23
        len_gen_12 = gen_r12 - gen_f12
        len_gen_23 = gen_r23 - gen_f23
        pf12.extend([loc_f12, gen_f12, len_pcr_12, len_gen_12])
        pf23.extend([loc_f23, gen_f23, len_pcr_23, len_gen_23])
        pr12.extend([loc_r12, gen_r12, len_pcr_12, len_gen_12])
        pr23.extend([loc_r23, gen_r23, len_pcr_23, len_gen_23])
        primers = [pf12, pr12, pf23, pr23]
        for p in primers:
            for t in p:
                output.write(str(t) + " ,")
            output.write("\n")
    output.close()