def main(fn):
    # read in the CAI table
    G = GeneticCode("euplotid_genetic_code.txt")
    G.read_CAI_table("euplotid_CAI_table.txt")
    c = 0
    for seq_record in SeqIO.parse(fn, "fasta"):
        if c > 100:
            break
        sequence = str(seq_record.seq)
        s = Sequence(sequence)
        s.set_genetic_code(G)
        # s.truncate()
        s.build_tree()
        # get the first stop in the first frame
        main_orf = ""
        for m, n in s.unique_stop_sequence:
            if m == 0 and n > 0:  # exclude the terminal frame markers e.g. (0,-1)
                main_orf = s.sequence[:n]
                break
        s.estimate_frameshift_CAI()
        with open(seq_record.id + ".cai", 'w') as f:
            if main_orf != "":
                t = Sequence(main_orf)
                t.set_genetic_code(G)
                t.estimate_CAI()
                print >> f, t.repr_as_row()
            for fs in s.frameshift_sequences:
                print >> f, s.frameshift_sequences[fs].repr_as_row()
        c += 1
def parseSeq(self, fileSeq):
    # =================================================
    # ==== TO ENABLE FOR THE HAND-IN ===
    # inputFile = input("fullPath of Sequence file : ")
    inputFile = fileSeq
    # =================================================
    f = open(inputFile, "r")
    initSeq = ">"
    seq = ""
    listSequences = []
    for line in f:
        if line[0] == initSeq:
            # append the previous element
            if len(seq) > 0:
                listSequences.append(Sequence(seq, name, ref, number))
            splittedLine = line.split("|")
            name = splittedLine[0][1:]
            ref = splittedLine[1]
            number = splittedLine[2]
            seq = ""
        else:
            seq += line
            if seq[-1] == "\n":
                seq = seq[:len(seq) - 1]
    # add the last sequence
    listSequences.append(Sequence(seq, name, ref, number))
    f.close()
    return listSequences
def main(gc):
    G = GeneticCode(gc)
    # G.build_CAI_table("/Users/paulkorir/Dropbox/Euplotes/FrameshiftPredictionData/E.crassus_CDS.fasta")
    G.build_CAI_table("/home/paul/bioinf/Resources/H_sapiens/H_sapiens_Ens75_CDS.fasta")
    # G.build_CAI_table("a_fasta.fa")
    G.write_CAI_table("CAI_tables/homo_CAI_table.txt")
    sys.exit(0)
    # G.write_CAI_table("euplotid_CAI_table.txt")
    # G.write_CAI_table("test_CAI_table.txt")
    G.read_CAI_table("euplotid_CAI_table.txt")
    # print G
    s = Sequence("TAGAGATACACTGACTTACTTTCAAATACTATAAAACGGAATAGCCTAAGAATGAAATAAAGTAAAACATGACCATCAGGAGAAAGTTGAACAACTAGAGAGGGAGAATATTAAGCTTTATGCCCAATTAAAAAAGCTTGCAAAAAGTGAAAGAAATCTAATGAAGAAACTAGACGAAAGAGACCGGGAGATAACCAATCTAAAAGATACAAACATGAGGTTCAATTACAAACTCAATAGAGCACTCTATGCTAATGAAGAGCTGCAAAATAAAGTAACTGAATCTGACTACAAACTTCAACAAAAAAGAGATGAATTTATGAAAGACATAGAGCAAACTAACCAAATCC")
    # s = Sequence("TCAAACCGAGACTTACTAAAGTTGATCATCATAAGACTC")
    s.set_genetic_code(G)
    s.estimate_CAI()
    # s.as_codons = True
    print s
    # print s.CAI_score
    s.truncate()
    print s
    s.build_tree()
    # print s.tree
    for i in xrange(3):
        print s.binary_frame(i, "")
    s.estimate_frameshift_CAI()
    for fs in s.frameshift_sequences:
        print s.frameshift_sequences[fs].repr_as_row()
def getD2StarWeight(self, seqA, seqB, k, r, flag, sequences, kmersetdic, weight, kmer_pro):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    # get the k-mer probabilities
    # Ma = markov.Markov()
    # kmerPA = {}
    # kmerPB = {}
    if flag == False:
        lisFeaA = Sq.getD2SCount(seqA, seqLis, k, r, flag, dic)
        lisFeaB = Sq.getD2SCount(seqB, seqLis, k, r, flag, dic)
        # kmerPA = Ma.get_Single_kmer_Pro(seqA, seqLis, k, r)
        # kmerPB = Ma.get_Single_kmer_Pro(seqB, seqLis, k, r)
    else:
        lisFeaA = Sq.getD2SCount(seqA, seqLis, k, r, flag, kmersetdic, kmer_pro)
        lisFeaB = Sq.getD2SCount(seqB, seqLis, k, r, flag, kmersetdic, kmer_pro)
        # kmerPA = Ma.get_Mul_kmer_Pro(seqA, sequences, k, r)
        # kmerPB = Ma.get_Mul_kmer_Pro(seqB, sequences, k, r)
    # compute the weighted D2Star statistic
    su = 0.0
    lenA = len(seqA)
    lenB = len(seqB)
    for key in lisFeaA.keys():
        su = su + (lisFeaA[key] * lisFeaB[key]) / math.sqrt(lenA * kmer_pro[key] * lenB * kmer_pro[key]) * weight[key]
    return 1 / (su + np.spacing(1))
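# Illustrative sketch (standalone, not part of the original module): a minimal version of
# the textbook D2* statistic that the loop above appears to accumulate, assuming getD2SCount
# returns expectation-centred k-mer counts. For each k-mer w the centred counts are
# X_w - n * p_w, and their product is normalised by sqrt(nA * p_w * nB * p_w). The function
# name and arguments below are hypothetical and only meant to make the formula concrete.
import math

def d2star_sketch(countsA, countsB, nA, nB, kmer_pro):
    # countsA/countsB: k-mer -> raw count; kmer_pro: k-mer -> background probability
    total = 0.0
    for w, p in kmer_pro.items():
        centredA = countsA.get(w, 0) - nA * p
        centredB = countsB.get(w, 0) - nB * p
        total += centredA * centredB / math.sqrt(nA * p * nB * p)
    return total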
def fetch_shot(self, m_shot_code):
    # use the sequence-matching regular expression here instead of hard-coding m_shot_code[0:5]
    matchobject = DBAccessGlobals.DBAccessGlobals.g_shot_regexp.search(m_shot_code)
    shot = None
    seq = None
    # make sure this file matches the shot pattern
    if not matchobject:
        raise ValueError("Shot name provided %s does not match regular expression!" % m_shot_code)
    else:
        shot = matchobject.groupdict()['shot']
        seq = matchobject.groupdict()['sequence']
    local_seq = Sequence.Sequence(seq, DBAccessGlobals.DBAccessGlobals.get_path_for_sequence(seq), -1)
    dbseq = self.fetch_sequence(seq)
    local_seq.g_dbid = dbseq.g_dbid
    shot_ret = Shot.Shot(shot, DBAccessGlobals.DBAccessGlobals.get_path_for_shot(shot), -1, local_seq, None, 1001, 1009, 1092, 1100, 84)
    shot_table = self.g_tinydb.table('Shot')
    shot_query = tinydb.Query()
    dbshot = shot_table.get(shot_query.code == m_shot_code)
    if dbshot:
        shot_ret.g_dbid = dbshot.doc_id
        # TinyDB documents behave like dicts, so read the stored fields by key
        shot_ret.g_task_template = dbshot['task_template']
        shot_ret.g_head_in = dbshot['sg_head_in']
        shot_ret.g_cut_in = dbshot['sg_cut_in']
        shot_ret.g_cut_out = dbshot['sg_cut_out']
        shot_ret.g_tail_out = dbshot['sg_tail_out']
        shot_ret.g_cut_duration = dbshot['sg_cut_duration']
    return shot_ret
def pcc(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, freq = Sq.getSeqfreq(seqLis, k, dic)
    # compute the means
    meanA = 0.0
    meanB = 0.0
    for i in range(freq.shape[1]):
        meanA = freq[0, i] + meanA
        meanB = freq[1, i] + meanB
    meanA = meanA / freq.shape[1]
    meanB = meanB / freq.shape[1]
    # compute the covariance
    cov = 0.0
    for i in range(freq.shape[1]):
        cov = cov + (freq[0, i] - meanA) * (freq[1, i] - meanB)
    cov = cov / (freq.shape[1] - 1)
    # compute the sample standard deviations
    stA = np.std(freq[0, :], ddof=1)
    stB = np.std(freq[1, :], ddof=1)
    # compute the Pearson correlation coefficient and return its reciprocal as a distance
    pcc = cov / (stA * stB)
    return abs(1 / pcc)
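# Illustrative sketch (standalone): the same Pearson correlation can be obtained directly
# from numpy, which is a handy cross-check for the hand-rolled loop above. The frequency
# rows here are made-up example values, not output of the module's getSeqfreq.
import numpy as np

freq = np.array([[0.2, 0.3, 0.5],
                 [0.1, 0.4, 0.5]])
r = np.corrcoef(freq[0, :], freq[1, :])[0, 1]
print(abs(1 / r))   # matches pcc()'s abs(1/pcc) for the same two frequency rows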
def main(fn):
    TM = TransitionMatrix()
    TM.read("euplotid_transition_matrix.pic")
    pdf = PdfPages("likelihood_profiles_test.pdf")
    b = 0  # count the ones that pass
    c = 0  # count all
    for seq_record in SeqIO.parse(fn, "fasta"):
        if c > 1000:
            break
        sequence = str(seq_record.seq)
        seq_name = seq_record.id
        s = Sequence(sequence=sequence, name=seq_name)
        s.truncate(effect_truncation=True, verbose=False)
        no_of_leaves = s.count_leaves()
        if no_of_leaves > 1000:
            print >> sys.stderr, "Complex tree with %s leaves...omitting." % no_of_leaves
            continue
        s.set_transition_matrix(TM)
        s.build_tree()
        s.get_frameshift_signals()
        s.estimate_likelihood()
        s.estimate_frameshift_likelihood()
        s.get_most_likely_frameshift()
        if s.most_likely_frameshift is not None:
            if 1 < len(s.most_likely_frameshift.path) < 4:
                # s.plot_differential_graded_likelihood(outfile=pdf, show_path_str=True)
                s.plot_differential_graded_likelihood()
                b += 1
        c += 1
    pdf.close()
    # use float division so the percentage is not truncated to zero under Python 2
    print >> sys.stderr, "Processed %d (of %d) sequences [%.2f%%]." % (b, c, 100.0 * b / c)
def main(fn, seq_name):
    TM = TransitionMatrix()
    TM.read("transition_matrices/euplotid_transition_matrix.pic")
    # find the sequence we're looking for
    found = False
    for seq_record in SeqIO.parse(fn, "fasta"):
        if seq_record.id == seq_name:
            sequence = str(seq_record.seq)
            s = Sequence(sequence=sequence, name=seq_name)
            s.truncate(effect_truncation=True, verbose=False)
            no_of_leaves = s.count_leaves()
            if no_of_leaves > 1000:
                print >> sys.stderr, "Complex tree with %s leaves...omitting." % no_of_leaves
                continue
            s.set_transition_matrix(TM)
            s.build_tree()
            s.get_frameshift_signals()
            s.estimate_likelihood()
            s.estimate_frameshift_likelihood()
            s.get_most_likely_frameshift()
            s.get_indexes()
            s.repr_frameshift_sites(include_nulls=False)
            # print s.most_likely_frameshift.path
            # print s.most_likely_frameshift.partial_gradients
            s.plot_differential_graded_likelihood(show_name=False, show_starts=False, show_ML=False)
            found = True
            break
    if not found:
        print >> sys.stderr, "Sequence %s was not found." % seq_name
def trans_SingleSeq_Matrix(self, sequence, sequences, r):
    if r <= 0:
        print("r <= 0: there is no transition matrix, please supply a positive order")
        return
    lis = []
    lis.append(sequence)
    # count the r-mer prefixes
    Sq = Sequence.Sequence()
    kmerset, dic = Sq.getSeqKerSet(sequences, r)
    dic0, count0 = Sq.getSeqCount(lis, r, dic)
    # drop the trailing r-mer, which has no successor
    dic0[0][sequence[-r:]] = dic0[0][sequence[-r:]] - 1
    # count the (r+1)-mers
    kmerset1, dic1 = Sq.getSeqKerSet(sequences, r + 1)
    dic2, count2 = Sq.getSeqCount(lis, r + 1, dic1)
    resultdic = dict.copy(dic2[0])
    for key in dic2[0].keys():
        if dic0[0][key[0:-1]] == 0:
            resultdic[key] = 0
        else:
            resultdic[key] = dic2[0][key] / dic0[0][key[0:-1]]
    return resultdic
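# Illustrative sketch (standalone, not part of the original module): the method above
# estimates order-r Markov transition probabilities by dividing (r+1)-mer counts by r-mer
# prefix counts, excluding the trailing r-mer because it has no successor. The hypothetical
# helper below does the same for a single sequence with plain dictionaries.
def markov_transitions_sketch(sequence, r=1):
    prefix_counts = {}
    pair_counts = {}
    # every (r+1)-mer contributes one observed transition prefix -> next symbol
    for i in range(len(sequence) - r):
        prefix = sequence[i:i + r]
        word = sequence[i:i + r + 1]
        prefix_counts[prefix] = prefix_counts.get(prefix, 0) + 1
        pair_counts[word] = pair_counts.get(word, 0) + 1
    # normalise each (r+1)-mer count by its prefix count
    return {w: float(n) / prefix_counts[w[:-1]] for w, n in pair_counts.items()}

# e.g. markov_transitions_sketch("ACGACGT", r=1) gives P(next symbol | current symbol) estimates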
def get_Mul_kmer_Pro(self, sequences, k, r):
    if r >= k:
        r = 0
    Sq = Sequence.Sequence()
    # k-mer set and dictionary over all sequences
    # kmerSet, dic = Sq.getSingleSeqKerSet(sequence, k)
    kmers, dic1 = Sq.getSeqKerSet(sequences, k)
    resultdic = dict.copy(dic1)
    # print("sad", resultdic)
    # initial probabilities
    initProdic = self.init_MUl_pro(sequences)
    if r == 0:
        for kmer in dic1.keys():
            pro = 1
            for i in range(len(kmer)):
                pro = initProdic[kmer[i]] * pro
            resultdic[kmer] = pro
    elif r < 0:
        print("r is set incorrectly; it cannot be less than 0")
    else:
        # order-r transition probabilities
        transdic = self.trans_MulSeq_Matrix(sequences, r)
        for kmer in dic1.keys():
            # initial probability of the first r symbols
            pro = 1
            for i in range(r):
                pro = initProdic[kmer[i]] * pro
            # chain the transition probabilities along the k-mer
            for loc in range(len(kmer) - r):
                pro = pro * transdic[kmer[loc:loc + r + 1]]
            resultdic[kmer] = pro
    return Sq.addfloat(resultdic)
def main(fasta_file):
    bad_sequence = [
        "comp1705_c0_seq1",
        "comp1716_c0_seq1",
        "comp1809_c0_seq1",
        "comp2102_c0_seq1",
        "comp2215_c0_seq1",
        "comp2215_c0_seq2",
        "comp2216_c0_seq1",
        "comp2216_c0_seq2"
    ]
    c = 0
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        if c > 10:
            break
        if seq_record.id.split(" ")[0] in bad_sequence:
            continue
        fname = seq_record.id.split(" ")[0] + ".fa"
        sequence = str(seq_record.seq)
        # first_start = sequence.find("ATG")
        # if first_start < 0:
        #     print >> sys.stderr, "Missing start codon in sequence %s" % seq_record.id
        #     continue
        # else:
        #     sequence = sequence[first_start:]
        s = Sequence(sequence)
        # s.truncate()
        s.build_tree()
        with open(fname, 'w') as f:
            for k in s.frameshift_sequences:
                F = s.frameshift_sequences[k]  # a FrameshiftSequence object
                print >> f, ">%s" % "|".join(map(lambda x: "%s:%s" % x, F.path)) + ";" + ",".join(F.signals)
                print >> f, F.frameshifted_sequence
        # increment the counter so the 10-sequence cap actually takes effect
        c += 1
def allignSequences(S, T):
    S = '-' + S
    T = '-' + T
    n = len(S)
    m = len(T)
    Seq, V = [], []
    # Defining the matrices
    for i in range(n):
        V.append([0.0] * m)
        lists = []
        for j in range(m):
            lists.append(Sequence())
        Seq.append(lists)
    # Initializing the matrices
    for i in range(n):
        V[i][0] = i * -1
        Seq[i][0].s = S[1:i + 1]
        Seq[i][0].t = '-' * i
    for j in range(m):
        V[0][j] = j * -1
        Seq[0][j].s = '-' * j
        Seq[0][j].t = T[1:j + 1]
    # Dynamic programming approach
    for i in range(1, n):
        for j in range(1, m):
            ch = getMax(i, j, S, T, Seq, V)
            putMax(ch, i, j, S, T, Seq, V)
    # Return result
    return (V[n - 1][m - 1], Seq[n - 1][m - 1], getMeasure(Seq[n - 1][m - 1]))
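# Illustrative sketch (standalone): the score part of the same global-alignment recurrence,
# assuming a unit gap penalty (as in the -i / -j boundary initialisation above) and a +1/-1
# match/mismatch score, which the module's getMax/putMax helpers may or may not use.
def alignment_score_sketch(S, T, match=1, mismatch=-1, gap=-1):
    n, m = len(S), len(T)
    V = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        V[i][0] = i * gap
    for j in range(1, m + 1):
        V[0][j] = j * gap
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            diag = V[i - 1][j - 1] + (match if S[i - 1] == T[j - 1] else mismatch)
            V[i][j] = max(diag, V[i - 1][j] + gap, V[i][j - 1] + gap)
    return V[n][m]

# e.g. alignment_score_sketch("ACGT", "AGT") == 2 under these assumed scores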
def test_MassUpdate():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    for step in Sq.Steps:
        St.Update(step.state0)
        St.Update(step.state1)
    assert St.len == 100
def main():
    s = 'AUGGAAGAAGAAGAAGAAGAAGAAGAAGAAGAAUAGGAAGAAGAAGAAGAAGAAGAAGAAGAAGAAGAAUAG'
    s = Sequence.Sequence(no_of_shifts=3, min_length=50, max_length=100)
    s.generate_frameshift_sequence()
    print s.info()
    print s
    print
    print get_stop_pos2(str(s))
def test_MassUpdate3():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    X = [step.state0 for step in Sq.Steps]
    r = [-1.5 for step in Sq.Steps]
    St.Update(X, reward=r)
    for i in range(St.len):
        St[i].reward = -1.5
    assert St.len == 99
def track(self, image):
    left = max(round(self.position[0] - float(self.window) / 2), 0)
    top = max(round(self.position[1] - float(self.window) / 2), 0)
    right = min(round(self.position[0] + float(self.window) / 2), image.shape[1] - 1)
    bottom = min(round(self.position[1] + float(self.window) / 2), image.shape[0] - 1)
    if right - left < self.template.shape[1] or bottom - top < self.template.shape[0]:
        return Sequence.Rectangle(self.position[0] + self.size[0] / 2, self.position[1] + self.size[1] / 2, self.size[0], self.size[1])
    cut = image[int(top):int(bottom), int(left):int(right)]
    matches = cv2.matchTemplate(cut, self.template, cv2.TM_CCOEFF_NORMED)
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(matches)
    self.position = (left + max_loc[0] + float(self.size[0]) / 2, top + max_loc[1] + float(self.size[1]) / 2)
    return Sequence.Rectangle(left + max_loc[0], top + max_loc[1], self.size[0], self.size[1])
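# Illustrative sketch (standalone, not part of the original tracker): how cv2.matchTemplate
# plus cv2.minMaxLoc locate a template inside a search window. With TM_CCOEFF_NORMED the best
# match is the maximum of the response map, which is exactly the offset that track() above
# adds to the window's top-left corner. The image and template here are synthetic.
import cv2
import numpy as np

image = np.random.randint(0, 256, (120, 160), dtype=np.uint8)
template = image[40:60, 70:100].copy()           # crop a known patch as the template
response = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
_, max_val, _, max_loc = cv2.minMaxLoc(response)
print(max_loc, max_val)                          # expected: (70, 40) with a score near 1.0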
def test_MassUpdate2():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    X = [step.state0 for step in Sq.Steps]
    Q = [[-1, -2, -3] for step in Sq.Steps]
    St.Update(X, Q=Q)
    for i in range(St.len):
        St[i].Q = [-1, -2, -3]
    assert St.len == 99
def test_Reset():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    for step in Sq.Steps:
        St.Update(step.state0)
        St.Update(step.state1)
    St.Reset()
    assert St[0] == None
    assert St.len == 0
def fetch_sequence(self, m_seq_code):
    seq_path = DBAccessGlobals.DBAccessGlobals.get_path_for_sequence(m_seq_code)
    seq_ret = Sequence.Sequence(m_seq_code, seq_path, -1)
    seq_table = self.g_tinydb.table('Sequence')
    seq_query = tinydb.Query()
    dbseq = seq_table.get(seq_query.code == m_seq_code)
    if dbseq:
        seq_ret.g_dbid = dbseq.doc_id
    return seq_ret
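# Illustrative sketch (standalone): the TinyDB lookup pattern used by fetch_sequence and
# fetch_shot above. table.get() with a Query condition returns a dict-like Document whose
# doc_id carries the record's primary key, or None when there is no match. The database file
# name and the inserted record below are hypothetical.
import tinydb

db = tinydb.TinyDB('example_db.json')
seq_table = db.table('Sequence')
seq_table.insert({'code': 'ABC', 'path': '/shows/demo/ABC'})
q = tinydb.Query()
doc = seq_table.get(q.code == 'ABC')
print(doc.doc_id, doc['path'])   # e.g. 1 /shows/demo/ABC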
def getMulD2StarWeight(self, seqA, seqB, kstart, kend, r, flag, sequences, weight, kmer_pro):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    # get the D2Star count dictionaries for each sequence over the k-mer range
    Sq = Sequence.Sequence()
    lisFeaA = Sq.getD2StarMulCount(seqA, sequences, kstart, kend, r, flag, kmer_pro)
    lisFeaB = Sq.getD2StarMulCount(seqB, sequences, kstart, kend, r, flag, kmer_pro)
    su = 0.0
    for key in lisFeaA.keys():
        su = su + (lisFeaA[key] * lisFeaB[key]) * weight[key]
    return 1 / (su + np.spacing(1))
def manhattan(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, freq = Sq.getSeqfreq(seqLis, k, dic)
    # Manhattan distance over the k-mer frequency vectors
    su = 0.0
    for key in kmerSet:
        su = abs(lisFea[0][key] - lisFea[1][key]) + su
    return su
def EuD(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, freq = Sq.getSeqfreq(seqLis, k, dic)
    # Euclidean distance over the k-mer frequency vectors
    su = 0.0
    for key in kmerSet:
        su = (lisFea[0][key] - lisFea[1][key]) ** 2 + su
    return math.sqrt(su)
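# Illustrative sketch (standalone, not using the module's Sequence helpers): the same
# Euclidean distance over k-mer frequency vectors, computed with plain Counters. Missing
# k-mers are treated as frequency 0, as a shared k-mer set would do.
import math
from collections import Counter

def kmer_freqs_sketch(seq, k):
    counts = Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
    total = float(sum(counts.values()))
    return {kmer: n / total for kmer, n in counts.items()}

def euclidean_sketch(seqA, seqB, k):
    fA, fB = kmer_freqs_sketch(seqA, k), kmer_freqs_sketch(seqB, k)
    keys = set(fA) | set(fB)
    return math.sqrt(sum((fA.get(x, 0.0) - fB.get(x, 0.0)) ** 2 for x in keys))

# e.g. euclidean_sketch("ACGTACGT", "ACGTTTTT", 2)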
def getD2(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, count = Sq.getSeqCount(seqLis, k, dic)
    # D2: inner product of the raw k-mer counts
    su = 0.0
    for i in range(count.shape[1]):
        su = su + count[0, i] * count[1, i]
    return 1 / (su + np.spacing(1))
def getMulD2Weight(self, seqA, seqB, kstart, kend, sequences, weight):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the count dictionaries over the k-mer range
    lisFea = Sq.getMulCount(seqLis, kstart, kend, sequences)
    # print(lisFea)
    # weighted D2
    su = 0.0
    for key in lisFea[0]:
        su = su + lisFea[0][key] * lisFea[1][key] * weight[key]
    return 1 / (su + np.spacing(1))
def getD2Weight(self, seqA, seqB, k, sequences, weight):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(sequences, k)
    lisFea, count = Sq.getSeqCount(seqLis, k, dic)
    # weighted D2
    su = 0.0
    for key in lisFea[0]:
        su = su + lisFea[0][key] * lisFea[1][key] * weight[key]
    return 1 / (su + np.spacing(1))
def chebyshev(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, freq = Sq.getSeqfreq(seqLis, k, dic)
    # Chebyshev distance: the largest absolute difference between the frequency vectors
    ma = -sys.maxsize - 1
    for i in range(freq.shape[1]):
        ma = max(ma, abs(freq[0, i] - freq[1, i]))
    return ma
def test_Key():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    for step in Sq.Steps:
        St.Update(step.state0)
        St.Update(step.state1)
    c = 0
    for i in range(10):
        for j in range(10):
            t = 1 if i == 9 and j == 9 else 0
            stat = [float(i), float(j), t]
            assert St[stat].features == stat
            c += 1
def getD2_single(self, seqA, seqB, kstart, kend):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the count dictionaries over the k-mer range
    countLis = Sq.getMulCount(seqLis, kstart, kend, seqLis)
    # kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    # lisFea, count = Sq.getSeqCount(seqLis, k, dic)
    # D2: inner product of the k-mer counts (iterate over the dictionary keys, not range())
    su = 0.0
    for key in countLis[0].keys():
        su = su + countLis[0][key] * countLis[1][key]
    return 1 / (su + np.spacing(1))
def KLD(self, seqA, seqB, k):
    # collect the two sequences in a list
    seqLis = []
    seqLis.append(seqA)
    seqLis.append(seqB)
    Sq = Sequence.Sequence()
    # get the k-mer set and the k-mer -> index dictionary
    kmerSet, dic = Sq.getSeqKerSet(seqLis, k)
    lisFea, freq = Sq.getSeqfreq(seqLis, k, dic)
    # symmetrised Kullback-Leibler divergence between the two frequency vectors
    k1 = 0.0
    k2 = 0.0
    for j in range(freq.shape[1]):
        k1 = k1 + freq[0, j] * math.log2(freq[0, j] / freq[1, j])
        k2 = k2 + freq[1, j] * math.log2(freq[1, j] / freq[0, j])
    return (k1 + k2) / 2
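# Illustrative sketch (standalone): the value computed above is (KL(P||Q) + KL(Q||P)) / 2
# over the two k-mer frequency vectors. Any k-mer with zero frequency in either sequence
# makes the log undefined; the pseudocount below is an assumption of this sketch, not
# something the method above does.
import math

def sym_kld_sketch(p, q, pseudocount=1e-9):
    # p and q are dicts mapping k-mer -> frequency
    keys = set(p) | set(q)
    kl_pq = sum((p.get(x, 0) + pseudocount) * math.log2((p.get(x, 0) + pseudocount) / (q.get(x, 0) + pseudocount)) for x in keys)
    kl_qp = sum((q.get(x, 0) + pseudocount) * math.log2((q.get(x, 0) + pseudocount) / (p.get(x, 0) + pseudocount)) for x in keys)
    return (kl_pq + kl_qp) / 2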
def test_Sort():
    St = States.clsStatesTable()
    Sq = Sequence.clsSequence()
    Sq.Import("csv/test/testSeq.csv")
    for step in Sq.Steps:
        St.Update(step.state0)
        St.Update(step.state1)
    St.Sort()
    c = 0
    for i in range(10):
        for j in range(10):
            if not (i == 9 and j == 9):
                assert St[c].features == [float(i), float(j), 0]
            else:
                assert St[c].features == [float(i), float(j), 1]
            c += 1
#!/usr/bin/env python
"""
translate.py <filename>

Translates a DNA sequence to a protein sequence
"""
import sys

import Fasta
import Sequence

if len(sys.argv) != 2 or '-h' in sys.argv or '--help' in sys.argv:
    sys.exit(__doc__)

w = 60
iFilename = sys.argv[1]
faFile = Fasta.load_mfa_iter(iFilename)
for header, seq in faFile:
    protein = Sequence.translate(seq)
    print '>%s' % header
    for i in xrange(0, len(protein), w):
        print protein[i:i + w]
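# Illustrative sketch (assumption: Biopython, which other scripts in this collection already
# use via SeqIO, is available). Sequence.translate is a local helper not shown here; the
# equivalent translation with Biopython's Seq object looks like this.
from Bio.Seq import Seq

protein = str(Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG").translate())
print(protein)   # MAIVMGR*KGAR* under the standard genetic code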
#!/usr/bin/env python
"""
reverse_comp.py <filename>

Prints the reverse complement of a DNA string (in Fasta format).
"""
import sys

import Fasta
import Sequence

if len(sys.argv) != 2 or "-h" in sys.argv or "--help" in sys.argv:
    sys.exit(__doc__)

iFilename = sys.argv[1]
header, seq = Fasta.load(iFilename)
seq = Sequence.reverse_complement(seq.upper())
print ">%s" % header
for i in xrange(0, len(seq), 80):
    print seq[i:i + 80]