def dbSumRep(inFileName):
    """ deprecated

    For each sequence of the input fasta file, run 'reputer.dist' and
    print, per repeat type (F, R, C, P), the score
    log( seqLength / meanRepeatLength ), then print summary statistics
    over the whole database.

    @param inFileName: name of the input fasta file
    @type inFileName: string
    """
    import reputer
    # one Stat accumulator per reputer repeat type:
    # F (forward), R (reverse), C (complement), P (palindromic)
    stat={"F":Stat(),"R":Stat(),"C":Stat(),"P":Stat()}
    file_db=open(inFileName)
    seq=Bioseq()
    numseq=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        # histogram of repeats; entries are indexed i[0]=type, i[1], i[2]
        # NOTE(review): presumably i[1]=repeat length and i[2]=count — confirm
        # against reputer.dist
        hist=reputer.dist(seq,"-allmax -f -r -c -p -l 3")
        sumL={"F":0,"R":0,"C":0,"P":0}    # summed length*count per type
        sumN={"F":0,"R":0,"C":0,"P":0}    # summed count per type
        for i in hist:
            sumL[i[0]]=sumL[i[0]]+(i[1]*i[2])
            sumN[i[0]]=sumN[i[0]]+i[2]
        for i in sumL.keys():
            # -(log(meanLen) - log(seqLen)) == log(seqLen/meanLen)
            # NOTE(review): raises ZeroDivisionError if a type has no repeat
            # (sumN[i]==0) — confirm reputer always reports all four types
            sumL[i]=-(math.log(float(sumL[i])/sumN[i])-math.log(seq.getLength()))
            stat[i].add(sumL[i])
        numseq=numseq+1
        print 'sequence #',numseq,'=',seq.getLength(),'[',\
              seq.header[0:40],'...]'
        print "\tF=%5.3f R=%5.3f C=%5.3f P=%5.3f" % \
              (sumL["F"],sumL["R"],sumL["C"],sumL["P"])
    file_db.close()
    # database-wide summary per repeat type
    for i in stat.keys():
        print i,"=>",stat[i].string()
def dbORF( inFileName, nb=0, size=0, outFileName="" ): inFile = open( inFileName ) if outFileName == "": outFileName = inFileName + ".orf.map" outFile = open( outFileName, "w" ) seq = Bioseq() seq_out = Bioseq() numseq = 0 while 1: seq.read( inFile ) if seq.sequence == None: break seq.upCase() numseq = numseq + 1 print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...]' orf = seq.findORF() best_orf = [] for i in orf.keys(): l = len(orf[i]) for j in xrange(1,l): start = orf[i][j-1] + 4 end = orf[i][j] + 3 if end - start >= size: best_orf.append( ( end-start, i+1, start, end ) ) seq.sequence = seq.complement() orf = seq.findORF() seqlen = seq.getLength() for i in orf.keys(): l = len(orf[i]) for j in xrange(1,l): start = seqlen - orf[i][j-1] - 3 end = seqlen - orf[i][j] - 2 if start - end >= size: best_orf.append( ( start-end, (i+1)*-1, start, end ) ) best_orf.sort() best_orf.reverse() l = len(best_orf) if nb > l or nb == 0 : nb = l for i in xrange(0,nb): print best_orf[i] outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(best_orf[i][1])+\ "|"+str(best_orf[i][0]),seq.header, best_orf[i][2],best_orf[i][3])) inFile.close() outFile.close() return 0
def dbRelEntropy(inFileName,wordsize):
    """ deprecated

    Compute, for each sequence, its relative entropy against the word
    (k-mer) frequencies observed over the whole database, print them
    sorted, and return them.

    @param inFileName: name of the input fasta file
    @type inFileName: string
    @param wordsize: word (k-mer) length
    @type wordsize: integer
    @return: list of (entropy, sequence number, header) tuples sorted by
             increasing entropy
    """
    # first pass: accumulate word counts over the whole database
    file_db=open(inFileName)
    seq=Bioseq()
    refocc={}
    sumlen=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        # number of words of length 'wordsize' in this sequence
        sumlen=sumlen+seq.getLength()-wordsize
        # NOTE(review): occ_word is unpacked as a (dict, count) pair in
        # dbWord but used as a single value here — confirm its return type
        occ=seq.occ_word(wordsize)
        if(len(refocc)==0):
            refocc=occ
        else:
            for w in occ.keys():
                if refocc.has_key(w):
                    refocc[w]=refocc[w]+occ[w]
                else:
                    refocc[w]=occ[w]
    file_db.close()
    # reference frequencies with add-one smoothing
    reffreq={}
    for w in refocc.keys():
        reffreq[w]=float(refocc[w]+1)/sumlen
    # second pass: relative entropy of each sequence vs the reference
    vec_len=[]
    stat=Stat()
    file_db=open(inFileName)
    numseq=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        i=seq.rel_entropy(reffreq)
        stat.add(i)
        numseq=numseq+1
        print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...] entropy',i
        vec_len.append((i,numseq,seq.header))
    file_db.close()
    vec_len.sort()
    for s in vec_len:
        print 'H=',s[0],'=> #',s[1], s[2]
    print stat.string()
    return vec_len
def dbLengthFilter(len_min,inFileName, verbose=0): file_db=open(inFileName) file_dbInf=open(inFileName+".Inf"+str(len_min),'w') file_dbSup=open(inFileName+".Sup"+str(len_min),'w') seq=Bioseq() numseq=0 nbsave=0 while 1: seq.read(file_db) if seq.sequence==None: break l=seq.getLength() numseq=numseq+1 if l>=len_min: seq.write(file_dbSup) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' nbsave=nbsave+1 else: seq.write(file_dbInf) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' nbsave=nbsave+1 file_db.close() file_dbInf.close() file_dbSup.close() if verbose > 0: print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
def dbEntropy(inFileName,wordsize): """ deprecated """ vec_len=[] stat=Stat() file_db=open(inFileName) seq=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break i=seq.entropy(wordsize) stat.add(i) numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...] entropy',i vec_len.append((-i,numseq,seq.header)) file_db.close() vec_len.sort() for s in vec_len: print 'I=',-s[0],'=> #',s[1], s[2] print stat.string() return vec_len
def dbWord(inFileName,wordsize): """ deprecated """ vec_len=[] file_db=open(inFileName) seq=Bioseq() numseq=0 nb_word=0 statCount={} while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...]' occ,nb=seq.occ_word(wordsize) nb_word=nb_word+nb for i in occ.keys(): if i not in statCount.keys(): statCount[i]=occ[i] else: statCount[i]=statCount[i]+occ[i] file_db.close() vec_sort=[] for i in statCount.keys(): vec_sort.append((-float(statCount[i])/nb_word,i)) vec_sort.sort() for i in vec_sort: print i[1],"=",-i[0]
def sortSequencesByIncreasingLength( inFile, outFile, verbose=0 ):
    """
    Save sequences in 'inFile' into 'outFile' sorted by their length in
    increasing order.

    @param inFile: name of the input fasta file
    @type inFile: string
    @param outFile: name of the output fasta file
    @type outFile: string
    @param verbose: verbosity level
    @type verbose: integer
    @return: 0 on success (exits the process on error)
    """
    if verbose > 0:
        print "sort sequences by increasing length"
        sys.stdout.flush()
    if not os.path.exists( inFile ):
        print "ERROR: file '%s' doesn't exist" % ( inFile )
        sys.exit(1)
    # read each seq one by one
    # save them in distinct temporary files
    # with their length in the name
    # NOTE(review): temporary files are created in the current working
    # directory; two sequences can't collide (the counter is in the name),
    # but pre-existing '*bp_*nb' files would be swept up by the glob below
    inFileHandler = open( inFile, "r" )
    bs = Bioseq()
    countSeq = 0
    while True:
        bs.read( inFileHandler )
        if bs.header == None:
            break
        countSeq += 1
        tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
        bs.save( tmpFile )
        if verbose > 1:
            print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
        # reset the record so a partially-read entry can't leak into the
        # next iteration
        bs.header = ""
        bs.sequence = ""
    inFileHandler.close()
    # sort temporary file names
    # concatenate them into the output file
    if os.path.exists( outFile ):
        os.remove( outFile )
    lFiles = glob.glob( "*bp_*nb" )
    # sort on the numeric length prefix of each file name
    lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
    for fileName in lFiles:
        cmd = "cat %s >> %s" % ( fileName, outFile )
        returnValue = os.system( cmd )
        if returnValue != 0:
            print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFile )
            sys.exit(1)
        os.remove( fileName )
    return 0
def dbTraduit(inFileName,phase=0,complement='T',pep_filename=""): """ deprecated """ file_db=open(inFileName) if pep_filename=="": pep_filename=inFileName+'.pep' file_pep=open(pep_filename,'w') seq=Bioseq() seq_out=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),\ '[',seq.header[0:40],'...]' if phase>=0 : if phase==1 or phase==0 : seq_out.sequence=seq.traduit(1) seq_out.header=seq.header+" (phase 1)" seq_out.write(file_pep) if phase==2 or phase==0 : seq_out.sequence=seq.traduit(2) seq_out.header=seq.header+" (phase 2)" seq_out.write(file_pep) if phase==3 or phase==0 : seq_out.sequence=seq.traduit(3) seq_out.header=seq.header+" (phase 3)" seq_out.write(file_pep) if complement=='T' or phase<0 : seq.sequence=seq.complement() if phase==-1 or phase==0 : seq_out.sequence=seq.traduit(1) seq_out.header=seq.header+" (phase -1)" seq_out.write(file_pep) if phase==-2 or phase==0 : seq_out.sequence=seq.traduit(2) seq_out.header=seq.header+" (phase -2)" seq_out.write(file_pep) if phase==-3 or phase==0 : seq_out.sequence=seq.traduit(3) seq_out.header=seq.header+" (phase -3)" seq_out.write(file_pep) file_db.close() file_pep.close()
def dbITRsearch(inFileName,len_min,mismatch,skip_len=20000):
    """ deprecated

    Look, via 'reputer', for sequences carrying inverted terminal repeats
    (a palindromic repeat starting near the 5' end and finishing near the
    3' end) and save them in '<inFileName>.stree_itr'.

    @param inFileName: name of the input fasta file
    @type inFileName: string
    @param len_min: minimum repeat length passed to reputer (-l)
    @type len_min: integer
    @param mismatch: number of mismatches allowed by reputer (-e)
    @type mismatch: integer
    @param skip_len: sequences longer than this are skipped
    @type skip_len: integer
    """
    import reputer
    n=0    # number of sequences with an ITR found
    s=0    # number of sequences skipped because too long
    file_db=open(inFileName)
    file_out=open(inFileName+".stree_itr",'w')
    seq=Bioseq()
    numseq=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        numseq=numseq+1
        print 'sequence #',numseq,'=',\
              seq.getLength(),'[',\
              seq.header[0:40],'...]'
        if seq.getLength()<skip_len:
            # -p: palindromic repeats only
            rep=reputer.find(seq,"-p -l "+str(len_min)+\
                             " -e "+str(mismatch))
            for i in rep.rep_list:
                # repeat must start within 5 bp of the sequence start and
                # end within 5 bp of the sequence end
                if i.pos1 < 5 \
                   and i.pos2+i.length2>seq.getLength()-5:
                    i.view()
                    n=n+1
                    seq.write(file_out)
                    # one hit is enough for this sequence
                    break
        else:
            s=s+1
            print ' too long, skipped'
    print n,"found ", s, "skipped"
    file_db.close()
    file_out.close()
def db2map( inFileName, map_filename="" ): file_db = open( inFileName , "r" ) if map_filename == "": map_filename = inFileName + ".map" file_map = open( map_filename, "w" ) seq = Bioseq() numseq = 0 while 1: seq.read( file_db ) if seq.sequence == None: break numseq = numseq + 1 line='sequence'+str(numseq)+'\t'+seq.header+'\t1'+'\t'+str(seq.getLength()) print line file_map.write( line + "\n" ) file_db.close() file_map.close() print "saved in ",map_filename
def dbComplement(inFileName,comp_filename=""): """ deprecated """ file_db=open(inFileName) if comp_filename=="": comp_filename=inFileName+'.comp' file_comp=open(comp_filename,'w') seq=Bioseq() numseq=0 while 1: seq.read(file_db) if seq.sequence==None: break numseq=numseq+1 print 'sequence #',numseq,'=',seq.getLength(),'[',seq.header[0:40],'...]' seq.sequence=seq.complement() seq.header=seq.header+" (complement!)" seq.write(file_comp) file_db.close() file_comp.close()
def dbLength( inFileName ): """ return the length of each sequence in the input fasta file @param inFileName: name of the input fasta file @type inFileName: string """ if not os.path.exists( inFileName ): print "*** Error: %s doesn't exist" % ( inFileName ) sys.exit(1) vec_len = [] stat = Quantile() inFile = open( inFileName, "r" ) seq = Bioseq() numseq = 0 while 1: seq.read( inFile ) if seq.sequence == None: break l = seq.getLength() stat.add( l ) numseq = numseq + 1 print "sequence #",numseq,"=",l,"[",seq.header[0:40],"...]" vec_len.append( ( l, numseq, seq.header[0:40] ) ) inFile.close() vec_len.sort() for s in vec_len: print "len=",s[0],"=> #",s[1], s[2] print stat.string() print "total length=",stat.sum return vec_len
def dbConsensus(filename,consensus_filename,max_set_size=20,max_len=20000,min_len=50,min_base_nb=1):
    """ deprecated

    Orient the sequences of 'filename' (external 'orienter' tool), split
    them into sets of at most 'max_set_size' sequences, align each set with
    'malign', build a consensus per set ('consensusFastaAli.py') and
    concatenate the consensus into 'consensus_filename'.

    @param filename: name of the input fasta file
    @type filename: string
    @param consensus_filename: name of the output consensus fasta file
    @type consensus_filename: string
    @param max_set_size: maximum number of sequences aligned together
    @type max_set_size: integer
    @param max_len: sequences longer than this are not aligned
    @type max_len: integer
    @param min_len: sequences shorter than this are discarded
    @type min_len: integer
    @param min_base_nb: '-n' option passed to consensusFastaAli.py
    @type min_base_nb: integer
    @return: number of aligned sets
    """
    os.system("orienter "+filename)
    tmp_consensus_filename=filename+".oriented.consensus.tmp"
    size_db=dbSize(filename+".oriented")
    file_in=open(filename+".oriented")
    file_out=open(consensus_filename,'w')
    seq=Bioseq()
    # a single-sequence database cannot produce a consensus: copy it through
    # and stop the whole process
    if size_db==1:
        seq.read(file_in)
        seq.header="not a consensus"
        seq.write(file_out)
        file_out.close()
        file_in.close()
        os.system("cp "+filename+".oriented"+ " "+filename+ ".malign.fa")
        os.system("cp "+filename+".oriented"+ " "+filename+ ".malign.fa.cons")
        sys.exit(1)
    seq_in_set=0
    nb_consensus=0
    count_set=0
    # halve the set size until it fits under max_set_size
    set_size=size_db
    while set_size>max_set_size:
        set_size=set_size/2
    tmp_file_out=open(tmp_consensus_filename,'w')
    last_seq=0
    while 1:
        #read subset of sequence
        seq.read(file_in)
        if seq.sequence!=None:
            if seq.getLength() < max_len and seq.getLength() > min_len:
                seq.write(tmp_file_out)
                seq_in_set=seq_in_set+1
            else:
                if seq.getLength() > max_len:
                    print seq.header+" too long!!"
                    # NOTE(review): 'not header.find(marker)' is true only
                    # when the marker is at position 0 (find returns -1 when
                    # absent, which is truthy) — the intent was probably
                    # 'find(...) == -1'; the marker is thus almost never added
                    if not seq.header.find(" too long, not aligned"):
                        seq.header=seq.header+" too long, not aligned"
                    seq.write(file_out)
                if seq.getLength() < min_len:
                    print seq.header+" too short!!"
        else:
            # end of input reached
            last_seq=1
            if seq_in_set==0:
                return count_set
        # aligne subset
        if seq_in_set==set_size or last_seq:
            count_set=count_set+1
            print "aligning the set #",count_set," of ",seq_in_set," sequences"
            tmp_file_out.close()
            if seq_in_set>1:
                os.system("nice malign "+tmp_consensus_filename
                          +" 20 -8 16 4 > "
                          +tmp_consensus_filename+".malign"
                          +str(count_set)+".fa")
                os.system("nice consensusFastaAli.py -n "
                          +str(min_base_nb)+" "
                          +tmp_consensus_filename
                          +".malign"+str(count_set)+".fa ")
                os.system("cp "+tmp_consensus_filename+
                          ".malign"+str(count_set)+".fa "
                          +filename+ ".malign"+str(count_set)+".fa")
            else:
                # a single-sequence set is its own alignment and consensus
                os.system("cp "+tmp_consensus_filename+
                          " "+filename+ ".malign"+str(count_set)+".fa")
                os.system("cp "+tmp_consensus_filename+
                          " "+tmp_consensus_filename+
                          ".malign"+str(count_set)+".fa.cons")
            os.system("cat "+tmp_consensus_filename+
                      ".malign"+str(count_set)+\
                      ".fa.cons >> "+consensus_filename)
            # start a new set
            seq_in_set=0
            tmp_file_out=open(tmp_consensus_filename,'w')
            # stop when the whole database fit in one set or input is done
            if set_size==size_db or last_seq:
                break
    tmp_file_out.close()
    file_out.close()
    file_in.close()
    os.system("rm "+tmp_consensus_filename+"* "+filename+".oriented" )
    return count_set
def dbRepShow(inFileName,reputer_param="-allmax -f -r -c -p -l 10"):
    """ deprecated

    For each sequence of the input fasta file, run 'reputer' and draw a
    dot-plot of its repeats (one color per repeat type) into
    '<inFileName>.ps' via Gnuplot, then display it with 'gv'.

    @param inFileName: name of the input fasta file
    @type inFileName: string
    @param reputer_param: command-line options passed to reputer
    @type reputer_param: string
    """
    import reputer
    import Gnuplot
    # temporary file prefix; one gnuplot data file per repeat type is
    # derived from it (suffixes E, F, R, C, P)
    tmpname=os.tmpnam();
    g = Gnuplot.Gnuplot(debug=1)
    g('set data style lines')
    g('set terminal postscript landscape color')
    g('set output "'+inFileName+'.ps"')
    # NOTE(review): allsum, max and min are never used (max/min also shadow
    # the builtins)
    allsum=0
    n=0
    max=0
    min=0
    file_db=open(inFileName)
    seq=Bioseq()
    numseq=0
    while 1:
        seq.read(file_db)
        if seq.sequence==None:
            break
        n=n+1
        replist=reputer.find(seq,reputer_param)
        fileEmpty=open(tmpname+"E","w")
        fileF=open(tmpname+"F","w")
        fileR=open(tmpname+"R","w")
        fileC=open(tmpname+"C","w")
        fileP=open(tmpname+"P","w")
        # flags: whether at least one repeat of each type was found
        F=0
        R=0
        C=0
        P=0
        # single point at (len, len) so the plot always has data and the
        # axes span the whole sequence
        fileEmpty.write("%d\t%d"%(seq.getLength(),seq.getLength()))
        # each repeat becomes a two-point segment from (pos1, pos2) to
        # (pos1+length1, pos2+length2), written to the file of its type
        for i in replist.rep_list:
            if i.type=="F":
                F=1
                fileF.write(str(i.pos1)\
                            +"\t"+str(i.pos2)\
                            +"\n"+str(i.pos1+i.length1)\
                            +"\t"+\
                            str(i.pos2+i.length2)+"\n\n")
            elif i.type=="R":
                R=1
                fileR.write(str(i.pos1)\
                            +"\t"+str(i.pos2)\
                            +"\n"+str(i.pos1+i.length1)\
                            +"\t"+\
                            str(i.pos2+i.length2)+"\n\n")
            elif i.type=="C":
                C=1
                fileC.write(str(i.pos1)\
                            +"\t"+str(i.pos2)\
                            +"\n"+str(i.pos1+i.length1)\
                            +"\t"+\
                            str(i.pos2+i.length2)+"\n\n")
            elif i.type=="P":
                P=1
                fileP.write(str(i.pos1)\
                            +"\t"+str(i.pos2)\
                            +"\n"+str(i.pos1+i.length1)\
                            +"\t"+\
                            str(i.pos2+i.length2)+"\n\n")
        fileF.close()
        fileR.close()
        fileC.close()
        fileP.close()
        fileEmpty.close()
        g.title(seq.header)
        # plot only the data files of repeat types actually present
        cmd='l='+str(seq.getLength())+\
            '\nset xrange [0:l]\nset yrange [0:l]\nplot "'+\
            tmpname+'E" notitle with dots'
        if F:
            cmd=cmd+', "'+tmpname+'F" title "F" with lines 1 1'
        if R:
            cmd=cmd+', "'+tmpname+'R" title "R" with lines 2 1'
        if C:
            cmd=cmd+', "'+tmpname+'C" title "C" with lines 3 1'
        if P:
            cmd=cmd+', "'+tmpname+'P" title "P" with lines 4 1'
        g(cmd)
        numseq=numseq+1
    file_db.close()
    os.system("gv "+inFileName+".ps")
    os.system("rm "+tmpname+"*")
def filterClassifConsensus( inFileName, outFileName, filterSSRs, maxLengthToFilterSSRs, filterHostGenes, filterConfused, filterNoCat, nbAlignSeqNoCat, verbose=0 ): """ Filter each consensus according to the classification in its header. @param inFileName: name of the input fasta file @type inFileName: string @param outFileName: name of the output fasta file @type outFileName: string @param filterSSRs: filter the consensus classified as SSR @type filterSSRs: boolean @param maxLengthToFilterSSRs: length below which a SSR is filtered @type maxLengthToFilterSSRs: integer @param filterSSRs: filter the consensus classified as HostGene @type filterSSRs: boolean @param filterConfused: filter the consensus classified as confused @type filterConfused: boolean @param filterNoCat: filter the consensus classified as NoCat @type filterNoCat: boolean @param nbAlignSeqNoCat: minimum number of sequences in the MSA from which the NoCat consensus as been built @type nbAlignSeqNoCat: string @param verbose: verbosity level @type verbose: integer """ if outFileName == "": outFileName = "%s.filtered" % ( inFileName ) nbAlignSeqNoCat = int( nbAlignSeqNoCat ) if verbose > 0: print "input file: %s" % ( inFileName ) print "output file: %s" % ( outFileName ) if filterSSRs: if maxLengthToFilterSSRs == 0: print "filter SSRs" else: print "filter SSRs (<%ibp)" % ( maxLengthToFilterSSRs ) if filterHostGenes: print "filter host's genes" if filterNoCat: print "filter NoCat" if filterConfused: print "filter confused" sys.stdout.flush() inFile = open( inFileName, "r" ) outFile = open( outFileName, "w" ) bs = Bioseq() nbInSeq = 0 nbRmv = 0 while True: bs.read( inFile ) if bs.header == None: break nbInSeq += 1 if verbose > 1: print bs.header if filterSSRs == True and "SSR" in bs.header and ( maxLengthToFilterSSRs == 0 or bs.getLength() <= maxLengthToFilterSSRs ): nbRmv += 1 if verbose > 1: print "filtered !" 
elif filterHostGenes == True and "HostGene" in bs.header: nbRmv += 1 if verbose > 1: print "filtered !" elif filterConfused == True and "confusedness=yes" in bs.header: nbRmv += 1 if verbose > 1: print "filtered !" elif filterNoCat == True and "NoCat" in bs.header: algoMSA = "" for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]: if i in bs.header: algoMSA = i regexp = ".*" + algoMSA + "_(\d*)\|.*" header = re.match(regexp, bs.header) nb = header.group(1) nbAlignSeq = int( nb ) if nbAlignSeq <= nbAlignSeqNoCat: nbRmv += 1 if verbose > 1: print "filtered !" else: bs.write( outFile ) else: bs.write( outFile ) inFile.close() outFile.close() if verbose > 0: print "nb of input seq: %i" % ( nbInSeq ) print "nb of filtered seq: %i" % ( nbRmv ) sys.stdout.flush()