def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
    """
    Search ORFs in every sequence of a file and save the longest ones in a
    tab-delimited file.

    Each sequence is read with Bioseq.read(), upper-cased, scanned with
    Bioseq.findORF(), then scanned again after Bioseq.complement().
    A candidate ORF is the interval between two consecutive positions listed
    by findORF() under the same key (presumably the stop codons of one
    reading frame -- to be confirmed against Bioseq.findORF).

    One line is written per saved ORF:
    "ORF|<frame>|<length>" TAB <sequence header> TAB <start> TAB <end>
    where <frame> is negative and start > end for ORFs found after
    complement().

    @param inFileName: path of the input sequence file
    @param orfMaxNb: maximum number of ORFs saved for EACH sequence, longest
                     first (0 means no limit)
    @param orfMinLength: minimum length (end - start) of a saved ORF
    @param outFileName: path of the output file
                        (default: inFileName + ".orf.map")
    @param verbose: if > 0, print each sequence and each saved ORF
    @return: 0
    """
    if outFileName == "":
        outFileName = inFileName + ".orf.map"

    bioseq = Bioseq()
    bioseqNb = 0
    with open( inFileName ) as inFile, \
            open( outFileName, "w" ) as outFile:
        while True:
            bioseq.read( inFile )
            if bioseq.sequence is None:
                break
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print( "sequence num %s = %s [ %s ...]"
                       % ( bioseqNb, bioseq.getLength(), bioseq.header[0:40] ) )

            # each entry is (length, frame, start, end) so that sorting the
            # list orders the ORFs by length first
            bestOrf = []

            # ORFs of the sequence as read
            orf = bioseq.findORF()
            for frame in orf:
                lPositions = orf[frame]
                for j in range( 1, len(lPositions) ):
                    start = lPositions[j-1] + 4
                    end = lPositions[j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append( ( end-start, frame+1, start, end ) )

            # ORFs of the complemented sequence, with coordinates converted
            # back to the original sequence (hence start > end)
            bioseq.complement()
            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for frame in orf:
                lPositions = orf[frame]
                for j in range( 1, len(lPositions) ):
                    start = seqLen - lPositions[j-1] - 3
                    end = seqLen - lPositions[j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append( ( start-end, -(frame+1), start, end ) )

            bestOrf.sort( reverse=True )

            # The limit is applied per sequence: 'orfMaxNb' itself must not
            # be modified, otherwise the number of ORFs found in one sequence
            # would cap all the following ones.
            if orfMaxNb == 0:
                lSelectedOrfs = bestOrf
            else:
                lSelectedOrfs = bestOrf[ :max( orfMaxNb, 0 ) ]

            for orfData in lSelectedOrfs:
                if verbose > 0:
                    print( orfData )
                orfLength, orfFrame, orfStart, orfEnd = orfData
                orfName = "ORF|" + str(orfFrame) + "|" + str(orfLength)
                outFile.write( "%s\t%s\t%d\t%d\n"
                               % ( orfName, bioseq.header, orfStart, orfEnd ) )

    return 0
def dbLengthFilter( len_min, inFileName, verbose=0 ):
    """
    Split the sequences of a file into two files according to their length.

    Sequences whose length is >= len_min are written to
    '<inFileName>.Sup<len_min>', the others to '<inFileName>.Inf<len_min>'.
    Both output files are always created, even if one of them stays empty.

    @param len_min: length threshold
    @param inFileName: path of the input sequence file (read with Bioseq.read)
    @param verbose: if > 0, print one line per sequence and a final summary
    """
    infFileName = inFileName + ".Inf" + str(len_min)
    supFileName = inFileName + ".Sup" + str(len_min)

    seq = Bioseq()
    numseq = 0
    # the 'with' block closes the three files even if reading or writing fails
    with open( inFileName, "r" ) as file_db, \
            open( infFileName, "w" ) as file_dbInf, \
            open( supFileName, "w" ) as file_dbSup:
        while True:
            seq.read( file_db )
            if seq.sequence is None:
                break
            seqLength = seq.getLength()
            numseq += 1
            if seqLength >= len_min:
                seq.write( file_dbSup )
                category = "Sup"
            else:
                seq.write( file_dbInf )
                category = "Inf"
            if verbose > 0:
                print( "sequence # %s = %s [ %s ...] %s !!"
                       % ( numseq, seqLength, seq.header[0:40], category ) )

    if verbose > 0:
        # every sequence read is saved in one of the two files
        print( "%s saved sequences in  %s  and  %s"
               % ( numseq, infFileName, supFileName ) )
def createSeqTable( self, tableName, fileName = "" ):
    """
    Create a table of sequences and, if a file is given, load it into the table.

    The table has four columns: accession (first word of the header),
    sequence, description (whole header) and length.
    When 'fileName' is given, its sequences are first dumped into a temporary
    tab-delimited file created in the current working directory, which is
    loaded via "LOAD DATA LOCAL INFILE" and then removed.

    @param tableName: name of the table to create
    @param fileName: path of the sequence file to load (read with
                     Bioseq.read); nothing is loaded if empty
    """
    # NOTE(review): 'tableName' is inserted as is in the SQL statements (an
    # identifier cannot be bound as a query parameter); it must come from a
    # trusted source.
    sqlCmd = "CREATE TABLE %s (accession varchar(255), sequence longtext, description varchar(255), length int unsigned )" % (tableName)
    self.execute( sqlCmd )
    self.createSeqIndex( tableName )
    self.updateInfoTable( tableName, fileName )

    if fileName == "":
        return

    tmpFileName = os.path.basename( fileName ) + ".tmp" + str(os.getpid())
    try:
        with open( fileName ) as inFile, \
                open( tmpFileName, "w" ) as tmpFile:
            bioseq = Bioseq()
            while True:
                bioseq.read( inFile )
                if bioseq.sequence is None:
                    break
                # NOTE(review): fields are not escaped (see "ESCAPED BY ''"
                # below), so a header containing a tab would shift the columns
                tmpFile.write( "%s\t%s\t%s\t%d\n"
                               % ( bioseq.header.split()[0], bioseq.sequence,
                                   bioseq.header, bioseq.getLength() ) )

        sqlCmd = "LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE %s FIELDS ESCAPED BY ''" % \
                 (tmpFileName, tableName)
        self.execute( sqlCmd )
    finally:
        # never leave the temporary file behind, even if the input file could
        # not be read or if the load failed
        if os.path.exists( tmpFileName ):
            os.remove( tmpFileName )
def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
    """
    Save the sequences of a file into another file, sorted by increasing length.

    Each sequence is first saved in its own temporary file, named
    '<length>bp_<rank>nb' and created in the current working directory.
    These files are then concatenated into 'outFileName' by increasing
    length (sequences of equal length keep their input order) and removed.
    If the input contains no sequence, no output file is created.

    The process exits with status 1 if the input file doesn't exist or if a
    temporary file can't be concatenated.

    @param inFileName: path of the input sequence file (read with Bioseq.read)
    @param outFileName: path of the output file (overwritten if it exists)
    @param verbose: if > 0, print the progress; if > 1, also print each
                    sequence
    @return: 0
    """
    import shutil

    if verbose > 0:
        print( "sort sequences by increasing length" )
        sys.stdout.flush()
    if not os.path.exists( inFileName ):
        print( "ERROR: file '%s' doesn't exist" % ( inFileName ) )
        sys.exit(1)

    # read each seq one by one
    # save them in distinct temporary files
    # with their length in the name
    lTmpFiles = []   # one tuple (length, rank in the input, file name) per seq
    bs = Bioseq()
    countSeq = 0
    with open( inFileName, "r" ) as inFileHandler:
        while True:
            bs.read( inFileHandler )
            if bs.header is None:
                break
            countSeq += 1
            seqLength = bs.getLength()
            tmpFile = "%ibp_%inb" % ( seqLength, countSeq )
            # the sequence is appended to the file: remove any leftover of a
            # previous run, otherwise its content would be saved as well
            if os.path.exists( tmpFile ):
                os.remove( tmpFile )
            bs.appendBioseqInFile( tmpFile )
            lTmpFiles.append( ( seqLength, countSeq, tmpFile ) )
            if verbose > 1:
                print( "%s (%i bp) saved in '%s'" % ( bs.header, seqLength, tmpFile ) )
            # reset before the next read, as the original code did
            # (presumably Bioseq.read needs it -- to be confirmed)
            bs.header = ""
            bs.sequence = ""

    # sort temporary file names
    # concatenate them into the output file
    if os.path.exists( outFileName ):
        os.remove( outFileName )
    # Only the files created above are used (not a glob on the working
    # directory, which could match unrelated files); the rank breaks ties.
    lTmpFiles.sort()
    for seqLength, rank, fileName in lTmpFiles:
        try:
            # plain file copy instead of a shell 'cat', so that paths with
            # spaces or shell metacharacters are handled properly
            with open( fileName, "rb" ) as tmpHandler, \
                    open( outFileName, "ab" ) as outFileHandler:
                shutil.copyfileobj( tmpHandler, outFileHandler )
        except ( IOError, OSError ):
            print( "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) )
            sys.exit(1)
        os.remove( fileName )

    return 0
def getLengthPerSeqFromFile( inFile ):
    """
    Return the length of each sequence of a file.

    @param inFile: path of the input sequence file (read with Bioseq.read)
    @return: dictionary whose keys are the sequence headers and values the
             sequence lengths; if several sequences share the same header,
             only the length of the last one is kept
    """
    dHeader2Length = {}
    # the 'with' block closes the file even if reading fails
    with open( inFile, "r" ) as inFileHandler:
        while True:
            iBs = Bioseq()
            iBs.read( inFileHandler )
            if iBs.sequence is None:
                break
            dHeader2Length[ iBs.header ] = iBs.getLength()
    return dHeader2Length
def filterClassifiedConsensus( self ):
    """
    Copy the consensus of the input file to the output file, except those
    discarded by the enabled filters.

    Sequences are read from self._inFaFile and written to self._outFaFile.
    The filters look for keywords in the sequence header and are tried in
    this order, the first matching one deciding:
      - self._filterSSRs: "SSR" in the header and, if
        self._maxLengthToFilterSSRs is not 0, length <= this value;
      - self._filterHostGenes: "HostGene" in the header;
      - self._filterConfused: "confused" in the header but not
        "confusedness=no";
      - self._filterNoCat (string, "0" to disable): "NoCat" in the header;
        such a consensus is discarded unless one of these rescues it:
          * "2" in self._filterNoCat: the number following the tag
            "<MSA program>_" in the header is > self._nbAlignSeqNoCat
            (criterion not met if there is no such tag or number);
          * "3" in self._filterNoCat: the header contains a key of the
            dictionary returned by
            self.getClassifPerHeaderOfUnclassifiedConsensus() whose field
            of index 6 doesn't contain "no structural features"
            (criterion not met if self._classifFile is empty);
      - self._filterIncomplete: "completeness=incomp" in the header.

    If self._verbose > 0, the numbers of input and discarded sequences are
    printed; if > 1, each header and each filtering decision as well.
    """
    lAlgoMSA = ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]

    # always defined, so that criterion "3" can't fail with a NameError when
    # no classification file was given
    dHeader2Classif = {}
    if self._classifFile != "":
        dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus()

    def isNoCatToKeep( header ):
        # Return True if a "NoCat" consensus meets one of the enabled
        # criteria ("2" or "3") allowing to keep it.
        keep = False
        if "2" in self._filterNoCat:
            algoMSA = ""
            for algo in lAlgoMSA:
                if algo in header:
                    algoMSA = algo   # as before, the last match is used
            if algoMSA != "":
                try:
                    nbAlignSeq = int( header.split(algoMSA+"_")[1].split("|")[0] )
                except ( IndexError, ValueError ):
                    # no number after the tag: the criterion can't be met
                    nbAlignSeq = None
                if nbAlignSeq is not None and nbAlignSeq > self._nbAlignSeqNoCat:
                    keep = True
        if "3" in self._filterNoCat:
            for classifHeader in dHeader2Classif:
                if classifHeader in header \
                        and "no structural features" not in dHeader2Classif[classifHeader][6]:
                    keep = True
        return keep

    bs = Bioseq()
    nbInSeq = 0
    nbRmv = 0
    # the 'with' block closes both files even if reading or writing fails
    with open( self._inFaFile, "r" ) as inFile, \
            open( self._outFaFile, "w" ) as outFile:
        while True:
            bs.read( inFile )
            if bs.header is None:
                break
            nbInSeq += 1
            if self._verbose > 1:
                print( bs.header )

            # name of the filter discarding the sequence, None to keep it
            filterName = None
            if self._filterSSRs and "SSR" in bs.header \
                    and ( self._maxLengthToFilterSSRs == 0
                          or bs.getLength() <= self._maxLengthToFilterSSRs ):
                filterName = "SSR"
            elif self._filterHostGenes and "HostGene" in bs.header:
                filterName = "HostGene"
            elif self._filterConfused and "confused" in bs.header \
                    and "confusedness=no" not in bs.header:
                filterName = "confused"
            elif self._filterNoCat != "0" and "NoCat" in bs.header:
                if not isNoCatToKeep( bs.header ):
                    filterName = "NoCat"
            elif self._filterIncomplete and "completeness=incomp" in bs.header:
                filterName = "incomplete"

            if filterName is None:
                bs.write( outFile )
            else:
                nbRmv += 1
                if self._verbose > 1:
                    print( "filtered %s !" % ( filterName ) )

    if self._verbose > 0:
        print( "nb of input seq: %i" % ( nbInSeq ) )
        print( "nb of filtered seq: %i" % ( nbRmv ) )
        sys.stdout.flush()