def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
    """
    Save the longest sequences of a sequence file into another file.

    The input file is loaded through BioseqDB. At most 'num' sequences are
    kept, each at least 'minThresh' long (the threshold is inclusive).
    Selected sequences are written in the order they appear in the input.
    When several sequences share the shortest selected length, the first
    ones met in the input are kept.

    @param num: maximum number of sequences to keep (converted with int())
    @param inFileName: name of the input sequence file
    @param outFileName: name of the output file
                        (default: inFileName + ".best" + str(num))
    @param verbose: verbosity level (0: silent, 1: summary, >1: per sequence)
    @param minThresh: minimum length for a sequence to be kept
    @return: 0 in all cases
    """
    bsDB = BioseqDB( inFileName )
    nbInSeq = bsDB.getSize()
    if verbose > 0:
        print( "nb of input sequences: %i" % ( nbInSeq ) )

    if outFileName == "":
        outFileName = inFileName + ".best" + str(num)

    # The output file is created (possibly empty) even when nothing is
    # selected; the try/finally guarantees the handle is always closed.
    outFile = open( outFileName, "w" )
    nbSave = 0
    try:
        if nbInSeq == 0:
            return 0

        num = int(num)
        if verbose > 0:
            print( "keep the %i longest sequences" % ( num ) )
            if minThresh > 0:
                print( "with length > %i bp" % ( minThresh ) )
            sys.stdout.flush()

        # retrieve the length of each input sequence
        lAllLengths = []
        for seqIdx, bs in enumerate( bsDB.db ):
            seqLength = bs.getLength()
            lAllLengths.append( seqLength )
            if verbose > 1:
                print( "%d seq %s : %d bp" % ( seqIdx + 1, bs.header[0:40], seqLength ) )
                sys.stdout.flush()

        # sort the lengths in decreasing order and select the longest;
        # max(num, 0) prevents a negative 'num' from slicing from the end
        lAllLengths.sort( reverse=True )
        lSeqLgth = [ lgth for lgth in lAllLengths[ :max( num, 0 ) ]
                     if lgth >= minThresh ]

        # Nothing selected (num <= 0 or no sequence long enough): leave the
        # output empty instead of calling min()/max() on an empty list.
        if lSeqLgth:
            # the list is still in decreasing order
            maxLength = lSeqLgth[0]
            cutoff = lSeqLgth[-1]
            if verbose > 0:
                print( "selected max length: %i" % ( maxLength ) )
                print( "selected min length: %i" % ( cutoff ) )
                sys.stdout.flush()

            # Only this many sequences of exactly 'cutoff' length may be
            # saved; otherwise ties met early in the file would use up the
            # quota and longer sequences further down would be dropped.
            nbAtCutoffLeft = lSeqLgth.count( cutoff )
            nbToSave = len( lSeqLgth )

            # save the longest, in input order
            for seqIdx, bs in enumerate( bsDB.db ):
                seqLength = bs.getLength()
                if seqLength < cutoff:
                    continue
                if seqLength == cutoff:
                    if nbAtCutoffLeft == 0:
                        continue
                    nbAtCutoffLeft -= 1
                bs.write( outFile )
                nbSave += 1
                if verbose > 1:
                    print( "%d seq %s : saved !" % ( seqIdx + 1, bs.header[0:40] ) )
                    sys.stdout.flush()
                if nbSave == nbToSave:
                    break
    finally:
        outFile.close()

    if verbose > 0:
        # two spaces after "in" reproduce the historical output format
        print( "%i saved sequences in  %s" % ( nbSave, outFileName ) )
        sys.stdout.flush()

    return 0