def spliceFromCoords( genomeFile, coordFile, obsFile ):
    """
    Copy a fasta file while cutting out the regions listed in a map file.

    Every sequence of 'genomeFile' is written to 'obsFile'. When the header of a
    sequence is a key of the dictionary built from 'coordFile', the bases between
    the min and the max of each of its maps (1-based, both ends included) are
    removed and the remaining pieces are joined. Sequences without any map are
    written unchanged.

    @param genomeFile: name of the input fasta file
    @param coordFile: name of the map file giving the regions to remove
    @param obsFile: name of the output fasta file
    """
    dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )

    with open( genomeFile, "r" ) as genomeFileHandler, open( obsFile, "w" ) as obsFileHandler:
        while True:
            bs = Bioseq()
            bs.read( genomeFileHandler )
            # a sequence left at None is the end-of-file signal used by this loop
            if bs.sequence is None:
                break
            if bs.header in dChr2Maps:
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
                lKeptPieces = []
                currentSite = 0
                for iMap in lCoords:
                    # 1-based inclusive min -> 0-based exclusive end of the piece to keep
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        lKeptPieces.append( bs.sequence[ currentSite : minSplice ] )
                    # never move backwards: a map nested in a previous one must not
                    # re-expose bases that were already spliced out
                    currentSite = max( currentSite, iMap.getMax() )
                lKeptPieces.append( bs.sequence[ currentSite : ] )
                bs.sequence = "".join( lKeptPieces )
            bs.write( obsFileHandler )
def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ): if pattern == "": return patternToSearch = re.compile(pattern) if outFileName == "": outFileName = inFileName + '.cleaned' outFile = open(outFileName,'w') bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 inFile = open(inFileName) while True: bioseq.read(inFile) if bioseq.sequence == None: break bioseqNb += 1 if not patternToSearch.search(bioseq.header): bioseq.write(outFile) if verbose > 1: print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!' savedBioseqNb += 1 inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ): if pattern == "": return if outFileName == "": outFileName = inFileName + '.extracted' outFile = open( outFileName, 'w' ) patternTosearch = re.compile( pattern ) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 inFile = open( inFileName, "r" ) while True: bioseq.read( inFile ) if bioseq.sequence == None: break bioseqNb = bioseqNb + 1 m = patternTosearch.search( bioseq.header ) if m: bioseq.write( outFile ) if verbose > 1: print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!' savedBioseqNb = savedBioseqNb + 1 inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
def dbLengthFilter( len_min, inFileName, verbose=0 ): file_db = open( inFileName, "r" ) file_dbInf = open( inFileName+".Inf"+str(len_min), "w" ) file_dbSup = open( inFileName+".Sup"+str(len_min), "w" ) seq = Bioseq() numseq = 0 nbsave = 0 while True: seq.read( file_db ) if seq.sequence == None: break l = seq.getLength() numseq = numseq + 1 if l >= len_min: seq.write( file_dbSup ) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' nbsave=nbsave+1 else: seq.write( file_dbInf ) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' nbsave=nbsave+1 file_db.close() file_dbInf.close() file_dbSup.close() if verbose > 0: print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
def read( self, faFileHandler ):
    """
    Add to this object every sequence that can be read from an opened fasta file.

    A fresh Bioseq is created for each record; reading stops at the first
    Bioseq whose sequence is still None after the read.

    @param faFileHandler: handler of a fasta file opened for reading
    """
    iBioseq = Bioseq()
    iBioseq.read( faFileHandler )
    while iBioseq.sequence != None:
        self.add( iBioseq )
        iBioseq = Bioseq()
        iBioseq.read( faFileHandler )
def createSeqTable( self, tableName, fileName = "" ):
    """
    Create a table of sequences and, optionally, fill it from a fasta file.

    The table has four columns: accession (first word of the header), sequence,
    description (whole header) and length. When 'fileName' is given, its
    sequences are dumped into a tab-delimited temporary file in the current
    directory, loaded with 'LOAD DATA LOCAL INFILE', then the temporary file
    is removed.

    @param tableName: name of the table to create
    @param fileName: name of the fasta file to load (default = "", table left empty)
    """
    sqlCmd = "CREATE TABLE %s (accession varchar(255), sequence longtext, description varchar(255), length int unsigned )" % (tableName)
    self.execute( sqlCmd )
    self.createSeqIndex( tableName )
    self.updateInfoTable( tableName, fileName )

    if fileName == "":
        return

    # the process id keeps concurrent loads of the same file from colliding
    tmpFileName = os.path.basename( fileName ) + ".tmp" + str(os.getpid())
    try:
        with open( fileName ) as inFile, open( tmpFileName, "w" ) as tmpFile:
            bioseq = Bioseq()
            while True:
                bioseq.read( inFile )
                # a sequence left at None is the end-of-file signal used by this loop
                if bioseq.sequence is None:
                    break
                seqLen = bioseq.getLength()
                # NOTE(review): a tab inside the header would shift the columns - confirm headers never contain one
                tmpFile.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0],
                                                    bioseq.sequence, bioseq.header, seqLen))
        sqlCmd = "LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE %s FIELDS ESCAPED BY ''" % \
                 (tmpFileName, tableName)
        self.execute( sqlCmd )
    finally:
        # never leave the temporary file behind, even when reading or loading fails
        if os.path.exists( tmpFileName ):
            os.remove( tmpFileName )
def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ): if outFileName == "": outFileName = inFileName + ".orf.map" outFile = open( outFileName, "w" ) bioseq = Bioseq() bioseqNb = 0 inFile = open( inFileName ) while True: bioseq.read( inFile ) if bioseq.sequence == None: break bioseq.upCase() bioseqNb += 1 if verbose > 0: print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]' orf = bioseq.findORF() bestOrf = [] for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = orf[i][j-1] + 4 end = orf[i][j] + 3 if end - start >= orfMinLength: bestOrf.append( ( end-start, i+1, start, end ) ) bioseq.complement() orf = bioseq.findORF() seqLen = bioseq.getLength() for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = seqLen - orf[i][j-1] - 3 end = seqLen - orf[i][j] - 2 if start - end >= orfMinLength: bestOrf.append( ( start-end, (i+1)*-1, start, end ) ) bestOrf.sort() bestOrf.reverse() bestOrfNb = len(bestOrf) if orfMaxNb > bestOrfNb or orfMaxNb == 0 : orfMaxNb = bestOrfNb for i in xrange(0, orfMaxNb): if verbose > 0: print bestOrf[i] outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\ "|"+str(bestOrf[i][0]),bioseq.header, bestOrf[i][2],bestOrf[i][3])) inFile.close() outFile.close() return 0
def getLengthPerSeqFromFile( inFile ):
    """
    Return the length of each sequence of a fasta file.

    When several sequences share the same header, only the length of the last
    one is kept.

    @param inFile: name of the input fasta file
    @return: dictionary whose keys are headers and values are sequence lengths
    """
    dHeader2Length = {}
    with open( inFile, "r" ) as inFileHandler:
        while True:
            iBs = Bioseq()
            iBs.read( inFileHandler )
            # a sequence left at None is the end-of-file signal used by this loop
            if iBs.sequence is None:
                break
            dHeader2Length[ iBs.header ] = iBs.getLength()
    return dHeader2Length
def extractBioseqListFromFastaFile( fileName ):
    """
    Return the list of all the sequences of a fasta file.

    Reading stops at the first Bioseq whose header is still None after the read.

    @param fileName: name of the input fasta file
    @return: list of Bioseq instances, in file order
    """
    lBioseq = []
    # the handler is closed on exit (the builtin name 'file' is no longer shadowed either)
    with open( fileName ) as faFileHandler:
        while True:
            bioseq = Bioseq()
            bioseq.read( faFileHandler )
            if bioseq.header is None:
                break
            lBioseq.append( bioseq )
    return lBioseq
def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): if patternFileName == "": print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] inFile = open( inFileName, "r" ) while True: bioseq.read( inFile ) if bioseq.sequence == None: break bioseqNb += 1 lHeaders.append( bioseq.header ) inFile.close() patternFile = open( patternFileName, "r") lHeadersToRemove = [] for pattern in patternFile: if verbose > 0: print "pattern: ",pattern[:-1]; sys.stdout.flush() patternToSearch = re.compile( pattern[:-1] ) for h in lHeaders: if patternToSearch.search(h): lHeadersToRemove.append(h) patternFile.close() if outFileName == "": outFileName = inFileName + '.cleaned' outFile = open( outFileName, 'w' ) bioseqNum = 0 inFile=open( inFileName ) while True: bioseq.read( inFile ) bioseqNum += 1 if bioseq.sequence == None: break if bioseq.header not in lHeadersToRemove: bioseq.write( outFile ) if verbose > 1: print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() savedBioseqNb += 1 inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): if patternFileName == "": print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] inFile = open( inFileName, "r" ) while True: bioseq.read( inFile ) if bioseq.sequence == None: break lHeaders.append( bioseq.header ) inFile.close() lHeadersToKeep = [] patternFile = open( patternFileName, "r" ) for pattern in patternFile: if verbose > 0: print "pattern: ",pattern[:-1]; sys.stdout.flush() patternToSearch = re.compile(pattern[:-1]) for h in lHeaders: if patternToSearch.search(h): lHeadersToKeep.append(h) patternFile.close() if outFileName == "": outFileName = inFileName + ".extracted" outFile=open( outFileName, "w" ) inFile = open( inFileName, "r" ) while True: bioseq.read(inFile) if bioseq.sequence == None: break bioseqNb += 1 if bioseq.header in lHeadersToKeep: bioseq.write(outFile) if verbose > 1: print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() savedBioseqNb += 1 inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
def extractPatternOfFile(self, pattern, inFileName):
    """
    Add to this object the sequences of a fasta file whose header matches a pattern.

    Nothing is done when 'pattern' is empty.

    @param pattern: regular expression searched in each header
    @param inFileName: name of the input fasta file
    """
    if pattern == "":
        return

    srch = re.compile(pattern)
    with open(inFileName) as file_db:
        while True:
            # a new Bioseq per record: the instances that match are stored in this object
            seq = Bioseq()
            seq.read(file_db)
            # a sequence left at None is the end-of-file signal used by this loop
            if seq.sequence is None:
                break
            if srch.search(seq.header):
                self.add(seq)
def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0): if verbose > 0: print "sort sequences by increasing length" sys.stdout.flush() if not os.path.exists( inFileName ): print "ERROR: file '%s' doesn't exist" % ( inFileName ) sys.exit(1) # read each seq one by one # save them in distinct temporary files # with their length in the name inFileHandler = open( inFileName, "r" ) bs = Bioseq() countSeq = 0 while True: bs.read( inFileHandler ) if bs.header == None: break countSeq += 1 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq ) bs.appendBioseqInFile( tmpFile ) if verbose > 1: print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile ) bs.header = "" bs.sequence = "" inFileHandler.close() # sort temporary file names # concatenate them into the output file if os.path.exists( outFileName ): os.remove( outFileName ) lFiles = glob.glob( "*bp_*nb" ) lFiles.sort( key=lambda s:int(s.split("bp_")[0]) ) for fileName in lFiles: cmd = "cat %s >> %s" % ( fileName, outFileName ) returnValue = os.system( cmd ) if returnValue != 0: print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) sys.exit(1) os.remove( fileName ) return 0
def filterClassifiedConsensus( self ): inFile = open( self._inFaFile, "r" ) outFile = open( self._outFaFile, "w" ) bs = Bioseq() nbInSeq = 0 nbRmv = 0 if self._classifFile != "": dHeader2Classif = self.getClassifPerHeaderOfUnclassifiedConsensus() while True: bs.read( inFile ) if bs.header == None: break nbInSeq += 1 if self._verbose > 1: print bs.header if self._filterSSRs and "SSR" in bs.header and ( self._maxLengthToFilterSSRs == 0 or bs.getLength() <= self._maxLengthToFilterSSRs ): nbRmv += 1 if self._verbose > 1: print "filtered SSR !" elif self._filterHostGenes and "HostGene" in bs.header: nbRmv += 1 if self._verbose > 1: print "filtered HostGene !" elif self._filterConfused and "confused" in bs.header and "confusedness=no" not in bs.header: nbRmv += 1 if self._verbose > 1: print "filtered confused !" elif self._filterNoCat != "0" and "NoCat" in bs.header: keep = False if "2" in self._filterNoCat: algoMSA = "" for i in ["Map","MAP","Malign","Mafft","Prank","Clustalw","Muscle","Tcoffee"]: if i in bs.header: algoMSA = i nbAlignSeq = int( bs.header.split(algoMSA+"_")[1].split("|")[0] ) if nbAlignSeq > self._nbAlignSeqNoCat: keep = True if "3" in self._filterNoCat: for header in dHeader2Classif.keys(): if header in bs.header: if "no structural features" not in dHeader2Classif[header][6]: keep = True if keep: bs.write( outFile ) else: nbRmv += 1 if self._verbose > 1: print "filtered NoCat !" elif self._filterIncomplete and "completeness=incomp" in bs.header: nbRmv += 1 if self._verbose > 1: print "filtered incomplete !" else: bs.write( outFile ) inFile.close() outFile.close() if self._verbose > 0: print "nb of input seq: %i" % ( nbInSeq ) print "nb of filtered seq: %i" % ( nbRmv ) sys.stdout.flush()