def mergeFile( inFile, outFile="" ):
    """
    Merge the alignments of an 'align' file, one query-subject pair at a time.

    The input is first sorted into the temporary file '<inFile>.sorted' with
    AlignUtils.sortAlignFile(). Consecutive lines of the sorted file that share
    the same query name and subject name are gathered, merged with
    AlignUtils.mergeList() and appended to the output file with
    AlignUtils.writeListInFile(). The temporary file is always removed,
    even when an error occurs while reading or merging.

    @param inFile: name of the input file
    @type inFile: string
    @param outFile: name of the output file (default: '<inFile>.merged');
                    if it already exists it is removed first
    @type outFile: string
    """
    if outFile == "":
        outFile = "%s.merged" % ( inFile )
    # results are written in append mode, hence start from a clean slate
    if os.path.exists( outFile ):
        os.remove( outFile )

    tmpFile = "%s.sorted" % ( inFile )
    AlignUtils.sortAlignFile( inFile, tmpFile )

    try:
        with open( tmpFile, "r" ) as tmpF:
            # a tuple is used as key rather than "query_subject" so that names
            # containing '_' cannot make two distinct pairs look identical
            currentPair = None
            lAligns = []
            for line in tmpF:
                iAlign = Align()
                iAlign.setFromString( line )
                pair = ( iAlign.getQueryName(), iAlign.getSubjectName() )
                if pair != currentPair:
                    # a new pair starts: flush the previous one, if any
                    if lAligns:
                        lMerged = AlignUtils.mergeList( lAligns )
                        AlignUtils.writeListInFile( lMerged, outFile, "a" )
                    currentPair = pair
                    lAligns = []
                lAligns.append( iAlign )
            # flush the last pair (nothing to do when the file was empty)
            if lAligns:
                lMerged = AlignUtils.mergeList( lAligns )
                AlignUtils.writeListInFile( lMerged, outFile, "a" )
    finally:
        if os.path.exists( tmpFile ):
            os.remove( tmpFile )
def getAlignListFromFile( inFile ):
    """
    Return the list of Align instances built from the lines of a file.

    One new Align object is created per line and filled with
    Align.setFromString(). The file is closed even if parsing fails.

    @param inFile: name of the input file
    @type inFile: string
    @return: list of Align instances, in file order (empty for an empty file)
    """
    lAlignInstances = []
    with open( inFile, "r" ) as inFileHandler:
        for line in inFileHandler:
            iAlign = Align()
            iAlign.setFromString( line )
            lAlignInstances.append( iAlign )
    return lAlignInstances
def getScoreListFromFile( inFile ):
    """
    Return the list of the 'score' attribute of each alignment in a file.

    A single Align object is recycled: it is reset, then filled from each
    line with Align.setFromString(). The file is closed even if parsing fails.

    @param inFile: name of the input file
    @type inFile: string
    @return: list of scores, in file order (empty for an empty file)
    """
    lScores = []
    iAlign = Align()
    with open( inFile, "r" ) as inFileHandler:
        for line in inFileHandler:
            iAlign.reset()
            iAlign.setFromString( line )
            lScores.append( iAlign.score )
    return lScores
def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ):
    """
    Convert an 'align' file into a 'map' file, one record per alignment.

    For each input line, the object returned by
    Align.getSubjectAsMapOfQuery() is written to the output file.
    Both files are closed even if an error occurs.

    @param alignFile: name of the input file
    @type alignFile: string
    @param mapFile: name of the output file (overwritten)
    @type mapFile: string
    """
    # the input is opened first so that a missing input does not truncate the output
    with open( alignFile, "r" ) as alignFileHandler:
        with open( mapFile, "w" ) as mapFileHandler:
            iAlign = Align()
            for line in alignFileHandler:
                iAlign.setFromString( line )
                iMapQ = iAlign.getSubjectAsMapOfQuery()
                iMapQ.write( mapFileHandler )
def convertAlignFileIntoPathFile( alignFile, pathFile ):
    """
    Convert an 'align' file into a 'path' file.

    Each tab-separated input line is parsed into an Align object and written
    back as '<n>\\t<Align.toString()>', where n is the 1-based rank of the
    line in the input file. Both files are closed even if an error occurs.

    @param alignFile: name of the input file
    @type alignFile: string
    @param pathFile: name of the output file (overwritten)
    @type pathFile: string
    """
    # the input is opened first so that a missing input does not truncate the output
    with open( alignFile, "r" ) as alignFileHandler:
        with open( pathFile, "w" ) as pathFileHandler:
            iAlign = Align()
            for countAlign, line in enumerate( alignFileHandler, 1 ):
                iAlign.setFromString( line, "\t" )
                pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) )
def getAlignInstance(self):
    """
    Return a new Align instance filled from this object's attributes.

    The query and subject ranges (seqname, start, end), the e-value, the
    score and the identity are handed over, in that order, to
    Align.setFromTuple().

    @return: a new Align instance
    """
    qry = self.range_query
    sbj = self.range_subject
    lFields = [
        qry.seqname, qry.start, qry.end,
        sbj.seqname, sbj.start, sbj.end,
        self.e_value,
        self.score,
        self.identity,
    ]
    newAlign = Align()
    newAlign.setFromTuple( lFields )
    return newAlign
def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ):
    """
    Convert an 'align' file into a 'map' file, two records per alignment.

    For each input line, the two objects returned by
    Align.getMapsOfQueryAndSubject() are written to the output file,
    the query one first, then the subject one.
    Both files are closed even if an error occurs.

    @param alignFile: name of the input file
    @type alignFile: string
    @param mapFile: name of the output file (overwritten)
    @type mapFile: string
    """
    # the input is opened first so that a missing input does not truncate the output
    with open( alignFile, "r" ) as alignFileHandler:
        with open( mapFile, "w" ) as mapFileHandler:
            iAlign = Align()
            for line in alignFileHandler:
                iAlign.setFromString( line )
                iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
                iMapQ.write( mapFileHandler )
                iMapS.write( mapFileHandler )
def filterRedundantMatches( inFile, outFile ):
    """
    Filter the redundant matches of an ~ all-by-all pairwise alignment
    (i.e. one batch against all chunks).

    For instance we keep 'chunk3-1-100-chunk7-11-110-...' and we discard
    'chunk7-11-110-chunk3-1-100-...'.
    Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
    'chunk5-11-110-chunk5-1-100-...'.
    For this of course the results need to be sorted by query, on plus strand,
    and in ascending coordinates (always the case with Blaster).

    A match is kept when the chunk index of the query is lower than the one
    of the subject, or when both are equal and the query minimum coordinate
    is lower than the subject one. The chunk index is the integer following
    the first occurrence of 'chunk' in the sequence name.

    If a sequence name does not contain 'chunk', an error message is printed
    and the program exits with status 1. Both files are closed in every case.

    @param inFile: name of the input file
    @type inFile: string
    @param outFile: name of the output file (overwritten)
    @type outFile: string
    """
    tick = 100000
    with open( inFile, "r" ) as inFileHandler:
        with open( outFile, "w" ) as outFileHandler:
            iAlign = Align()
            for countMatches, line in enumerate( inFileHandler, 1 ):
                iAlign.setFromString( line )
                qryName = iAlign.range_query.seqname
                sbjName = iAlign.range_subject.seqname
                if "chunk" not in qryName or "chunk" not in sbjName:
                    print( "ERROR: 'chunk' not in seqname" )
                    sys.exit(1)
                # parse each chunk index only once per match
                qryChunk = int( qryName.split("chunk")[1] )
                sbjChunk = int( sbjName.split("chunk")[1] )
                if qryChunk < sbjChunk:
                    iAlign.write( outFileHandler )
                elif qryChunk == sbjChunk:
                    if iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                        iAlign.write( outFileHandler )
                if countMatches % tick == 0:
                    # need to free buffer frequently as file can be big
                    outFileHandler.flush()
                    os.fsync( outFileHandler.fileno() )
def retrieveInitialSequenceHeadersForAlignFile( self, dNew2Init ):
    """
    Copy the input 'align' file to the output file while replacing sequence
    names by their initial headers.

    Each line of self._inFile is split on tabs and loaded into an Align
    object; the query and subject names found as keys of dNew2Init are
    replaced by the corresponding values, other names are left untouched.
    The result is written to self._outFile.
    Both files are closed even if an error occurs.

    @param dNew2Init: dictionary whose keys are the names to be replaced and
                      values the names to use instead
    @type dNew2Init: dictionary
    """
    # the input is opened first so that a missing input does not truncate the output
    with open( self._inFile, "r" ) as inFileHandler:
        with open( self._outFile, "w" ) as outFileHandler:
            iAlign = Align()
            for line in inFileHandler:
                iAlign.setFromTuple( line.split("\t") )
                # same renaming rule for the query, then for the subject
                for iRange in ( iAlign.range_query, iAlign.range_subject ):
                    nameToBeReplaced = iRange.seqname
                    if nameToBeReplaced in dNew2Init:
                        iRange.seqname = dNew2Init[ nameToBeReplaced ]
                iAlign.write( outFileHandler )
def updateScoresInFile( inFile, outFile ):
    """
    Copy an 'align' file while recomputing the score of each alignment.

    A single Align object is recycled: for each tab-separated input line it
    is reset, filled with Align.setFromString(), updated with
    Align.updateScore() and written to the output file.
    Both files are closed even if an error occurs.

    @param inFile: name of the input file
    @type inFile: string
    @param outFile: name of the output file (overwritten)
    @type outFile: string
    """
    # the input is opened first so that a missing input does not truncate the output
    with open( inFile, "r" ) as inHandler:
        with open( outFile, "w" ) as outHandler:
            iAlign = Align()
            for line in inHandler:
                iAlign.reset()
                iAlign.setFromString( line, "\t" )
                iAlign.updateScore()
                iAlign.write( outHandler )
def toString(self):
    """
    Return the attributes as a tab-separated string: the identifier
    (formatted as an integer) followed by the output of Align.toString().
    """
    idField = "%i" % ( self.id )
    return "%s\t%s" % ( idField, Align.toString(self) )
def reset(self):
    """
    Set the identifier back to -1, then let Align.reset() reinitialize
    the remaining attributes.
    """
    self.id = -1
    Align.reset( self )
def setFromTuple(self, tuple):
    """
    Set the attributes from a sequence of fields.

    @param tuple: sequence whose first item is the identifier (converted with
                  int()) and whose remaining items are passed on to
                  Align.setFromTuple()
    """
    self.id = int( tuple[0] )
    alignFields = tuple[1:]
    Align.setFromTuple( self, alignFields )
def __eq__(self, o):
    """
    Two instances are equal when they carry the same identifier and
    Align.__eq__() also considers them equal.

    @param o: the object to compare with; it must expose an 'id' attribute
    """
    # different identifiers: no need to compare the alignment itself
    if self.id != o.id:
        return False
    return Align.__eq__( self, o )
def __init__( self, id=-1, range_q=None, range_s=None, e_value=0, score=0, identity=0 ):
    """
    Constructor.

    @param id: identifier, converted with int() (default: -1)
    @param range_q: Range of the query (default: a new, empty Range)
    @param range_s: Range of the subject (default: a new, empty Range)
    @param e_value: E-value, passed on to Align.__init__() (default: 0)
    @param score: score, passed on to Align.__init__() (default: 0)
    @param identity: identity, passed on to Align.__init__() (default: 0)
    """
    # Default values are evaluated only once, when the function is defined:
    # a 'Range()' default would be one single object shared by every
    # instance built without explicit ranges. Build a fresh one per call.
    if range_q is None:
        range_q = Range()
    if range_s is None:
        range_s = Range()
    self.id = int( id )
    Align.__init__( self, range_q, range_s, e_value, score, identity )