def filterRedundantMatches( inFile, outFile ):
    """
    Filter the redundant matches out of a pairwise alignment file.

    When a pairwise alignment is launched ~ all-by-all (i.e. one batch against
    all chunks), one filters the redundant matches. For instance we keep
    'chunk3-1-100-chunk7-11-110-...' and we discard
    'chunk7-11-110-chunk3-1-100-...'. Also we keep
    'chunk5-1-100-chunk5-11-110-...' and we discard
    'chunk5-11-110-chunk5-1-100-...'.
    For this, of course, the results need to be sorted by query, on plus
    strand, and in ascending coordinates (always the case with Blaster).

    inFile: name of the input file, read line by line; each line is handed to
        Align.setFromString().
    outFile: name of the output file, overwritten with the alignments kept.

    If a query or subject name does not contain 'chunk', an error message is
    printed and the process exits with status 1. A ValueError is raised if the
    text following 'chunk' is not an integer.
    """
    tick = 100000  # number of input lines between two forced syncs to disk
    iAlign = Align()
    countMatches = 0
    # 'with' guarantees both files are closed, including on the sys.exit()
    # path and when a line cannot be parsed.
    with open( inFile, "r" ) as inFileHandler:
        with open( outFile, "w" ) as outFileHandler:
            for line in inFileHandler:
                countMatches += 1
                iAlign.setFromString( line )
                queryName = iAlign.range_query.seqname
                subjectName = iAlign.range_subject.seqname
                if "chunk" not in queryName or "chunk" not in subjectName:
                    print( "ERROR: 'chunk' not in seqname" )
                    sys.exit(1)
                # The chunk number is the text between the first and the
                # (optional) second occurrence of 'chunk' in the name.
                queryChunk = int( queryName.split("chunk")[1] )
                subjectChunk = int( subjectName.split("chunk")[1] )
                if queryChunk < subjectChunk:
                    iAlign.write( outFileHandler )
                elif queryChunk == subjectChunk \
                        and iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                    # Same chunk: keep only the match whose query starts first.
                    iAlign.write( outFileHandler )
                if countMatches % tick == 0:
                    # need to free buffer frequently as file can be big
                    outFileHandler.flush()
                    os.fsync( outFileHandler.fileno() )
def updateScoresInFile( inFile, outFile ):
    """
    Copy the alignments of a file into another one after updating their score.

    Each line of the input file is loaded into an Align object with
    setFromString() (tab as separator), updateScore() is called on it and the
    result is written to the output file.

    inFile: name of the input file, one alignment per line.
    outFile: name of the output file (overwritten).
    """
    iAlign = Align()
    # 'with' closes both files even if a line cannot be parsed or written.
    with open( inFile, "r" ) as inHandler:
        with open( outFile, "w" ) as outHandler:
            for line in inHandler:
                # Clear the previous alignment before loading the next one.
                iAlign.reset()
                iAlign.setFromString( line, "\t" )
                iAlign.updateScore()
                iAlign.write( outHandler )
def retrieveInitialSequenceHeadersForAlignFile( self, dNew2Init ):
    """
    Rewrite the alignment file with the initial sequence headers restored.

    Reads self._inFile line by line, replaces the query and subject sequence
    names found as keys in dNew2Init by the associated values, and writes
    every alignment (renamed or not) to self._outFile.

    dNew2Init: dictionary whose keys are the current sequence names and whose
        values are the names to put back; names absent from it are left
        untouched.
    """
    iAlign = Align()
    # 'with' closes both files even if a line cannot be parsed or written.
    with open( self._inFile, "r" ) as inFileHandler:
        with open( self._outFile, "w" ) as outFileHandler:
            for line in inFileHandler:
                # NOTE(review): the last field keeps its end-of-line character;
                # presumably setFromTuple() copes with it - confirm.
                iAlign.setFromTuple( line.split("\t") )
                for iRange in ( iAlign.range_query, iAlign.range_subject ):
                    # 'in' replaces dict.has_key(), which no longer exists
                    # in Python 3.
                    if iRange.seqname in dNew2Init:
                        iRange.seqname = dNew2Init[ iRange.seqname ]
                iAlign.write( outFileHandler )