def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Evaluate mapped simulated reads against the truth encoded in their names.

    Read names are expected to look like ``<utr>_<x>_<tcCount>_...``: field 0 is
    the UTR the read was simulated from and field 2 the simulated T->C count.
    Every primary read's mapped interval is intersected with the BED interval
    tree; a per-read comparison row is written to ``outputFile`` and summary
    percentages (correct / correct position but wrong TC / wrong position) are
    printed to stdout.

    :param bam: path to the BAM file with mapped simulated reads
    :param referenceFile: reference FASTA used to construct the SlamSeqBamFile
    :param bed: BED file describing the UTR intervals
    :param outputFile: path of the per-read evaluation TSV written here
    :param mainOutput: unused in this function; kept for interface compatibility
    """
    print("Run " + bam)

    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    # Context manager guarantees the TSV handle is closed even if iteration
    # raises (the original version leaked the open file handle).
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr",
              "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        # Go through one chromosome after the other
        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minBaseQual)
            for read in readIterator:
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                # First overlapping BED interval (if any) is taken as the
                # UTR the read actually mapped to.
                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if len(overlaps) > 0:
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim,
                      utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    # Guard against ZeroDivisionError when the BAM contains no reads.
    if total > 0:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total,
              wrongPos * 100.0 / total, total)
    else:
        print(0.0, 0.0, 0.0, total)
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Evaluate mapped simulated reads against the truth encoded in their names.

    Read names are expected to look like ``<utr>_<x>_<tcCount>_...``: field 0 is
    the UTR the read was simulated from and field 2 the simulated T->C count.
    Every primary read's mapped interval is intersected with the BED interval
    tree; a per-read comparison row is written to ``outputFile`` and the raw
    summary counts (correct / correct position but wrong TC / wrong position /
    total) are printed to stdout.

    :param bam: path to the BAM file with mapped simulated reads
    :param referenceFile: reference FASTA used to construct the SlamSeqBamFile
    :param bed: BED file describing the UTR intervals
    :param outputFile: path of the per-read evaluation TSV written here
    :param mainOutput: unused in this function; kept for interface compatibility
    """
    print("Run " + bam)

    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    # Context manager guarantees the TSV handle is closed even if iteration
    # raises (the original version leaked the open file handle).
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr",
              "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        # Go through one chromosome after the other
        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minBaseQual)
            for read in readIterator:
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                # First overlapping BED interval (if any) is taken as the
                # UTR the read actually mapped to.
                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if len(overlaps) > 0:
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim,
                      utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    print(correct, correcPosWrongTC, wrongPos, total)
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log):
    """Filter a BAM stream, retaining multimappers whose alignments stay in one UTR.

    Unique mappers (MAPQ > 0) that pass the identity and NM filters are written
    straight to ``outfile``. Multimappers (MAPQ == 0) are buffered per read name,
    keyed by the UTR (BED entry) each alignment overlaps; when all alignments of
    a read fall into the same UTR the buffered group is dumped via
    ``dumpBufferToBam``, otherwise it is discarded.

    :param infile: pysam AlignmentFile opened for reading
    :param outfile: pysam AlignmentFile opened for writing
    :param bed: BED file of UTR intervals, converted to an interval tree
    :param minIdentity: minimum alignment identity (XI tag) to keep a read
    :param NM: maximum edit distance (NM tag); -1 disables the filter
    :param log: open file-like object receiving the filter summary table
    :return: (mappedReads, unmappedReads, filteredReads, mqFiltered,
              idFiltered, nmFiltered, multimapper)
    """
    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0
    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0

    utrIntervallTreeDict = bedToIntervallTree(bed)

    # Buffers for multimappers: UTR id -> list of alignments of the current read
    multimapBuffer = {}
    prevRead = ""
    # If read maps to another than previously recorded UTR -> do not dump reads to file
    dumpBuffer = True
    # This string tracks all multiple alignments of the current read
    multimapList = ""

    for read in infile:
        # Count mapped/unmapped on primary records only
        if not read.is_secondary and not read.is_supplementary:
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # First pass general filters
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if NM > -1 and int(read.get_tag("NM")) > NM:
            nmFiltered += 1
            continue

        if read.mapping_quality == 0:
            # Previous read was also a multimapper -> flush its buffer first
            if read.query_name != prevRead and prevRead != "":
                if dumpBuffer and len(multimapBuffer) > 0:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                dumpBuffer = True
                multimapList = ""
                multimapBuffer = {}

            # Query interval tree of the read's chromosome for UTR overlaps.
            # NOTE: dict.has_key() was removed in Python 3; use "in" instead.
            # get_reference_name replaces the deprecated pysam getrname().
            chr = infile.get_reference_name(read.reference_id)
            start = read.reference_start
            end = read.reference_end
            if chr in utrIntervallTreeDict:
                query = utrIntervallTreeDict[chr][start:end]
            else:
                query = set()

            if len(query) > 0:
                # First UTR hit is recorded without checks
                if len(multimapBuffer) == 0:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                        multimapBuffer[result.data].append(read)
                # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR
                else:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                            multimapBuffer[result.data].append(read)
                            dumpBuffer = False
                        else:
                            multimapBuffer[result.data].append(read)

            multimapList = multimapList + chr + ":" + str(start) + "-" + str(end) + " "
            prevRead = read.query_name
        else:
            # Dump any multimappers buffered before a unique mapper
            if len(multimapBuffer) > 0:
                if dumpBuffer:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                multimapBuffer = {}
                dumpBuffer = True
                multimapList = ""

            # Record all unique mappers
            prevRead = read.query_name
            outfile.write(read)
            filteredReads += 1

    # Dump the last buffered group if the stream ended on a multimapper
    if dumpBuffer and len(multimapBuffer) > 0:
        dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
        filteredReads += 1

    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    print("MQ < 0\t0", file=log)
    print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
    print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
    print("MM\t" + str(multimapper), file=log)

    return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log):
    """Filter a BAM stream, retaining multimappers whose alignments stay in one UTR.

    Unique mappers (MAPQ > 0) that pass the identity and NM filters are written
    straight to ``outfile``. Multimappers (MAPQ == 0) are buffered per read name,
    keyed by the UTR (BED entry) each alignment overlaps; when all alignments of
    a read fall into the same UTR the buffered group is dumped via
    ``dumpBufferToBam``, otherwise it is discarded.

    :param infile: pysam AlignmentFile opened for reading
    :param outfile: pysam AlignmentFile opened for writing
    :param bed: BED file of UTR intervals, converted to an interval tree
    :param minIdentity: minimum alignment identity (XI tag) to keep a read
    :param NM: maximum edit distance (NM tag); -1 disables the filter
    :param log: open file-like object receiving the filter summary table
    :return: (mappedReads, unmappedReads, filteredReads, mqFiltered,
              idFiltered, nmFiltered, multimapper)
    """
    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0
    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0

    utrIntervallTreeDict = bedToIntervallTree(bed)

    # Buffers for multimappers: UTR id -> list of alignments of the current read
    multimapBuffer = {}
    prevRead = ""
    # If read maps to another than previously recorded UTR -> do not dump reads to file
    dumpBuffer = True
    # This string tracks all multiple alignments of the current read
    multimapList = ""

    for read in infile:
        # Count mapped/unmapped on primary records only
        if not read.is_secondary and not read.is_supplementary:
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # First pass general filters
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if NM > -1 and int(read.get_tag("NM")) > NM:
            nmFiltered += 1
            continue

        if read.mapping_quality == 0:
            # Previous read was also a multimapper -> flush its buffer first
            if read.query_name != prevRead and prevRead != "":
                if dumpBuffer and len(multimapBuffer) > 0:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                dumpBuffer = True
                multimapList = ""
                multimapBuffer = {}

            # Query interval tree of the read's chromosome for UTR overlaps.
            # get_reference_name replaces the deprecated pysam getrname().
            chr = infile.get_reference_name(read.reference_id)
            start = read.reference_start
            end = read.reference_end
            if chr in utrIntervallTreeDict:
                query = utrIntervallTreeDict[chr][start:end]
            else:
                query = set()

            if len(query) > 0:
                # First UTR hit is recorded without checks
                if len(multimapBuffer) == 0:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                        multimapBuffer[result.data].append(read)
                # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR
                else:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                            multimapBuffer[result.data].append(read)
                            dumpBuffer = False
                        else:
                            multimapBuffer[result.data].append(read)

            multimapList = multimapList + chr + ":" + str(start) + "-" + str(end) + " "
            prevRead = read.query_name
        else:
            # Dump any multimappers buffered before a unique mapper
            if len(multimapBuffer) > 0:
                if dumpBuffer:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                multimapBuffer = {}
                dumpBuffer = True
                multimapList = ""

            # Record all unique mappers
            prevRead = read.query_name
            outfile.write(read)
            filteredReads += 1

    # Dump the last buffered group if the stream ended on a multimapper
    if dumpBuffer and len(multimapBuffer) > 0:
        dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
        filteredReads += 1

    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    print("MQ < 0\t0", file=log)
    print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
    print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
    print("MM\t" + str(multimapper), file=log)

    return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, MQ, log):
    """Filter a BAM stream, retaining multimappers whose alignments stay in one BED region.

    Unique mappers (MAPQ >= MQ) that pass the identity and NM filters are
    written straight to ``outfile``. Low-MAPQ reads (MAPQ < MQ) are buffered per
    read name, keyed by the BED entry each alignment overlaps; when all
    alignments of a read fall into the same region the buffered group is dumped
    via ``dumpBufferToBam``, otherwise it is discarded. Because sub-threshold
    reads are handled through this retention path, ``mqFiltered`` stays 0 by
    design.

    :param infile: pysam AlignmentFile opened for reading
    :param outfile: pysam AlignmentFile opened for writing
    :param bed: BED file (any entity type), converted to an interval tree
    :param minIdentity: minimum alignment identity (XI tag) to keep a read
    :param NM: maximum edit distance (NM tag); -1 disables the filter
    :param MQ: mapping-quality threshold separating unique mappers from multimappers
    :param log: open file-like object receiving the filter summary table
    :return: (mappedReads, unmappedReads, filteredReads, mqFiltered,
              idFiltered, nmFiltered, multimapper)
    """
    mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered = 0, 0, 0, 0, 0, 0

    # Is interpreted now to represent any entity in the bed file
    utrIntervallTreeDict = bedToIntervallTree(bed)

    # Buffers for multimappers: BED entry -> list of alignments of the current read
    multimapBuffer = {}
    prevRead = ""
    # If read maps to another than previously recorded UTR -> do not dump reads to file
    dumpBuffer = True
    # This string tracks all multiple alignments of the current read
    multimapList = ""

    for read in infile:
        # infile is AlignedFile according to pysam definition. Read is AlignedSegment
        # Count mapped/unmapped on primary records only
        if not read.is_secondary and not read.is_supplementary:
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # First pass general filters
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if -1 < NM < int(read.get_tag("NM")):
            nmFiltered += 1
            continue

        if read.mapping_quality < MQ:
            # Previous read was also a multimapper -> flush its buffer first
            if read.query_name != prevRead and prevRead != "":
                if dumpBuffer and len(multimapBuffer) > 0:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                dumpBuffer = True
                multimapList = ""
                multimapBuffer = {}

            # Query interval tree of the read's chromosome for BED overlaps
            chr = infile.get_reference_name(read.reference_id)
            start = read.reference_start
            end = read.reference_end
            if chr in utrIntervallTreeDict:
                # This makes sure that the mapping is in a bed region
                query = utrIntervallTreeDict[chr][start:end]
            else:
                query = set()

            if len(query) > 0:
                # First UTR hit is recorded without checks
                if len(multimapBuffer) == 0:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                        multimapBuffer[result.data].append(read)
                # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR
                else:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                            multimapBuffer[result.data].append(read)
                            dumpBuffer = False
                        else:
                            multimapBuffer[result.data].append(read)

            multimapList = multimapList + chr + ":" + str(start) + "-" + str(end) + " "
            prevRead = read.query_name
        else:  # read.mapping_quality >= MQ -> treated as unique mapper
            # Dump any multimappers buffered before a unique mapper
            if len(multimapBuffer) > 0:
                if dumpBuffer:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                multimapBuffer = {}
                dumpBuffer = True
                multimapList = ""

            # Record all unique mappers
            prevRead = read.query_name
            outfile.write(read)
            filteredReads += 1

    # Dump the last buffered group if the stream ended on a multimapper
    if dumpBuffer and len(multimapBuffer) > 0:
        dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
        filteredReads += 1

    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    # Report the actual MQ threshold (the old hard-coded "MQ < 0\t0" predated
    # the MQ parameter). mqFiltered is 0 by design, see docstring.
    print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
    print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
    print("NM > %s\t%s" % (NM, nmFiltered), file=log)
    print("MM\t%s" % multimapper, file=log)

    return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper