# NOTE: this excerpt assumes the surrounding slamdunk module scope is present:
# os, sys, random, pysam, Bio.SeqIO as SeqIO, plus the slamdunk helpers used
# below (getSampleInfo, SlamSeqInfo, SlamSeqBamFile, SlamSeqInterval,
# ReadDirection, SNPtools, BedIterator, md5, replaceExtension, callR,
# getPlotter, checkStep, bamSort, pysamIndex, run, multimapUTRRetainment,
# parseUtrBedFile, getUtrName, computeConversionRate, addTcConversionsToReads,
# printUtrSummary, __version__, __count_version__, __bam_version__).

def computeTconversions(ref, bed, snpsFile, bam, maxReadLength, minQual,
                        outputCSV, outputBedgraphPlus, outputBedgraphMinus,
                        conversionThreshold, log, mle=False):

    referenceFile = pysam.FastaFile(ref)

    sampleInfo = getSampleInfo(bam)

    slamseqInfo = SlamSeqInfo(bam)
    #readNumber = slamseqInfo.MappedReads
    readNumber = slamseqInfo.FilteredReads

    bedMD5 = md5(bed)

    if (mle):
        fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
        fileTest = open(fileNameTest, 'w')
        print("#slamdunk v" + __version__, __count_version__, "sample info:",
              sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time,
              sep="\t", file=fileTest)
        print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=fileTest)
        #print("utr", "n", "k", file=fileTest)
        print(SlamSeqInterval.Header, file=fileTest)

    fileCSV = open(outputCSV, 'w')
    print("#slamdunk v" + __version__, __count_version__, "sample info:",
          sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time,
          sep="\t", file=fileCSV)
    print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=fileCSV)
    print(SlamSeqInterval.Header, file=fileCSV)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chromosome after the other
    testFile = SlamSeqBamFile(bam, ref, snps)
    if not testFile.bamVersion == __bam_version__:
        raise RuntimeError("Wrong filtered BAM file version detected (" +
                           testFile.bamVersion + "). Expected version " +
                           __bam_version__ + ". Please rerun slamdunk filter.")

    if slamseqInfo.AnnotationMD5 != bedMD5:
        print("Warning: MD5 checksum of annotation (" + bedMD5 +
              ") does not match the MD5 in the filtered BAM file (" +
              slamseqInfo.AnnotationMD5 +
              "). Most probably the annotation file changed after the filtered BAM files were created.",
              file=log)

    conversionBedGraph = {}

    for utr in BedIterator(bed):

        Tcontent = 0
        slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                     utr.strand, utr.name, Tcontent,
                                     0, 0, 0, 0, 0, 0, 0)
        slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                        utr.strand, utr.name, Tcontent,
                                        0, 0, 0, 0, 0, 0, 0)

        if (not utr.hasStrand()):
            raise RuntimeError("Input BED file does not contain stranded intervals.")

        if utr.start < 0:
            raise RuntimeError("Negative start coordinate found. Please check "
                               "the following entry in your BED file: " + str(utr))

        # Retrieve reference sequence
        region = utr.chromosome + ":" + str(utr.start + 1) + "-" + str(utr.stop)

        if (utr.chromosome in list(referenceFile.references)):
            #print(region, file=sys.stderr)
            # pysam-0.15.0.1
            #refSeq = referenceFile.fetch(region=region).upper()
            refSeq = referenceFile.fetch(reference=utr.chromosome,
                                         start=utr.start, end=utr.stop).upper()

            if (utr.strand == "-"):
                #refSeq = complement(refSeq[::-1])
                Tcontent = refSeq.count("A")
            else:
                Tcontent = refSeq.count("T")

            slamSeqUtr._Tcontent = Tcontent

        readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop,
                                             utr.strand, maxReadLength, minQual,
                                             conversionThreshold)

        tcCountUtr = [0] * utr.getLength()
        coverageUtr = [0] * utr.getLength()

        tInReads = []
        tcInRead = []

        countFwd = 0
        tcCountFwd = 0
        countRev = 0
        tCountRev = 0

        multiMapFwd = 0
        multiMapRev = 0

        for read in readIterator:

            # Overwrite any conversions for non-TC reads
            # (reads with fewer than conversionThreshold T>C conversions)
            if (not read.isTcRead):
                read.tcCount = 0
                read.mismatches = []
                read.conversionRates = 0.0
                read.tcRate = 0.0

            if (read.direction == ReadDirection.Reverse):
                countRev += 1
                if read.tcCount > 0:
                    tCountRev += 1
                if read.isMultimapper:
                    multiMapRev += 1
            else:
                countFwd += 1
                if read.tcCount > 0:
                    tcCountFwd += 1
                if read.isMultimapper:
                    multiMapFwd += 1

            for mismatch in read.mismatches:
                if (mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)
                        and 0 <= mismatch.referencePosition < utr.getLength()):
                    tcCountUtr[mismatch.referencePosition] += 1

            testN = read.getTcount()
            testk = 0
            for mismatch in read.mismatches:
                if (0 <= mismatch.referencePosition < utr.getLength()):
                    if (mismatch.isT(read.direction == ReadDirection.Reverse)):
                        testN += 1
                    if (mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)):
                        testk += 1
            #print(utr.name, read.name, read.direction, testN, testk, read.sequence, sep="\t")
            tInReads.append(testN)
            tcInRead.append(testk)
            #print(utr.name, testN, testk, sep="\t", file=fileTest)

            for i in range(read.startRefPos, read.endRefPos):
                if (0 <= i < utr.getLength()):
                    coverageUtr[i] += 1

        if ((utr.strand == "+" and countFwd > 0) or (utr.strand == "-" and countRev > 0)):
            tcRateUtr = [x * 100.0 / y if y > 0 else 0
                         for x, y in zip(tcCountUtr, coverageUtr)]

            readCount = countFwd
            tcReadCount = tcCountFwd
            multiMapCount = multiMapFwd

            if (utr.strand == "-"):
                readCount = countRev
                tcReadCount = tCountRev
                multiMapCount = multiMapRev

            if ((utr.strand == "-" and countFwd > countRev) or
                    (utr.strand == "+" and countRev > countFwd)):
                print("Warning: " + utr.name + " is located on the " + utr.strand +
                      " strand but read counts are higher for the opposite strand (fwd: " +
                      str(countFwd) + ", rev: " + str(countRev) + ")", file=sys.stderr)

            refSeq = readIterator.getRefSeq()

            # Number of covered Ts/As in the UTR and average conversion rate over them
            coveredTcount = 0
            avgConversionRate = 0
            coveredPositions = 0
            # Number of reads on T positions and reads with T->C conversions on T positions
            coverageOnTs = 0
            conversionsOnTs = 0

            for position in range(0, len(coverageUtr)):
                if (coverageUtr[position] > 0 and
                        ((utr.strand == "+" and refSeq[position] == "T") or
                         (utr.strand == "-" and refSeq[position] == "A"))):
                    coveredTcount += 1
                    avgConversionRate += tcRateUtr[position]

                    coverageOnTs += coverageUtr[position]
                    conversionsOnTs += tcCountUtr[position]

                    conversionBedGraph[utr.chromosome + ":" +
                                       str(utr.start + position) + ":" +
                                       str(utr.strand)] = tcRateUtr[position]
                if (coverageUtr[position] > 0):
                    coveredPositions += 1

            if (coveredTcount > 0):
                avgConversionRate = avgConversionRate / coveredTcount
            else:
                avgConversionRate = 0

            # Reads per million mapped to the UTR
            readsCPM = 0
            if (readNumber > 0):
                readsCPM = readCount * 1000000.0 / readNumber

            # Convert to SlamSeqInterval and print
            conversionRate = 0
            if (coverageOnTs > 0):
                conversionRate = float(conversionsOnTs) / float(coverageOnTs)

            slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                         utr.strand, utr.name, Tcontent, readsCPM,
                                         coverageOnTs, conversionsOnTs, conversionRate,
                                         readCount, tcReadCount, multiMapCount)
            slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                            utr.strand, utr.name, Tcontent, readsCPM,
                                            coverageOnTs, conversionsOnTs, conversionRate,
                                            ",".join(str(x) for x in tInReads),
                                            ",".join(str(x) for x in tcInRead),
                                            multiMapCount)

        print(slamSeqUtr, file=fileCSV)
        if (mle):
            print(slamSeqUtrMLE, file=fileTest)

    fileCSV.close()
    if (mle):
        fileTest.close()

    fileBedgraphPlus = open(outputBedgraphPlus, 'w')
    fileBedgraphMinus = open(outputBedgraphMinus, 'w')

    for position in conversionBedGraph:
        positionData = position.split(":")
        if (positionData[2] == "+"):
            print(positionData[0], positionData[1], int(positionData[1]) + 1,
                  conversionBedGraph[position], file=fileBedgraphPlus)
        else:
            print(positionData[0], positionData[1], int(positionData[1]) + 1,
                  conversionBedGraph[position], file=fileBedgraphMinus)

    fileBedgraphPlus.close()
    fileBedgraphMinus.close()

    if (mle):
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        callR(getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest +
              " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")
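# ---------------------------------------------------------------------------
# Usage sketch for computeTconversions(). All file names and threshold values
# below are illustrative, not slamdunk defaults: the function reads a filtered
# slamdunk BAM together with its annotation BED and a SNP file, then writes
# the per-UTR count table plus one conversion-rate bedGraph per strand.
#
#   with open("sample1_tcount.log", "w") as log:
#       computeTconversions(
#           ref="genome.fa",                      # reference used for mapping
#           bed="utrs.bed",                       # stranded UTR annotation
#           snpsFile="sample1_snp.vcf",           # SNPs masked during counting
#           bam="sample1_filtered.bam",           # output of slamdunk filter
#           maxReadLength=100,
#           minQual=27,
#           outputCSV="sample1_tcount.tsv",
#           outputBedgraphPlus="sample1_tcount_plus.bedgraph",
#           outputBedgraphMinus="sample1_tcount_minus.bedgraph",
#           conversionThreshold=2,                # >= 2 T>C to call a TC read
#           log=log,
#           mle=False)
# ---------------------------------------------------------------------------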
def addTcConversions(bed, readInFile, readOutFile, pulseTimePoint, chaseTimePoint,
                     utrSummaryFile, conversionRate, librarySize, sampleInfo,
                     labeledTranscripts=-1.0):

    # Read UTRs from BED file
    utrs = parseUtrBedFile(bed)

    readOutTemp = readOutFile + "_tmp.sam"
    #bamheader = { 'HD': {'VN': '1.0'} }
    #readOutBAM = pysam.AlignmentFile(readOutTemp, "wb", header=bamheader, add_sq_text=False)
    readOutSAM = open(readOutTemp, "w")
    print("@HD\tVN:1.0\tSO:unsorted", file=readOutSAM)

    utrSummary = open(utrSummaryFile, "w")

    bedMD5 = md5(bed)
    print("#slamdunk v" + __version__, __count_version__, "sample info:",
          sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time,
          sep="\t", file=utrSummary)
    print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=utrSummary)
    print(SlamSeqInterval.Header, file=utrSummary)

    reads = []
    lastUtrName = None
    utrName = None

    fasta_sequences = SeqIO.parse(open(readInFile), 'fasta')
    for entry in fasta_sequences:
        # TODO: Uncomment to go back to pysam
        #with pysam.FastxFile(readInFile) as fh:
        #    for entry in fh:
        #        utrName = getUtrName(entry.name)
        utrName = getUtrName(entry.id)
        if (utrName == lastUtrName):
            reads.append(entry)
        elif (lastUtrName is None):
            reads.append(entry)
        else:
            # A new UTR starts: flush all reads collected for the previous one
            readsCPM = len(reads) * 1000000.0 / librarySize
            readToConvertPercent = computeConversionRate(utrs[lastUtrName].score,
                                                         pulseTimePoint, chaseTimePoint,
                                                         labeledTranscripts)
            readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(
                utrs[lastUtrName], reads, readToConvertPercent, conversionRate, readOutSAM)
            printUtrSummary(utrs[lastUtrName], len(reads), readsWithTC, totalTCount,
                            totalTcCount, utrSummary, readsCPM, readToConvertPercent)
            # Keep the entry that triggered the switch as the first read of the new UTR
            reads = [entry]
        lastUtrName = utrName

    # Last UTR (skip if the input contained no reads at all)
    if lastUtrName is not None:
        readsCPM = len(reads) * 1000000.0 / librarySize
        readToConvertPercent = computeConversionRate(utrs[lastUtrName].score,
                                                     pulseTimePoint, chaseTimePoint,
                                                     labeledTranscripts)
        readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(
            utrs[lastUtrName], reads, readToConvertPercent, conversionRate, readOutSAM)
        printUtrSummary(utrs[lastUtrName], len(reads), readsWithTC, totalTCount,
                        totalTcCount, utrSummary, readsCPM, readToConvertPercent)

    readOutSAM.close()
    utrSummary.close()

    readOutTempBAM = readOutFile + "_tmp.bam"

    # Convert to BAM
    run("samtools view -Sb " + readOutTemp + " > " + readOutTempBAM)
    #samFile = pysam.AlignmentFile(readOutTemp, "r", check_header=False, check_sq=False)
    #bamFile = pysam.AlignmentFile(readOutTempBAM, "wb", template=samFile)
    #for read in samFile:
    #    bamFile.write(read)
    #bamFile.close()
    #samFile.close()

    # Sort reads by query name (doesn't matter for mapping, but makes evaluation easier)
    #pysam.sort("-o", readOutFile, readOutTempBAM)  # @UndefinedVariable
    run("samtools sort -o " + readOutFile + " " + readOutTempBAM)

    os.unlink(readOutTemp)
    os.unlink(readOutTempBAM)
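# ---------------------------------------------------------------------------
# Usage sketch for addTcConversions() (simulator step; all names and values
# are illustrative): it walks simulated reads grouped by UTR in a FASTA file,
# introduces T>C conversions at the given per-T rate, and emits a sorted BAM
# plus a per-UTR summary. The BED score column feeds computeConversionRate(),
# and sampleInfo must expose Name, ID, Type and Time (written to the header).
#
#   addTcConversions(
#       bed="utrs.bed",
#       readInFile="sample1_reads.fasta",   # simulated reads, grouped by UTR
#       readOutFile="sample1_reads.bam",
#       pulseTimePoint=60, chaseTimePoint=0,
#       utrSummaryFile="sample1_utrsummary.tsv",
#       conversionRate=0.024,               # per-T conversion probability
#       librarySize=1000000,
#       sampleInfo=sampleInfo)
# ---------------------------------------------------------------------------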
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False):

    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0

        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if (bed is None):
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)

            for read in infile:
                if (not read.is_secondary and not read.is_supplementary):
                    if (read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if (read.is_unmapped):
                    continue
                if (read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if (float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if (NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue

                if (not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)
            print("#Bed-file supplied. Running multimap retention filtering strategy on " + inputBAM + ".", file=log)
            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = \
                multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log)
            #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group
        # description; used for creating the summary file
        inFileBamHeader = outfile.header
        if ('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed is not None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)
            #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}"

        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if ('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
        #pysamFlagstat(outputBAM)
        #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
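# ---------------------------------------------------------------------------
# Usage sketch for Filter() (illustrative file names; thresholds shown are
# the parameter defaults): it filters a mapped BAM on mapping quality, on
# alignment identity (XI tag) and on mismatch count (NM tag), then sorts and
# indexes the result. Passing a BED file instead of bed=None switches to the
# multimap retention strategy implemented in multimapUTRRetainment().
#
#   with open("sample1_filter.log", "w") as log:
#       Filter("sample1_mapped.bam", "sample1_filtered.bam", log,
#              bed=None, MQ=2, minIdentity=0.8, NM=-1)
# ---------------------------------------------------------------------------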
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False, paired=False):

    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)

    if printOnly or checkStep([inputBAM], [outputBAM], force):

        (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
         nmFiltered, multimapper) = 0, 0, 0, 0, 0, 0, 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)

            if paired:
                read1 = None
                read2 = None

            for read in infile:
                if paired:
                    if not read.is_paired or read.mate_is_unmapped or read.is_duplicate:
                        unmappedReads += 1
                        continue
                    if read.is_read2:
                        read2 = read
                    else:
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    else:
                        mappedReads += 1

                if not paired:
                    if read.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read.get_tag("NM")):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read)
                else:
                    # Only evaluate complete, consistently named pairs; a pair
                    # is discarded only if both mates fail a criterion
                    if read1 is None or read2 is None:
                        continue
                    if read1.query_name != read2.query_name:
                        continue
                    if read1.mapping_quality < MQ and read2.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if (float(read1.get_tag("XI")) < minIdentity and
                            float(read2.get_tag("XI")) < minIdentity):
                        idFiltered += 1
                        continue
                    if (-1 < NM < int(read1.get_tag("NM")) and
                            -1 < NM < int(read2.get_tag("NM"))):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read1)
                    outfile.write(read2)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print("#Bed-file supplied. Running multimap retention filtering strategy on " + inputBAM + ".", file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered, multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                              minIdentity, NM, MQ, log)

        # Add number of sequenced and number of mapped reads to the read group
        # description; used for creating the summary file
        inFileBamHeader = outfile.header
        if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

        slamDunkPG = {
            "ID": "slamdunk",
            "PN": "slamdunk filter v" + __version__,
            "VN": __bam_version__
        }
        if "PG" in inFileBamHeader:
            inFileBamHeader["PG"].append(slamDunkPG)
        else:
            inFileBamHeader["PG"] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards; the paired flag is passed through to bamSort
        # (assumed to select the sort mode for pairs), and paired output
        # is left unindexed
        bamSort(outputBAM, log, inFileBamHeader, paired=paired, verbose=verbose)

        if not paired:
            pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
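# ---------------------------------------------------------------------------
# Usage sketch for the paired-end variant of Filter() (illustrative names):
# with paired=True, mates are evaluated together and a pair is discarded only
# when both mates fail a criterion; in this mode the output BAM is not indexed.
#
#   with open("sample1_filter.log", "w") as log:
#       Filter("sample1_mapped.bam", "sample1_filtered.bam", log,
#              bed=None, MQ=2, minIdentity=0.8, NM=-1, paired=True)
# ---------------------------------------------------------------------------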