def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log,
                             printOnly=False, verbose=True, force=False):
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Init
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25
        tcCount = [0] * 100

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minBaseQual)
            for read in readIterator:
                # Compute rates for current read
                rates = read.conversionRates

                # Get T -> C conversions for current read
                tc = read.tcCount
                tcCount[tc] += 1

                # Add rates from read to total rates
                if read.direction == ReadDirection.Reverse:
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")
        print("# slamdunk rates v" + __version__, file=fo)
        printRates(totalRatesFwd, totalRatesRev, fo)
        fo.close()

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " +
              removeExtension(os.path.basename(bam)) + " -O " + outputPDF,
              log, dry=printOnly, verbose=verbose)
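# For reference: sumLists (imported from slamdunk's utils module in the real
# code base) is assumed to element-wise sum two equal-length lists. A minimal
# sketch under that assumption:
def _sumListsSketch(a, b):
    # e.g. [1, 2, 3] and [4, 5, 6] -> [5, 7, 9]
    return [x + y for x, y in zip(a, b)]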
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping,
        threads=1, parameter="--no-progress --slam-seq 2", outputSuffix="_ngm_slamdunk",
        trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA",
        sampleTime=0, printOnly=False, verbose=True, force=False):
    if quantseqMapping:
        parameter = "--no-progress"
    if trim5p > 0:
        parameter = parameter + " -5 " + str(trim5p)
    if maxPolyA > -1:
        parameter = parameter + " --max-polya " + str(maxPolyA)
    if endtoendMapping:
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "
    if sampleId is not None:
        parameter = parameter + " --rg-id " + str(sampleId)
    if sampleName != "":
        parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)
    if topn > 1:
        parameter = parameter + " -n " + str(topn) + " --strata "

    if checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force):
        if outputSAM.endswith(".sam"):
            # Output SAM
            run(getBinary("ngm") + " -r " + inputReference + " -q " + inputBAM +
                " -t " + str(threads) + " " + parameter + " -o " + outputSAM,
                log, verbose=verbose, dry=printOnly)
        else:
            # Output BAM directly
            run(getBinary("ngm") + " -b -r " + inputReference + " -q " + inputBAM +
                " -t " + str(threads) + " " + parameter + " -o " + outputSAM,
                log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped mapping for " + inputBAM, file=log)
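# A dry-run sketch of what the BAM branch above assembles (placeholder file
# names; with printOnly=True the run() helper only logs the command):
#
#   Map("reads.bam", "genome.fa", "mapped.bam", sys.stderr,
#       quantseqMapping=False, endtoendMapping=False, threads=4, printOnly=True)
#
# which logs a command along the lines of:
#
#   ngm -b -r genome.fa -q reads.bam -t 4 --no-progress --slam-seq 2 -l -o mapped.bam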
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log,
                 printOnly=False, verbose=True, force=False):
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per read position for file " + bam, file=log)
    else:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        outputFile = SlamSeqWriter(outputCSV)

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome)
            for read in readIterator:
                outputFile.write(read)

        outputFile.close()
def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log,
         printOnly=False, verbose=True, force=False):
    files = [os.path.expanduser(p) for p in [inputBAM, referenceFile]]
    if checkStep(files, [os.path.expanduser(outputSNP)], force):
        fileSNP = open(outputSNP, "w")

        mpileupCmd = "samtools mpileup -B -A -f %s %s" % (referenceFile, inputBAM)
        if verbose:
            print(mpileupCmd, file=log)
        if not printOnly:
            mpileup = subprocess.Popen(mpileupCmd, shell=True,
                                       stdout=subprocess.PIPE, stderr=log)

        varscanCmd = "varscan mpileup2snp --strand-filter 0 --output-vcf " \
                     "--min-var-freq %s --min-coverage %s --variants 1" % (str(minVarFreq), str(minCov))
        if verbose:
            print(varscanCmd, file=log)
        if not printOnly:
            varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout,
                                       stdout=fileSNP, stderr=log)
            varscan.wait()

        fileSNP.close()
    else:
        print("Skipping SNP calling", file=log)
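# The SNP step above wires samtools and VarScan together through a pipe. A
# standalone sketch of the same Popen pattern (hypothetical file names; both
# tools assumed on PATH):
def _pileupPipeSketch():
    import subprocess
    mpileup = subprocess.Popen("samtools mpileup -B -A -f ref.fa sample.bam",
                               shell=True, stdout=subprocess.PIPE)
    with open("snp.vcf", "w") as out:
        varscan = subprocess.Popen("varscan mpileup2snp --output-vcf --variants 1",
                                   shell=True, stdin=mpileup.stdout, stdout=out)
        mpileup.stdout.close()  # let samtools receive SIGPIPE if varscan exits early
        varscan.wait()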
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping,
        threads=1, parameter="--no-progress --slam-seq 2", outputSuffix="_ngm_slamdunk",
        trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA",
        sampleTime=0, printOnly=False, verbose=True, force=False, isPaired=False):
    if quantseqMapping:
        parameter = "--no-progress"
    if trim5p > 0:
        parameter = parameter + " -5 " + str(trim5p)
    if maxPolyA > -1:
        parameter = parameter + " --max-polya " + str(maxPolyA)
    if endtoendMapping:
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "
    if sampleId is not None:
        parameter = parameter + " --rg-id " + str(sampleId)
    if sampleName != "":
        parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)
    if topn > 1:
        parameter = parameter + " -n " + str(topn) + " --strata "

    files = [inputReference]
    if not isPaired:
        files.append(inputBAM)
    else:
        # For paired-end data, inputBAM holds the two mate files
        files.extend(inputBAM)
    files = [os.path.expanduser(p) for p in files]

    if checkStep(files, [replaceExtension(outputSAM, ".bam")], force):
        cmd = "ngm %s -r %s %s -t %s %s -o %s" % (
            "" if outputSAM.endswith(".sam") else "-b",
            files[0],
            "-q %s" % files[1] if not isPaired else "-1 %s -2 %s" % (files[1], files[2]),
            threads, parameter, outputSAM)
        run(cmd, log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped mapping for " + (inputBAM if not isPaired else inputBAM[0]), file=log)
def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log,
         printOnly=False, verbose=True, force=False):
    if checkStep([inputBAM, referenceFile], [outputSNP], force):
        fileSNP = open(outputSNP, "w")

        mpileupCmd = getBinary("samtools") + " mpileup -B -A -f " + referenceFile + " " + inputBAM
        if verbose:
            print(mpileupCmd, file=log)
        if not printOnly:
            mpileup = subprocess.Popen(mpileupCmd, shell=True,
                                       stdout=subprocess.PIPE, stderr=log)

        varscanCmd = ("java -jar " + getBinary("VarScan.v2.4.1.jar") +
                      " mpileup2snp --strand-filter 0 --output-vcf --min-var-freq " +
                      str(minVarFreq) + " --min-coverage " + str(minCov) + " --variants 1")
        if verbose:
            print(varscanCmd, file=log)
        if not printOnly:
            varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout,
                                       stdout=fileSNP, stderr=log)
            varscan.wait()

        fileSNP.close()
    else:
        print("Skipping SNP calling", file=log)
def sort(inputSAM, outputBAM, log, threads=1, keepSam=True, dry=False, verbose=True):
    if files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"]):
        runSam2bam(inputSAM, outputBAM, log, False, False, not keepSam,
                   threads=threads, dry=dry, verbose=verbose)
    else:
        print("Skipped sorting for " + inputSAM, file=log)
def sort(inputSAM, outputBAM, log, threads=1, keepSam=True, dry=False, verbose=True,
         isPaired=False):
    if files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"]):
        runSam2bam(inputSAM, outputBAM, log, index=False,
                   sort="name" if isPaired else None,
                   delinFile=not keepSam, onlyProperPaired=True,
                   threads=threads, dry=dry, verbose=verbose)
    else:
        print("Skipped sorting for " + inputSAM, file=log)
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose=True, force=False):
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""

        duplicateBuffer = {}

        for read in samfile:
            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if read.has_tag("TC"):
                tcflag = read.get_tag("TC")
            else:
                tcflag = 0

            if tcflag >= tcMutations:
                if chr != prevChr or start != prevStart:
                    # New locus: flush all reads buffered for the previous one
                    if prevChr != "":
                        for curSeq in duplicateBuffer:
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                        outfile.write(readEntry)
                        duplicateBuffer.clear()

                if seq not in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if flag not in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                # Any later read with the same sequence and CIGAR at this locus
                # is marked as a duplicate
                if len(duplicateBuffer[seq][flag]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        # Flush the final locus
        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                        outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (",
              file=log, end="")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)), file=log, end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)
    else:
        print("Skipped deduplication for " + inputBAM, file=log)
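# The buffer above treats two reads as duplicates when they start at the same
# (reference, position) and share both sequence and CIGAR string. The same
# criterion, reduced to a key function (a sketch, not part of slamdunk's API):
def _dedupKeySketch(read):
    # reads agreeing on all four fields collapse to one retained copy
    return (read.reference_id, read.reference_start,
            read.query_sequence, read.cigarstring)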
def computeSNPMaskedRates(ref, bed, snpsFile, bam, maxReadLength, minQual, coverageCutoff,
                          variantFraction, outputCSV, outputPDF, strictTCs, log,
                          printOnly=False, verbose=True, force=False):
    if not checkStep([bam, ref], [outputCSV], force):
        print("Skipped computing T->C per UTR with SNP masking for file " + bam, file=log)
    else:
        fileCSV = open(outputCSV, "w")

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, ref, snps)

        progress = 0
        for utr in BedIterator(bed):
            if not utr.hasStrand():
                raise RuntimeError("Input BED file does not contain stranded intervals.")

            if utr.start < 0:
                raise RuntimeError("Negative start coordinate found. Please check the "
                                   "following entry in your BED file: " + str(utr))

            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop,
                                                 utr.strand, maxReadLength, minQual)

            unmaskedTCCount = 0
            maskedTCCount = 0
            readCount = 0

            for read in readIterator:
                # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                if not read.isTcRead and strictTCs:
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isTC = False
                isTrueTC = False
                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)
                            and 0 <= mismatch.referencePosition < utr.getLength()):
                        isTrueTC = True

                    unmasked = False
                    if (read.direction == ReadDirection.Reverse
                            and mismatch.referenceBase == "A" and mismatch.readBase == "G"):
                        unmasked = True
                    elif (read.direction != ReadDirection.Reverse
                          and mismatch.referenceBase == "T" and mismatch.readBase == "C"):
                        unmasked = True

                    if unmasked and 0 <= mismatch.referencePosition < utr.getLength():
                        isTC = True

                readCount += 1

                if isTC:
                    unmaskedTCCount += 1
                if isTrueTC:
                    maskedTCCount += 1

            containsSNP = 0
            if unmaskedTCCount != maskedTCCount:
                containsSNP = 1

            print(utr.name + "\t" + str(readCount) + "\t" + str(unmaskedTCCount) + "\t" +
                  str(maskedTCCount) + "\t" + str(containsSNP), file=fileCSV)

            progress += 1

        fileCSV.close()

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
    else:
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " + str(coverageCutoff) +
              " -v " + str(variantFraction) + " -o " + outputPDF,
              log, dry=printOnly, verbose=verbose)
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False, paired=False):
    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)

    if printOnly or checkStep([inputBAM], [outputBAM], force):
        (mappedReads, unmappedReads, filteredReads,
         mqFiltered, idFiltered, nmFiltered, multimapper) = 0, 0, 0, 0, 0, 0, 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)

            if paired:
                read1 = None
                read2 = None

            for read in infile:
                if paired:
                    if not read.is_paired or read.mate_is_unmapped or read.is_duplicate:
                        unmappedReads += 1
                        continue
                    if read.is_read2:
                        read2 = read
                    else:
                        # First mate: buffer it and wait for its partner
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    else:
                        mappedReads += 1

                    if not paired:
                        if read.mapping_quality < MQ:
                            mqFiltered += 1
                            continue
                        if float(read.get_tag("XI")) < minIdentity:
                            idFiltered += 1
                            continue
                        if -1 < NM < int(read.get_tag("NM")):
                            nmFiltered += 1
                            continue
                        filteredReads += 1
                        outfile.write(read)
                    else:
                        if read1 is None or read2 is None:
                            continue
                        if read1.query_name != read2.query_name:
                            continue
                        if read1.mapping_quality < MQ and read2.mapping_quality < MQ:
                            mqFiltered += 1
                            continue
                        if (float(read1.get_tag("XI")) < minIdentity
                                and float(read2.get_tag("XI")) < minIdentity):
                            idFiltered += 1
                            continue
                        if (-1 < NM < int(read1.get_tag("NM"))
                                and -1 < NM < int(read2.get_tag("NM"))):
                            nmFiltered += 1
                            continue
                        filteredReads += 1
                        outfile.write(read1)
                        outfile.write(read2)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print("#Bed-file supplied. Running multimap retention filtering strategy on "
                  + inputBAM + ".", file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered, multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                              minIdentity, NM, MQ, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

        slamDunkPG = {"ID": "slamdunk",
                      "PN": "slamdunk filter v" + __version__,
                      "VN": __bam_version__}
        if "PG" in inFileBamHeader:
            inFileBamHeader["PG"].append(slamDunkPG)
        else:
            inFileBamHeader["PG"] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, paired=False, verbose=verbose)

        if not paired:
            pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
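# The single-end branch above drops a read on the first failed criterion. The
# same checks as one predicate (a sketch; XI is NextGenMap's identity tag and
# NM the edit distance, both assumed present on the reads):
def _passesFilterSketch(read, MQ=2, minIdentity=0.8, NM=-1):
    if read.mapping_quality < MQ:
        return False  # mapping quality too low
    if float(read.get_tag("XI")) < minIdentity:
        return False  # alignment identity too low
    if -1 < NM < int(read.get_tag("NM")):
        return False  # too many mismatches/edits
    return True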
def tcPerUtr(referenceFile, utrBed, bam, minQual, maxReadLength, outputCSV, outputPDF,
             snpsFile, log, printOnly=False, verbose=True, force=False):
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per UTR position for file " + bam, file=log)
    else:
        counter = 0

        totalUtrCountFwd = [0] * utrNormFactor
        totalUtrCountRev = [0] * utrNormFactor

        tcPerPosRev = [0] * utrNormFactor
        tcPerPosFwd = [0] * utrNormFactor

        allPerPosRev = [0] * utrNormFactor
        allPerPosFwd = [0] * utrNormFactor

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one utr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for utr in BedIterator(utrBed):
            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop,
                                                 utr.strand, maxReadLength, minQual)

            tcForwardCounts = [0] * utrNormFactor
            mutForwardCounts = [0] * utrNormFactor
            tcReverseCounts = [0] * utrNormFactor
            mutReverseCounts = [0] * utrNormFactor

            for read in readIterator:
                tcCounts = [0] * utrNormFactor
                mutCounts = [0] * utrNormFactor

                for mismatch in read.mismatches:
                    mismatchPos = mismatch.referencePosition

                    if utr.strand == "+":
                        # Only count the last utrNormFactor bases of the UTR
                        if (utr.getLength() - utrNormFactor) <= mismatchPos < utr.getLength():
                            mismatchPos = utrNormFactor - (utr.getLength() - mismatchPos)

                            if mismatch.isTCMismatch(read.direction == ReadDirection.Reverse):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1
                    else:
                        if 0 <= mismatchPos < min(utr.getLength(), utrNormFactor):
                            if mismatch.isTCMismatch(read.direction == ReadDirection.Reverse):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1

                if read.direction == ReadDirection.Reverse:
                    tcReverseCounts = sumLists(tcReverseCounts, tcCounts)
                    mutReverseCounts = sumLists(mutReverseCounts, mutCounts)

                    start = max(0, min(min(utr.getLength(), utrNormFactor), read.startRefPos))
                    end = max(0, min(min(utr.getLength(), utrNormFactor), read.endRefPos))
                    for i in range(start, end):
                        totalUtrCountRev[i] += 1
                else:
                    tcForwardCounts = sumLists(tcForwardCounts, tcCounts)
                    mutForwardCounts = sumLists(mutForwardCounts, mutCounts)

                    start = min(utr.getLength(),
                                max(utr.getLength() - utrNormFactor, read.startRefPos))
                    end = min(utr.getLength(),
                              max(utr.getLength() - utrNormFactor, read.endRefPos))
                    for i in range(start, end):
                        normPos = utrNormFactor - (utr.getLength() - i)
                        totalUtrCountFwd[normPos] += 1

            tcPerPosFwd = sumLists(tcPerPosFwd, tcForwardCounts)
            allPerPosFwd = sumLists(allPerPosFwd, mutForwardCounts)

            tcPerPosRev = sumLists(tcPerPosRev, tcReverseCounts)
            allPerPosRev = sumLists(allPerPosRev, mutReverseCounts)

            counter += 1
            if verbose and counter % 10000 == 0:
                print("Handled " + str(counter) + " UTRs.", file=log)

        foTC = open(outputCSV, "w")
        print("# slamdunk tcperutr v" + __version__, file=foTC)

        reverseAllPerPosRev = allPerPosRev[::-1]
        reverseTcPerPosRev = tcPerPosRev[::-1]
        reverseTotalUtrCountRev = totalUtrCountRev[::-1]

        for i in range(0, utrNormFactor):
            print(allPerPosFwd[i], reverseAllPerPosRev[i], tcPerPosFwd[i],
                  reverseTcPerPosRev[i], totalUtrCountFwd[i], reverseTotalUtrCountRev[i],
                  sep='\t', file=foTC)
        foTC.close()

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -u -i " + outputCSV +
              " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
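# Worked example of the '+'-strand position normalization used above: only the
# last utrNormFactor bases of a UTR are binned, so with utrNormFactor = 200
# and a 500 bp UTR, a mismatch at UTR position 450 lands in bin
# 200 - (500 - 450) = 150 (a sketch of the same arithmetic):
def _normalizeUtrPosSketch(mismatchPos, utrLength, normFactor):
    return normFactor - (utrLength - mismatchPos)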
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF,
                 snpsFile, log, printOnly=False, verbose=True, force=False):
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per read position for file " + bam, file=log)
    else:
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minQual)

            for read in readIterator:
                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(read.direction == ReadDirection.Reverse):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if read.direction == ReadDirection.Reverse:
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        foTC = open(outputCSV, "w")
        print("# slamdunk tcperreadpos v" + __version__, file=foTC)
        for i in range(0, maxReadLength):
            print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i],
                  totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)
        foTC.close()

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per read position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV +
              " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
def statsComputeOverallRatesPerUTR(referenceFile, bam, minBaseQual, strictTCs, outputCSV,
                                   outputPDF, utrBed, maxReadLength, log,
                                   printOnly=False, verbose=True, force=False):
    sampleInfo = getSampleInfo(bam)
    slamseqInfo = SlamSeqInfo(bam)

    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        # UTR stats for MultiQC
        utrStats = dict()
        plotConversions = ['A>T', 'A>G', 'A>C',
                           'C>A', 'C>G', 'C>T',
                           'G>A', 'G>C', 'G>T',
                           'T>A', 'T>G', 'T>C']
        for conversion in plotConversions:
            utrStats[conversion] = list()

        # Text mode so print() can write the per-UTR rows
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)

        for utr in BedIterator(utrBed):
            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop,
                                                 utr.strand, maxReadLength, minBaseQual)

            # Init
            totalRates = [0] * 25
            readCount = 0
            for read in readIterator:
                # In strict mode, skip reads that carry T>C mismatches but were
                # not classified as TC reads
                if not read.isTcRead and strictTCs and read.tcCount > 0:
                    pass
                else:
                    # Compute rates for current read and add them to the total rates
                    rates = read.conversionRates
                    totalRates = sumLists(totalRates, rates)
                    readCount += 1

            print(utr.name, utr.chromosome, utr.start, utr.stop, utr.strand, readCount,
                  "\t".join(str(x) for x in totalRates), sep="\t", file=f)

            # Process rates for MultiQC: totalRates is a flattened 5x5 matrix
            # over the base order A, C, G, T, N (index = 5 * from + to)
            A_A, A_C, A_G, A_T = totalRates[0], totalRates[1], totalRates[2], totalRates[3]
            C_A, C_C, C_G, C_T = totalRates[5], totalRates[6], totalRates[7], totalRates[8]
            G_A, G_C, G_G, G_T = totalRates[10], totalRates[11], totalRates[12], totalRates[13]
            T_A, T_C, T_G, T_T = totalRates[15], totalRates[16], totalRates[17], totalRates[18]
            conversionSum = (A_A + A_C + A_G + A_T + C_A + C_C + C_G + C_T +
                             G_A + G_C + G_G + G_T + T_A + T_C + T_G + T_T)

            # For minus-strand UTRs, report rates relative to the sense strand
            if utr.strand == "-":
                A_A, T_T = T_T, A_A
                G_G, C_C = C_C, G_G
                A_C, T_G = T_G, A_C
                A_G, T_C = T_C, A_G
                A_T, T_A = T_A, A_T
                C_A, G_T = G_T, C_A
                C_G, G_C = G_C, C_G
                C_T, G_A = G_A, C_T

            if conversionSum > 0:
                Asum = A_A + A_C + A_G + A_T
                Csum = C_A + C_C + C_G + C_T
                Gsum = G_A + G_C + G_G + G_T
                Tsum = T_A + T_C + T_G + T_T

                if Asum > 0:
                    A_T = A_T / float(Asum) * 100
                    A_G = A_G / float(Asum) * 100
                    A_C = A_C / float(Asum) * 100
                else:
                    A_T = 0
                    A_G = 0
                    A_C = 0
                if Csum > 0:
                    C_A = C_A / float(Csum) * 100
                    C_G = C_G / float(Csum) * 100
                    C_T = C_T / float(Csum) * 100
                else:
                    C_A = 0
                    C_G = 0
                    C_T = 0
                if Gsum > 0:
                    G_A = G_A / float(Gsum) * 100
                    G_C = G_C / float(Gsum) * 100
                    G_T = G_T / float(Gsum) * 100
                else:
                    G_A = 0
                    G_C = 0
                    G_T = 0
                if Tsum > 0:
                    T_A = T_A / float(Tsum) * 100
                    T_G = T_G / float(Tsum) * 100
                    T_C = T_C / float(Tsum) * 100
                else:
                    T_A = 0
                    T_G = 0
                    T_C = 0

                utrStats['A>T'].append(A_T)
                utrStats['A>G'].append(A_G)
                utrStats['A>C'].append(A_C)
                utrStats['C>A'].append(C_A)
                utrStats['C>G'].append(C_G)
                utrStats['C>T'].append(C_T)
                utrStats['G>A'].append(G_A)
                utrStats['G>T'].append(G_T)
                utrStats['G>C'].append(G_C)
                utrStats['T>A'].append(T_A)
                utrStats['T>G'].append(T_G)
                utrStats['T>C'].append(T_C)

        f.close()

        fo = open(outputCSV, "w")
        print("# slamdunk utrrates v" + __version__, file=fo)

        print("# Median-Conversions=", end="", file=fo)
        first = True
        for conversion in plotConversions:
            if not first:
                print(',', file=fo, end="")
            else:
                first = False
            print(conversion + ":" + str(np.median(utrStats[conversion])), file=fo, end="")
        print(file=fo)

        print("Name", "Chr", "Start", "End", "Strand", "ReadCount", sep="\t", end="\t", file=fo)
        for i in range(0, 5):
            for j in range(0, 5):
                print(toBase[i].upper() + "_" + toBase[j].upper(), end="", file=fo)
                if i != 4 or j != 4:
                    print("\t", end="", file=fo)
        print(file=fo)

        with open(f.name, "r") as valueFile:
            fo.write(valueFile.read())
        fo.close()

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing global rate pdfs for file " + bam, file=log)
    else:
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(sampleInfo.Name, outputCSV, sep='\t', file=f)
        f.close()

        callR(getPlotter("globalRatePlotter") + " -f " + f.name + " -O " + outputPDF,
              log, dry=printOnly, verbose=verbose)
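# The 25-entry totalRates vector above is a flattened 5x5 matrix over the base
# order A, C, G, T, N: index = 5 * fromBase + toBase. T>C therefore sits at
# 5 * 3 + 1 = 16, matching totalRates[16] above (a sketch of the assumed layout):
_RATE_BASES = "ACGTN"

def _rateIndexSketch(fromBase, toBase):
    return 5 * _RATE_BASES.index(fromBase) + _RATE_BASES.index(toBase)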
def statsComputeTCContext(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log,
                          printOnly=False, verbose=True, force=False):
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C context for file " + bam, file=log)
    else:
        # Init
        frontCombinations = ["AT", "CT", "GT", "TT", "NT"]
        backCombinations = ["TA", "TC", "TG", "TT", "TN"]

        counts = {
            '5prime': {'fwd': {}, 'rev': {}},
            '3prime': {'fwd': {}, 'rev': {}},
        }

        for combination in frontCombinations:
            counts['5prime']['fwd'][combination] = 0
            counts['5prime']['rev'][combination] = 0
        for combination in backCombinations:
            counts['3prime']['fwd'][combination] = 0
            counts['3prime']['rev'][combination] = 0

        bamFile = pysam.AlignmentFile(bam, "rb")

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            for read in bamFile.fetch(region=chromosome):
                i = 0
                while i < len(read.query_sequence):
                    if read.query_sequence[i] == "T" and not read.is_reverse:
                        frontContext = None
                        backContext = None
                        if i > 0:
                            frontContext = read.query_sequence[i - 1]
                        if i < (len(read.query_sequence) - 1):
                            backContext = read.query_sequence[i + 1]

                        if frontContext is not None:
                            counts['5prime']['fwd'][frontContext + "T"] += 1
                        if backContext is not None:
                            counts['3prime']['fwd']["T" + backContext] += 1

                    if read.query_sequence[i] == "A" and read.is_reverse:
                        frontContext = None
                        backContext = None
                        if i > 0:
                            backContext = read.query_sequence[i - 1]
                        if i < (len(read.query_sequence) - 1):
                            frontContext = read.query_sequence[i + 1]

                        if frontContext is not None:
                            counts['5prime']['rev'][complement(frontContext + "A")] += 1
                        if backContext is not None:
                            counts['3prime']['rev'][complement("A" + backContext)] += 1

                    i += 1

        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")

        print("\t".join(frontCombinations), file=fo)
        frontFwdLine = ""
        frontRevLine = ""
        backFwdLine = ""
        backRevLine = ""
        for combination in frontCombinations:
            frontFwdLine += str(counts['5prime']['fwd'][combination]) + "\t"
            frontRevLine += str(counts['5prime']['rev'][combination]) + "\t"
        print(frontFwdLine.rstrip(), file=fo)
        print(frontRevLine.rstrip(), file=fo)

        print("\t".join(backCombinations), file=fo)
        for combination in backCombinations:
            backFwdLine += str(counts['3prime']['fwd'][combination]) + "\t"
            backRevLine += str(counts['3prime']['rev'][combination]) + "\t"
        print(backFwdLine.rstrip(), file=fo)
        print(backRevLine.rstrip(), file=fo)

        fo.close()

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing T->C context pdfs for file " + bam, file=log)
    else:
        # Text mode so print() can write the sample sheet for the plotter
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(removeExtension(os.path.basename(bam)), outputCSV, sep='\t', file=f)
        f.close()

        callR(getPlotter("compute_context_TC_rates") + " -f " + f.name + " -O " + outputPDF,
              log, dry=printOnly, verbose=verbose)
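# complement() above comes from slamdunk's utils; from its use on two-base
# contexts it is assumed to be a position-wise DNA complement without
# reversal, e.g. "GA" -> "CT". A minimal sketch under that assumption:
def _complementSketch(seq):
    return seq.translate(str.maketrans("ACGTN", "TGCAN"))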
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False):
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0
        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)
            for read in infile:
                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if read.is_unmapped:
                    continue
                if read.mapping_quality < MQ:
                    mqFiltered += 1
                    continue
                if float(read.get_tag("XI")) < minIdentity:
                    idFiltered += 1
                    continue
                if NM > -1 and int(read.get_tag("NM")) > NM:
                    nmFiltered += 1
                    continue

                if not read.is_secondary and not read.is_supplementary:
                    filteredReads += 1
                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)
            print("#Bed-file supplied. Running multimap retention filtering strategy on "
                  + inputBAM + ".", file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered, multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                              minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if 'RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed is not None:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

        slamDunkPG = {'ID': 'slamdunk',
                      'PN': 'slamdunk filter v' + __version__,
                      'VN': __bam_version__}
        if 'PG' in inFileBamHeader:
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)