def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Sum the per-read conversion rates of a BAM file and plot them.

    Step 1 (skipped when ``checkStep`` reports ``outputCSV`` as up to date):
    iterate over every read of every chromosome, add ``read.conversionRates``
    to a forward-strand or reverse-strand accumulator depending on
    ``read.direction``, and write both accumulators to ``outputCSV`` through
    ``printRates``.

    Step 2 (skipped when ``checkStep`` reports ``outputPDF`` as up to date):
    run the "compute_overall_rates" plotter on the CSV via ``callR``.

    Parameters:
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bam: path of the BAM file to analyse.
        minBaseQual: forwarded to ``readsInChromosome``.
        outputCSV: path of the rates table to write.
        outputPDF: path handed to the plotter with ``-O``.
        log: file-like object receiving "Skipped ..." messages and passed
            on to ``callR``.
        printOnly: forwarded to ``callR`` as ``dry``.
        verbose: forwarded to ``callR``.
        force: forwarded to ``checkStep``.
    """
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # 25-slot accumulators, one per strand.
        # NOTE(review): presumably a flattened 5x5 base-to-base matrix
        # matching the length of read.conversionRates -- confirm.
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                rates = read.conversionRates

                # Add rates from read to the accumulator of its strand
                if(read.direction == ReadDirection.Reverse):
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # Print rates in correct format for plotting; the context manager
        # closes the file even when printRates raises.
        with open(outputCSV, "w") as fo:
            print("# slamdunk rates v" + __version__, file=fo)
            printRates(totalRatesFwd, totalRatesRev, fo)

    if(not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        sampleName = removeExtension(os.path.basename(bam))
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " + sampleName + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare mapped reads against the information encoded in their names.

    Each read name is split on "_": field 0 is taken as the expected
    interval label and field 2 (parsed as int) as the expected T>C count.
    The read is looked up in the interval tree built from ``bed``; the
    ``data`` of the first overlapping interval is compared with the expected
    label and ``read.tcCount`` with the expected count.

    One tab-separated line per read is written to ``outputFile``. A summary
    (percent fully correct, percent right position but wrong T>C count,
    percent wrong position, total reads) is printed to stdout.

    NOTE(review): this module defines ``evaluateReads`` a second time
    further down; that later definition replaces this one at import time.

    Parameters:
        bam: path of the BAM file to evaluate.
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bed: input handed to ``bedToIntervallTree``.
        outputFile: path of the per-read table to write.
        mainOutput: unused; kept for interface compatibility.

    Raises:
        IndexError, ValueError: if a read name has fewer than three
            "_"-separated fields or field 2 is not an integer.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    # The context manager guarantees the table is flushed and closed,
    # also when a malformed read name aborts the loop.
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr", "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim, utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    if total > 0:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total, wrongPos * 100.0 / total, total)
    else:
        # No reads at all: report zeros instead of dividing by zero.
        print(0.0, 0.0, 0.0, total)
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Write every read of a BAM file to ``outputCSV`` via ``SlamSeqWriter``.

    The step is skipped (with a message on ``log``) when ``checkStep``
    reports ``outputCSV`` as up to date with respect to ``bam`` and
    ``referenceFile``.

    Parameters:
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bam: path of the BAM file to dump.
        minQual: currently unused (see the note in the body).
        outputCSV: path handed to ``SlamSeqWriter``.
        snpsFile: input for ``SNPtools.SNPDictionary``; the loaded
            dictionary is passed to ``SlamSeqBamFile``.
        log: file-like object receiving the "Skipped ..." message.
        printOnly: unused; kept for interface compatibility.
        verbose: unused; kept for interface compatibility.
        force: forwarded to ``checkStep``.
    """
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped dumping read info for file " + bam, file=log)
    else:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        outputFile = SlamSeqWriter(outputCSV)
        try:
            # Go through one chr after the other
            testFile = SlamSeqBamFile(bam, referenceFile, snps)

            for chromosome in testFile.getChromosomes():
                # NOTE(review): minQual is not forwarded to readsInChromosome
                # here, unlike the other statistics functions in this module.
                # Confirm whether that is intentional before changing it.
                for read in testFile.readsInChromosome(chromosome):
                    outputFile.write(read)
        finally:
            # Close the writer even when reading the BAM file fails.
            outputFile.close()
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Count mismatches per read position, split by strand, and plot them.

    Step 1 (skipped when ``checkStep`` reports ``outputCSV`` as up to date):
    for every read, each entry of ``read.mismatches`` is counted at its
    ``readPosition`` either as a T>C conversion (``isTCMismatch`` is true)
    or as another mismatch. The number of reads covering each read position
    is counted as well. The table written to ``outputCSV`` has one row per
    read position with the tab-separated columns: other mismatches fwd,
    other mismatches rev, T>C fwd, T>C rev, read count fwd, read count rev.

    Step 2 (skipped when ``checkStep`` reports ``outputPDF`` as up to date
    with respect to ``outputCSV``): run the "conversion_per_read_position"
    plotter via ``callR``.

    Parameters:
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bam: path of the BAM file to analyse.
        minQual: forwarded to ``readsInChromosome``.
        maxReadLength: number of read positions tracked (rows of the table).
        outputCSV: path of the table to write.
        outputPDF: path handed to the plotter with ``-o``.
        snpsFile: input for ``SNPtools.SNPDictionary``.
        log: file-like object receiving "Skipped ..." messages and passed
            on to ``callR``.
        printOnly: forwarded to ``callR`` as ``dry``.
        verbose: forwarded to ``callR``.
        force: forwarded to ``checkStep``.

    Raises:
        IndexError: if a read is longer than ``maxReadLength`` or a
            mismatch position lies at or beyond ``maxReadLength``.
    """
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):
                # The strand is constant for the read; evaluate it once.
                isReverse = read.direction == ReadDirection.Reverse

                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if(mismatch.isTCMismatch(isReverse)):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if(isReverse):
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        # The context manager closes the table even if writing fails.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)
            for i in range(0, maxReadLength):
                print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i], totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)

    if(not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
def statsComputeTCContext(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Count the bases directly next to each T in the reads and plot them.

    Step 1 (skipped when ``checkStep`` reports ``outputCSV`` as up to date):
    for reads not flagged reverse, every "T" in ``query_sequence``
    contributes its preceding base to the 5' table and its following base
    to the 3' table. For reverse reads every "A" is used instead, with
    preceding/following swapped and the dinucleotide passed through
    ``complement`` before counting. ``outputCSV`` then receives six
    tab-separated lines: the 5' header, 5' forward counts, 5' reverse
    counts, the 3' header, 3' forward counts, 3' reverse counts.

    Step 2 (skipped when ``checkStep`` reports ``outputPDF`` as up to date):
    write a one-line sample sheet (sample name, CSV path) to a temporary
    file and run the "compute_context_TC_rates" plotter on it via ``callR``.
    The temporary file is not removed by this function.

    Parameters:
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bam: path of the BAM file to analyse.
        minBaseQual: unused; kept for interface compatibility.
        outputCSV: path of the context table to write.
        outputPDF: path handed to the plotter with ``-O``.
        log: file-like object receiving "Skipped ..." messages and passed
            on to ``callR``.
        printOnly: forwarded to ``callR`` as ``dry``.
        verbose: forwarded to ``callR``.
        force: forwarded to ``checkStep``.

    Raises:
        KeyError: if a neighbouring base yields a dinucleotide that is not
            one of the pre-registered combinations.
    """
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C context for file " + bam, file=log)
    else:
        # Init
        frontCombinations = ["AT", "CT", "GT", "TT", "NT"]
        backCombinations = ["TA", "TC", "TG", "TT", "TN"]

        counts = {
            '5prime': {
                'fwd': dict((combination, 0) for combination in frontCombinations),
                'rev': dict((combination, 0) for combination in frontCombinations),
            },
            '3prime': {
                'fwd': dict((combination, 0) for combination in backCombinations),
                'rev': dict((combination, 0) for combination in backCombinations),
            },
        }

        bamFile = pysam.AlignmentFile(bam, "rb")
        try:
            # Go through one chr after the other
            testFile = SlamSeqBamFile(bam, referenceFile, None)

            for chromosome in testFile.getChromosomes():
                for read in bamFile.fetch(region=chromosome):
                    # Read the property once per read instead of on every
                    # loop step; pysam reports None when no sequence is
                    # stored for the alignment.
                    sequence = read.query_sequence
                    if sequence is None:
                        continue

                    isReverse = read.is_reverse
                    lastIndex = len(sequence) - 1

                    for i, base in enumerate(sequence):
                        if base == "T" and not isReverse:
                            if i > 0:
                                counts['5prime']['fwd'][sequence[i - 1] + "T"] += 1
                            if i < lastIndex:
                                counts['3prime']['fwd']["T" + sequence[i + 1]] += 1
                        elif base == "A" and isReverse:
                            # On the reverse strand the following base is
                            # the 5' neighbour and the preceding one the 3'.
                            if i < lastIndex:
                                counts['5prime']['rev'][complement(sequence[i + 1] + "A")] += 1
                            if i > 0:
                                counts['3prime']['rev'][complement("A" + sequence[i - 1])] += 1
        finally:
            bamFile.close()

        # Print rates in correct format for plotting
        with open(outputCSV, "w") as fo:
            print("\t".join(frontCombinations), file=fo)
            print("\t".join(str(counts['5prime']['fwd'][combination]) for combination in frontCombinations), file=fo)
            print("\t".join(str(counts['5prime']['rev'][combination]) for combination in frontCombinations), file=fo)

            print("\t".join(backCombinations), file=fo)
            print("\t".join(str(counts['3prime']['fwd'][combination]) for combination in backCombinations), file=fo)
            print("\t".join(str(counts['3prime']['rev'][combination]) for combination in backCombinations), file=fo)

    if(not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing T->C context pdfs for file " + bam, file=log)
    else:
        # Text mode is required: the default "w+b" rejects str on Python 3.
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        try:
            print(removeExtension(os.path.basename(bam)), outputCSV, sep='\t', file=f)
        finally:
            f.close()

        callR(getPlotter("compute_context_TC_rates") + " -f " + f.name + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual, outputBedGraphPrefix, conversionThreshold, coverageCutoff, log):
    """Write genome-wide coverage and conversion tracks as bedGraph files.

    For every chromosome, per-position arrays are filled from the reads:
    strand-specific coverage (each position in
    ``[startRefPos, endRefPos)``) and strand-specific conversion counts
    (each mismatch for which ``isTCMismatch`` is true). Reads whose
    ``isTcRead`` is false have their mismatches cleared first, so they add
    coverage but no conversions.

    Eight files named ``outputBedGraphPrefix + <suffix>`` are written:
    T>C / A>G rates, plus / minus coverage, T>C / A>G conversion counts,
    and plus-strand coverage on reference T / minus-strand coverage on
    reference A. Each track is run-length encoded: a line is emitted only
    when the value at the current position differs from the running value.

    Parameters:
        referenceFile: FASTA opened with ``pysam.FastaFile`` and also
            passed to ``SlamSeqBamFile``.
        snpsFile: input for ``SNPtools.SNPDictionary``.
        bam: path of the BAM file to analyse.
        minBaseQual: forwarded to ``readsInChromosome``.
        outputBedGraphPrefix: common prefix of the eight output paths; its
            basename, cut at "_slamdunk_mapped", is used as track name.
        conversionThreshold: forwarded to ``readsInChromosome``.
        coverageCutoff: minimum strand coverage for a position to get a
            non-zero rate.
        log: unused; kept for interface compatibility.
    """
    ref = pysam.FastaFile(referenceFile)
    openedTracks = []

    def openTrack(suffix):
        # Remember every handle so the finally-clause can close it.
        handle = open(outputBedGraphPrefix + suffix, 'w')
        openedTracks.append(handle)
        return handle

    try:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        chromosomes = testFile.getChromosomes()

        bedGraphInfo = re.sub("_slamdunk_mapped.*", "", basename(outputBedGraphPrefix))
        print(bedGraphInfo)

        fileBedGraphRatesPlus = openTrack("_TC_rates_genomewide.bedGraph")
        fileBedGraphRatesMinus = openTrack("_AG_rates_genomewide.bedGraph")
        fileBedGraphCoveragePlus = openTrack("_coverage_plus_genomewide.bedGraph")
        fileBedGraphCoverageMinus = openTrack("_coverage_minus_genomewide.bedGraph")
        fileBedGraphTCConversions = openTrack("_TC_conversions_genomewide.bedGraph")
        fileBedGraphAGConversions = openTrack("_AG_conversions_genomewide.bedGraph")
        fileBedGraphT = openTrack("_coverage_T_genomewide.bedGraph")
        fileBedGraphA = openTrack("_coverage_A_genomewide.bedGraph")

        print("track type=bedGraph name=\"" + bedGraphInfo + " tc-conversions\" description=\"# T->C conversions / # reads on T per position genome-wide\"", file=fileBedGraphRatesPlus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " ag-conversions\" description=\"# A->G conversions / # reads on A per position genome-wide\"", file=fileBedGraphRatesMinus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " plus-strand coverage\" description=\"# Reads on plus strand genome-wide\"", file=fileBedGraphCoveragePlus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " minus-strand coverage\" description=\"# Reads on minus strand genome-wide\"", file=fileBedGraphCoverageMinus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " T->C conversions\" description=\"# T->C conversions on plus strand genome-wide\"", file=fileBedGraphTCConversions)
        print("track type=bedGraph name=\"" + bedGraphInfo + " A->G conversions\" description=\"# A->G conversions on minus strand genome-wide\"", file=fileBedGraphAGConversions)
        print("track type=bedGraph name=\"" + bedGraphInfo + " T-coverage\" description=\"# Plus-strand reads on Ts genome-wide\"", file=fileBedGraphT)
        print("track type=bedGraph name=\"" + bedGraphInfo + " A-coverage\" description=\"# Minus-strand reads on As genome-wide\"", file=fileBedGraphA)

        for chromosome in chromosomes:

            chrLength = testFile.getChromosomeLength(chromosome)
            tcCount = [0] * chrLength
            agCount = [0] * chrLength
            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            readIterator = testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold)

            for read in readIterator:
                if (not read.isTcRead):
                    # Not a T>C read: it still counts as coverage but
                    # contributes no conversions.
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse
                conversions = agCount if isReverse else tcCount
                coverage = coverageMinus if isReverse else coveragePlus

                for mismatch in read.mismatches:
                    if(mismatch.isTCMismatch(isReverse) and mismatch.referencePosition >= 0 and mismatch.referencePosition < chrLength):
                        conversions[mismatch.referencePosition] += 1

                # Clamp to the chromosome instead of testing every base.
                for i in range(max(read.startRefPos, 0), min(read.endRefPos, chrLength)):
                    coverage[i] += 1

            # Running value and start position of the current run, per
            # track; reset for every chromosome.
            prevCoveragePlus = 0
            prevCoveragePlusPos = 0
            prevCoverageMinus = 0
            prevCoverageMinusPos = 0
            prevTCConversionRate = 0
            prevTCConversionRatePos = 0
            prevAGConversionRate = 0
            prevAGConversionRatePos = 0
            prevTCConversions = 0
            prevTCConversionPos = 0
            prevAGConversions = 0
            prevAGConversionPos = 0
            prevTCoverage = 0
            prevTCoveragePos = 0
            prevACoverage = 0
            prevACoveragePos = 0

            # NOTE(review): a run that is still open at the last position
            # of the chromosome is never written -- confirm this is intended.
            for pos in range(0, chrLength):

                if prevCoveragePlus != coveragePlus[pos]:
                    print(chromosome + "\t" + str(prevCoveragePlusPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevCoveragePlus), file=fileBedGraphCoveragePlus)
                    prevCoveragePlus = coveragePlus[pos]
                    prevCoveragePlusPos = pos

                if prevCoverageMinus != coverageMinus[pos]:
                    print(chromosome + "\t" + str(prevCoverageMinusPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevCoverageMinus), file=fileBedGraphCoverageMinus)
                    prevCoverageMinus = coverageMinus[pos]
                    prevCoverageMinusPos = pos

                tCoverage = 0
                aCoverage = 0
                if coveragePlus[pos] > 0 or coverageMinus[pos] > 0:
                    # One reference lookup serves both strands.
                    # NOTE(review): the base is fetched at pos + 1 while the
                    # arrays are indexed at pos -- confirm the coordinate
                    # convention against pysam's 0-based fetch.
                    base = ref.fetch(reference=chromosome, start=pos + 1, end=pos + 2).upper()
                    if base == "T":
                        tCoverage = coveragePlus[pos]
                    elif base == "A":
                        aCoverage = coverageMinus[pos]

                if prevTCoverage != tCoverage:
                    print(chromosome + "\t" + str(prevTCoveragePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCoverage), file=fileBedGraphT)
                    prevTCoverage = tCoverage
                    prevTCoveragePos = pos

                if prevACoverage != aCoverage:
                    print(chromosome + "\t" + str(prevACoveragePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevACoverage), file=fileBedGraphA)
                    prevACoverage = aCoverage
                    prevACoveragePos = pos

                if prevTCConversions != tcCount[pos]:
                    print(chromosome + "\t" + str(prevTCConversionPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCConversions), file=fileBedGraphTCConversions)
                    prevTCConversions = tcCount[pos]
                    prevTCConversionPos = pos

                if prevAGConversions != agCount[pos]:
                    print(chromosome + "\t" + str(prevAGConversionPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevAGConversions), file=fileBedGraphAGConversions)
                    prevAGConversions = agCount[pos]
                    prevAGConversionPos = pos

                TCconversionRate = 0
                if coveragePlus[pos] > 0 and coveragePlus[pos] >= coverageCutoff:
                    TCconversionRate = float(tcCount[pos]) / float(coveragePlus[pos])

                AGconversionRate = 0
                if coverageMinus[pos] > 0 and coverageMinus[pos] >= coverageCutoff:
                    AGconversionRate = float(agCount[pos]) / float(coverageMinus[pos])

                if prevTCConversionRate != TCconversionRate:
                    print(chromosome + "\t" + str(prevTCConversionRatePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCConversionRate), file=fileBedGraphRatesPlus)
                    prevTCConversionRate = TCconversionRate
                    prevTCConversionRatePos = pos

                if prevAGConversionRate != AGconversionRate:
                    print(chromosome + "\t" + str(prevAGConversionRatePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevAGConversionRate), file=fileBedGraphRatesMinus)
                    prevAGConversionRate = AGconversionRate
                    prevAGConversionRatePos = pos
    finally:
        # Close everything that was opened, also on failure.
        for handle in openedTracks:
            handle.close()
        ref.close()
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Score mapped reads against the expectations encoded in their names.

    A read name is split on "_"; the first field is the expected interval
    label and the third field, converted to int, the expected T>C count.
    The label actually found is the ``data`` of the first interval from the
    tree built out of ``bed`` that overlaps the read, or None when nothing
    overlaps. Reads are classified as fully correct, right position with a
    wrong T>C count, or wrong position.

    ``outputFile`` receives a header plus one tab-separated line per read.
    The three percentages and the total read count go to stdout.

    NOTE(review): an earlier definition with the same name exists in this
    module; being the later one, this definition is the one in effect.

    Parameters:
        bam: path of the BAM file to evaluate.
        referenceFile: reference passed to ``SlamSeqBamFile``.
        bed: input handed to ``bedToIntervallTree``.
        outputFile: path of the per-read table to write.
        mainOutput: unused; kept for interface compatibility.

    Raises:
        IndexError, ValueError: if a read name has fewer than three
            "_"-separated fields or the third one is not an integer.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    columns = ("read.name", "read.chromosome", "read.startRefPos", "sim.utr", "read.utr", "sim.tcCount", "read.tcCount")

    # Closing through the context manager also covers the error path,
    # e.g. a read name that does not follow the expected layout.
    with open(outputFile, "w") as outFile:
        print(*columns, sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound != utrSim:
                    wrongPos += 1
                elif tcCountSim == read.tcCount:
                    correct += 1
                else:
                    correcPosWrongTC += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim, utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    if total > 0:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total, wrongPos * 100.0 / total, total)
    else:
        # Empty input: avoid the division by zero and report zeros.
        print(0.0, 0.0, 0.0, total)