def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Write every read of ``bam`` to ``outputCSV`` through ``SlamSeqWriter``.

    The step is skipped, with a note printed to ``log``, when
    ``checkStep([bam, referenceFile], [outputCSV], force)`` returns a false
    value.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bam: Path of the BAM file whose reads are dumped.
        minQual: Not used by this function; kept for interface compatibility.
        outputCSV: Path handed to ``SlamSeqWriter``.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        log: File-like object that receives the "skipped" message.
        printOnly: Not used by this function.
        verbose: Not used by this function.
        force: Forwarded to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Release the writer even when reading the BAM fails part-way.
        outputFile.close()
def getConversionRateFromBam(bam, ref, chromosome, start, end, strand):
    """Print the read count and the mean per-read conversion rate of a region.

    For every read returned by ``readInRegion`` the rate is
    ``read.tcCount / read.tCount``; reads with ``tCount == 0`` contribute a
    rate of 0 but are still counted.  Both results go to standard output.

    Args:
        bam: BAM path handed to ``SlamSeqBamFile``.
        ref: Reference path handed to ``SlamSeqBamFile``.
        chromosome: Chromosome of the region.
        start: Region start, forwarded unchanged to ``readInRegion``.
        end: Region end, forwarded unchanged to ``readInRegion``.
        strand: Region strand, forwarded unchanged to ``readInRegion``.
    """
    # No SNP file is used here: the dictionary is built from None.
    testFile = SlamSeqBamFile(bam, ref, SNPtools.SNPDictionary(None))

    sumConversionRate = 0
    readCount = 0

    # 100 sits in the argument position where the other callers in this file
    # pass maxReadLength.
    for read in testFile.readInRegion(chromosome, start, end, strand, 100):
        if read.tCount > 0:
            sumConversionRate += read.tcCount * 1.0 / read.tCount
        readCount += 1

    print("Read count: " + str(readCount))

    # An empty region has no defined average; report nan instead of raising
    # ZeroDivisionError.
    if readCount > 0:
        avgConversionRate = sumConversionRate / readCount
    else:
        avgConversionRate = float("nan")
    print("Avg. conversion rate: " + str(avgConversionRate))
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual, outputBAMPrefix, conversionThreshold, log):
    """Split ``bam`` into a T->C read file and a background read file.

    A first pass over all chromosomes collects the names of reads whose
    ``isTcRead`` flag is set.  A second pass over the raw alignments writes
    every record whose query name was collected to
    ``<outputBAMPrefix>_TCReads.bam`` and all others to
    ``<outputBAMPrefix>_backgroundReads.bam``.  Both outputs are then indexed
    with ``pysamIndex``.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        bam: Input BAM path.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputBAMPrefix: Prefix of the two output BAM files.
        conversionThreshold: Forwarded to ``readsInChromosome``.
        log: Not used by this function; kept for interface compatibility.
    """
    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    # First pass: go through one chr after the other and remember the names
    # of all T->C reads.  Only membership is needed, hence a set.
    testFile = SlamSeqBamFile(bam, referenceFile, snps)
    tcReadNames = set()
    for chromosome in testFile.getChromosomes():
        for read in testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold):
            if read.isTcRead:
                tcReadNames.add(read.name)

    # Second pass: route each alignment record by query name.
    samFile = pysam.AlignmentFile(bam, "rb")
    outputFiles = []
    try:
        backgroundReadFile = pysam.AlignmentFile(backgroundReadFileName, "wb", template=samFile)
        outputFiles.append(backgroundReadFile)
        tcReadFile = pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)
        outputFiles.append(tcReadFile)

        for read in samFile.fetch():
            if read.query_name in tcReadNames:
                tcReadFile.write(read)
            else:
                backgroundReadFile.write(read)
    finally:
        # Close everything that was opened, also when writing fails.
        for outputFile in outputFiles:
            outputFile.close()
        samFile.close()

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)
class _BedGraphRunWriter(object):
    """Write runs of identical per-position values as bedGraph lines.

    A line is emitted only when the value changes and describes the run that
    just ended.  The run that is still open when a chromosome ends is never
    written.
    """

    def __init__(self, handle):
        self._handle = handle
        self._value = 0
        self._start = 0

    def reset(self):
        """Begin a new chromosome with an open run of value 0 at position 0."""
        self._value = 0
        self._start = 0

    def update(self, chromosome, pos, value):
        """Register ``value`` at ``pos``; emit the previous run if it ended."""
        if self._value != value:
            print(chromosome + "\t" + str(self._start + 1) + "\t" + str(pos + 1) + "\t" + str(self._value), file=self._handle)
            self._value = value
            self._start = pos


def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual, outputBedGraphPrefix, conversionThreshold, coverageCutoff, log):
    """Write eight genome-wide bedGraph tracks of coverage and conversions.

    For every chromosome the reads are tallied per position into
    plus/minus-strand coverage and T->C / A->G conversion counts.  Reads whose
    ``isTcRead`` flag is not set contribute coverage only.  From these tallies
    the following files are written, each named
    ``outputBedGraphPrefix + suffix``:

    * ``_TC_rates_genomewide`` / ``_AG_rates_genomewide``: conversions
      divided by strand coverage, or 0 where coverage is 0 or below
      ``coverageCutoff``.
    * ``_coverage_plus_genomewide`` / ``_coverage_minus_genomewide``.
    * ``_TC_conversions_genomewide`` / ``_AG_conversions_genomewide``.
    * ``_coverage_T_genomewide`` / ``_coverage_A_genomewide``: strand
      coverage restricted to positions whose fetched reference base is T
      (plus) or A (minus).

    The track label derived from the prefix is also printed to standard
    output.

    Args:
        referenceFile: Reference FASTA path.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        bam: BAM path handed to ``SlamSeqBamFile``.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputBedGraphPrefix: Prefix of all output files.
        conversionThreshold: Forwarded to ``readsInChromosome``.
        coverageCutoff: Minimum strand coverage for a non-zero rate.
        log: Not used by this function; kept for interface compatibility.
    """
    # (file suffix, track name suffix, track description), in writer order.
    trackSpecs = [
        ("_TC_rates_genomewide.bedGraph", " tc-conversions", "# T->C conversions / # reads on T per position genome-wide"),
        ("_AG_rates_genomewide.bedGraph", " ag-conversions", "# A->G conversions / # reads on A per position genome-wide"),
        ("_coverage_plus_genomewide.bedGraph", " plus-strand coverage", "# Reads on plus strand genome-wide"),
        ("_coverage_minus_genomewide.bedGraph", " minus-strand coverage", "# Reads on minus strand genome-wide"),
        ("_TC_conversions_genomewide.bedGraph", " T->C conversions", "# T->C conversions on plus strand genome-wide"),
        ("_AG_conversions_genomewide.bedGraph", " A->G conversions", "# A->G conversions on minus strand genome-wide"),
        ("_coverage_T_genomewide.bedGraph", " T-coverage", "# Plus-strand reads on Ts genome-wide"),
        ("_coverage_A_genomewide.bedGraph", " A-coverage", "# Minus-strand reads on As genome-wide"),
    ]

    ref = pysam.FastaFile(referenceFile)
    handles = []
    try:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        chromosomes = testFile.getChromosomes()

        bedGraphInfo = re.sub("_slamdunk_mapped.*", "", basename(outputBedGraphPrefix))
        print(bedGraphInfo)

        writers = []
        for suffix, trackName, description in trackSpecs:
            handle = open(outputBedGraphPrefix + suffix, 'w')
            handles.append(handle)
            print("track type=bedGraph name=\"" + bedGraphInfo + trackName + "\" description=\"" + description + "\"", file=handle)
            writers.append(_BedGraphRunWriter(handle))

        (ratesPlus, ratesMinus, coveragePlusTrack, coverageMinusTrack,
         tcConversionsTrack, agConversionsTrack, tCoverageTrack, aCoverageTrack) = writers

        # Go through one chr after the other
        for chromosome in chromosomes:
            chrLength = testFile.getChromosomeLength(chromosome)

            tcCount = [0] * chrLength
            agCount = [0] * chrLength
            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            for read in testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold):
                isReverse = read.direction == ReadDirection.Reverse

                # Reads not flagged as T->C reads keep their coverage but
                # lose all conversions.
                if not read.isTcRead:
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse) and 0 <= mismatch.referencePosition < chrLength:
                        if isReverse:
                            agCount[mismatch.referencePosition] += 1
                        else:
                            tcCount[mismatch.referencePosition] += 1

                # Clamp the read span to the chromosome instead of testing
                # every position.
                strandCoverage = coverageMinus if isReverse else coveragePlus
                for i in range(max(0, read.startRefPos), min(chrLength, read.endRefPos)):
                    strandCoverage[i] += 1

            for writer in writers:
                writer.reset()

            for pos, plusDepth in enumerate(coveragePlus):
                minusDepth = coverageMinus[pos]

                coveragePlusTrack.update(chromosome, pos, plusDepth)
                coverageMinusTrack.update(chromosome, pos, minusDepth)

                tDepth = 0
                aDepth = 0
                if plusDepth > 0 or minusDepth > 0:
                    # NOTE(review): pysam fetch coordinates are 0-based and
                    # half-open, so this reads the base one past ``pos`` if
                    # ``pos`` is itself 0-based.  Confirm the convention of
                    # readsInChromosome before changing it.
                    base = ref.fetch(reference=chromosome, start=pos + 1, end=pos + 2).upper()
                    if plusDepth > 0 and base == "T":
                        tDepth = plusDepth
                    if minusDepth > 0 and base == "A":
                        aDepth = minusDepth

                tCoverageTrack.update(chromosome, pos, tDepth)
                aCoverageTrack.update(chromosome, pos, aDepth)

                tcConversionsTrack.update(chromosome, pos, tcCount[pos])
                agConversionsTrack.update(chromosome, pos, agCount[pos])

                tcConversionRate = 0
                if plusDepth > 0 and plusDepth >= coverageCutoff:
                    tcConversionRate = float(tcCount[pos]) / float(plusDepth)

                agConversionRate = 0
                if minusDepth > 0 and minusDepth >= coverageCutoff:
                    agConversionRate = float(agCount[pos]) / float(minusDepth)

                ratesPlus.update(chromosome, pos, tcConversionRate)
                ratesMinus.update(chromosome, pos, agConversionRate)

            # NOTE(review): the run still open at the chromosome end is not
            # written, so a non-zero value reaching the last position is
            # dropped.  Kept as found; confirm whether that is intended.
    finally:
        # Close every handle that was opened, also when processing fails.
        for handle in handles:
            handle.close()
        ref.close()
def _writeCountHeader(handle, sampleInfo, bed, bedMD5):
    """Write the version, annotation and column header lines of a count table."""
    print("#slamdunk v" + __version__, __count_version__, "sample info:", sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time, sep="\t", file=handle)
    print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=handle)
    print(SlamSeqInterval.Header, file=handle)


def _countUtr(utr, referenceFile, referenceNames, testFile, readNumber, maxReadLength, minQual, conversionThreshold, conversionBedGraph):
    """Count reads and conversions for one BED interval.

    Returns the pair ``(count row, per-read row)`` of ``SlamSeqInterval``
    objects and records the per-position conversion rate of covered T/A
    positions in ``conversionBedGraph`` under
    ``(chromosome, position, strand)`` keys.
    """
    slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop, utr.strand, utr.name, 0, 0, 0, 0, 0, 0, 0, 0)
    slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start, utr.stop, utr.strand, utr.name, 0, 0, 0, 0, 0, 0, 0, 0)

    if not utr.hasStrand():
        raise RuntimeError("Input BED file does not contain stranded intervals.")

    if utr.start < 0:
        # str() is required: concatenating the entry object itself raises
        # TypeError and hides this message.
        raise RuntimeError("Negativ start coordinate found. Please check the following entry in your BED file: " + str(utr))

    # Retrieve the reference sequence and count the Ts (As on the minus
    # strand) in the interval.
    Tcontent = 0
    if utr.chromosome in referenceNames:
        refSeq = referenceFile.fetch(reference=utr.chromosome, start=utr.start, end=utr.stop).upper()
        if utr.strand == "-":
            Tcontent = refSeq.count("A")
        else:
            Tcontent = refSeq.count("T")
        # Both default rows carry the T content, so intervals without reads
        # report it consistently in the count and the per-read table.
        slamSeqUtr._Tcontent = Tcontent
        slamSeqUtrMLE._Tcontent = Tcontent

    utrLength = utr.getLength()
    readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minQual, conversionThreshold)

    tcCountUtr = [0] * utrLength
    coverageUtr = [0] * utrLength

    tInReads = []
    tcInRead = []

    countFwd = 0
    tcCountFwd = 0
    multiMapFwd = 0
    countRev = 0
    tcCountRev = 0
    multiMapRev = 0

    for read in readIterator:
        isReverse = read.direction == ReadDirection.Reverse

        # Overwrite any conversions for non-TC reads
        if not read.isTcRead:
            read.tcCount = 0
            read.mismatches = []
            read.conversionRates = 0.0
            read.tcRate = 0.0

        if isReverse:
            countRev += 1
            if read.tcCount > 0:
                tcCountRev += 1
            if read.isMultimapper:
                multiMapRev += 1
        else:
            countFwd += 1
            if read.tcCount > 0:
                tcCountFwd += 1
            if read.isMultimapper:
                multiMapFwd += 1

        # Per-read n (Ts) and k (T->C conversions) for the per-read table;
        # conversions are tallied per interval position in the same pass.
        testN = read.getTcount()
        testk = 0
        for mismatch in read.mismatches:
            if 0 <= mismatch.referencePosition < utrLength:
                if mismatch.isT(isReverse):
                    testN += 1
                if mismatch.isTCMismatch(isReverse):
                    testk += 1
                    tcCountUtr[mismatch.referencePosition] += 1
        tInReads.append(testN)
        tcInRead.append(testk)

        # Clamp the read span to the interval.
        for i in range(max(0, read.startRefPos), min(utrLength, read.endRefPos)):
            coverageUtr[i] += 1

    hasReadsOnStrand = (utr.strand == "+" and countFwd > 0) or (utr.strand == "-" and countRev > 0)
    if not hasReadsOnStrand:
        return slamSeqUtr, slamSeqUtrMLE

    # Conversion rate per position in percent
    tcRateUtr = [x * 100.0 / y if y > 0 else 0 for x, y in zip(tcCountUtr, coverageUtr)]

    if utr.strand == "-":
        readCount = countRev
        tcReadCount = tcCountRev
        multiMapCount = multiMapRev
    else:
        readCount = countFwd
        tcReadCount = tcCountFwd
        multiMapCount = multiMapFwd

    if (utr.strand == "-" and countFwd > countRev) or (utr.strand == "+" and countRev > countFwd):
        # The counts are ints and must be converted before concatenation.
        print("Warning: " + utr.name + " is located on the " + utr.strand + " strand but read counts are higher for the opposite strand (fwd: " + str(countFwd) + ", rev: " + str(countRev) + ")", file=sys.stderr)

    refSeq = readIterator.getRefSeq()

    # Get number of reads on T positions and number of reads with T->C
    # conversions on T positions (A on the minus strand).
    targetBase = "T" if utr.strand == "+" else "A"
    coverageOnTs = 0
    conversionsOnTs = 0
    for position, depth in enumerate(coverageUtr):
        if depth > 0 and refSeq[position] == targetBase:
            coverageOnTs += depth
            conversionsOnTs += tcCountUtr[position]
            conversionBedGraph[(utr.chromosome, utr.start + position, utr.strand)] = tcRateUtr[position]

    # reads per million mapped to the UTR
    readsCPM = 0
    if readNumber > 0:
        readsCPM = readCount * 1000000.0 / readNumber

    conversionRate = 0
    if coverageOnTs > 0:
        conversionRate = float(conversionsOnTs) / float(coverageOnTs)

    slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop, utr.strand, utr.name, Tcontent, readsCPM, coverageOnTs, conversionsOnTs, conversionRate, readCount, tcReadCount, multiMapCount)
    slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start, utr.stop, utr.strand, utr.name, Tcontent, readsCPM, coverageOnTs, conversionsOnTs, conversionRate, ",".join(str(x) for x in tInReads), ",".join(str(x) for x in tcInRead), multiMapCount)
    return slamSeqUtr, slamSeqUtrMLE


def computeTconversions(ref, bed, snpsFile, bam, maxReadLength, minQual, outputCSV, outputBedgraphPlus, outputBedgraphMinus, conversionThreshold, log, mle=False):
    """Count reads and T->C conversions for every interval of a BED file.

    One ``SlamSeqInterval`` row per interval is written to ``outputCSV``.
    The per-position conversion rates (in percent) of covered T positions
    (A on the minus strand) are written to ``outputBedgraphPlus`` and
    ``outputBedgraphMinus``.  With ``mle`` set, a per-read table is written
    next to ``outputCSV`` and handed to the ``compute_conversion_rate_mle``
    plotter script.

    The BAM file and the annotation are validated before any output file is
    created, so a failed validation leaves no partial count table behind.

    Args:
        ref: Reference FASTA path.
        bed: BED file with stranded intervals.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        bam: Filtered BAM path.
        maxReadLength: Forwarded to ``readInRegion``.
        minQual: Forwarded to ``readInRegion``.
        outputCSV: Path of the count table.
        outputBedgraphPlus: Path of the plus-strand rate track.
        outputBedgraphMinus: Path of the track for all other strands.
        conversionThreshold: Forwarded to ``readInRegion``.
        log: File-like object receiving the annotation checksum warning.
        mle: Also write the per-read table and run the MLE script.

    Raises:
        RuntimeError: If the BAM version differs from ``__bam_version__``,
            an interval has no strand, or an interval start is negative.
    """
    referenceFile = pysam.FastaFile(ref)
    fileCSV = None
    fileTest = None
    fileNameTest = None
    conversionBedGraph = {}

    try:
        sampleInfo = getSampleInfo(bam)
        slamseqInfo = SlamSeqInfo(bam)
        readNumber = slamseqInfo.FilteredReads
        bedMD5 = md5(bed)

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        testFile = SlamSeqBamFile(bam, ref, snps)
        if not testFile.bamVersion == __bam_version__:
            raise RuntimeError("Wrong filtered BAM file version detected (" + testFile.bamVersion + "). Expected version " + __bam_version__ + ". Please rerun slamdunk filter.")

        if slamseqInfo.AnnotationMD5 != bedMD5:
            print("Warning: MD5 checksum of annotation (" + bedMD5 + ") does not matched MD5 in filtered BAM files (" + slamseqInfo.AnnotationMD5 + "). Most probably the annotation filed changed after the filtered BAM files were created.", file=log)

        fileCSV = open(outputCSV, 'w')
        _writeCountHeader(fileCSV, sampleInfo, bed, bedMD5)

        if mle:
            fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
            fileTest = open(fileNameTest, 'w')
            _writeCountHeader(fileTest, sampleInfo, bed, bedMD5)

        # Built once: membership is tested for every interval.
        referenceNames = set(referenceFile.references)

        for utr in BedIterator(bed):
            slamSeqUtr, slamSeqUtrMLE = _countUtr(utr, referenceFile, referenceNames, testFile, readNumber, maxReadLength, minQual, conversionThreshold, conversionBedGraph)
            print(slamSeqUtr, file=fileCSV)
            if mle:
                print(slamSeqUtrMLE, file=fileTest)
    finally:
        # Close whatever was opened, also when an interval fails validation.
        for handle in (fileCSV, fileTest, referenceFile):
            if handle is not None:
                handle.close()

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, open(outputBedgraphMinus, 'w') as fileBedgraphMinus:
        for (chromosome, position, strand), rate in conversionBedGraph.items():
            target = fileBedgraphPlus if strand == "+" else fileBedgraphMinus
            print(chromosome, position, position + 1, rate, file=target)

    if mle:
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        callR(getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest + " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")
def computeSNPMaskedRates(ref, bed, snpsFile, bam, maxReadLength, minQual, coverageCutoff, variantFraction, outputCSV, outputPDF, strictTCs, log, printOnly=False, verbose=True, force=False):
    """Compare T->C read counts per interval with and without SNP masking.

    For every BED interval one tab-separated line is written to
    ``outputCSV``: interval name, number of reads, number of reads with a
    raw T->C (A->G on reverse reads) base mismatch inside the interval,
    number of reads with a mismatch accepted by ``isTCMismatch`` inside the
    interval, and a 0/1 flag telling whether the two numbers differ.
    Afterwards the ``SNPeval`` plotter script is run on that table.

    Each of the two steps is skipped, with a note printed to ``log``, when
    ``checkStep`` returns a false value for it.

    Args:
        ref: Reference path handed to ``SlamSeqBamFile``.
        bed: BED file with stranded intervals.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        bam: BAM path.
        maxReadLength: Forwarded to ``readInRegion``.
        minQual: Forwarded to ``readInRegion``.
        coverageCutoff: Passed to the plotter as ``-c``.
        variantFraction: Passed to the plotter as ``-v``.
        outputCSV: Path of the table.
        outputPDF: Path of the plot.
        strictTCs: If true, reads not flagged ``isTcRead`` lose all their
            mismatches before counting.
        log: File-like object for messages; also forwarded to ``callR``.
        printOnly: Forwarded to ``callR`` as ``dry``.
        verbose: Forwarded to ``callR``.
        force: Forwarded to ``checkStep``.

    Raises:
        RuntimeError: If an interval has no strand or a negative start.
    """
    if not checkStep([bam, ref], [outputCSV], force):
        print("Skipped computing T->C per UTR with SNP masking for file " + bam, file=log)
    else:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        testFile = SlamSeqBamFile(bam, ref, snps)

        # The with-block closes the table even when an interval is rejected.
        with open(outputCSV, 'w') as fileCSV:
            for utr in BedIterator(bed):
                if not utr.hasStrand():
                    raise RuntimeError("Input BED file does not contain stranded intervals.")

                if utr.start < 0:
                    # str() is required: concatenating the entry object
                    # itself raises TypeError and hides this message.
                    raise RuntimeError("Negativ start coordinate found. Please check the following entry in your BED file: " + str(utr))

                utrLength = utr.getLength()
                readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minQual)

                unmaskedTCCount = 0
                maskedTCCount = 0
                readCount = 0

                for read in readIterator:
                    isReverse = read.direction == ReadDirection.Reverse

                    # Overwrite any conversions for non-TC reads
                    if not read.isTcRead and strictTCs:
                        read.tcCount = 0
                        read.mismatches = []
                        read.conversionRates = 0.0
                        read.tcRate = 0.0

                    isTC = False
                    isTrueTC = False

                    for mismatch in read.mismatches:
                        if not 0 <= mismatch.referencePosition < utrLength:
                            continue

                        # Presumably isTCMismatch honours the SNP mask while
                        # the raw base comparison below does not; confirm
                        # against the mismatch class.
                        if mismatch.isTCMismatch(isReverse):
                            isTrueTC = True

                        if isReverse:
                            unmasked = mismatch.referenceBase == "A" and mismatch.readBase == "G"
                        else:
                            unmasked = mismatch.referenceBase == "T" and mismatch.readBase == "C"
                        if unmasked:
                            isTC = True

                    readCount += 1
                    if isTC:
                        unmaskedTCCount += 1
                    if isTrueTC:
                        maskedTCCount += 1

                containsSNP = 1 if unmaskedTCCount != maskedTCCount else 0

                print(utr.name + "\t" + str(readCount) + "\t" + str(unmaskedTCCount) + "\t" + str(maskedTCCount) + "\t" + str(containsSNP), file=fileCSV)

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
    else:
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " + str(coverageCutoff) + " -v " + str(variantFraction) + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
def tcPerUtr(referenceFile, utrBed, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Tally mismatches and coverage over a window of ``utrNormFactor`` positions per UTR.

    For "+" intervals the window covers the last ``utrNormFactor`` positions
    of the interval, for all other intervals the first ones.  Per window
    position the table holds six tab-separated columns: non-T->C mismatches
    (forward reads, reverse reads), T->C mismatches (forward, reverse) and
    read coverage (forward, reverse).  All reverse-read columns are written
    in reversed order.  The table is then plotted with the
    ``conversion_per_read_position`` script (``-u`` mode).

    Each of the two steps is skipped, with a note printed to ``log``, when
    ``checkStep`` returns a false value for it.
    """
    if checkStep([bam, referenceFile], [outputCSV], force):
        window = utrNormFactor
        handledUtrs = 0

        coverageFwd = [0] * window
        coverageRev = [0] * window
        tcTotalFwd = [0] * window
        tcTotalRev = [0] * window
        otherTotalFwd = [0] * window
        otherTotalRev = [0] * window

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one utr after the other
        bamReader = SlamSeqBamFile(bam, referenceFile, snps)

        for utr in BedIterator(utrBed):
            reads = bamReader.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minQual)

            utrTcFwd = [0] * window
            utrOtherFwd = [0] * window
            utrTcRev = [0] * window
            utrOtherRev = [0] * window

            for read in reads:
                readTc = [0] * window
                readOther = [0] * window

                for mismatch in read.mismatches:
                    slot = mismatch.referencePosition

                    if utr.strand == "+":
                        # Only the last `window` positions of the interval
                        # count; they are shifted so the interval end lands
                        # on the window end.
                        if slot < (utr.getLength() - window) or slot >= utr.getLength():
                            continue
                        slot = window - (utr.getLength() - slot)
                    elif slot < 0 or slot >= min(utr.getLength(), window):
                        # All other strands: only the first `window` positions.
                        continue

                    if mismatch.isTCMismatch(read.direction == ReadDirection.Reverse):
                        readTc[slot] += 1
                    else:
                        readOther[slot] += 1

                if read.direction == ReadDirection.Reverse:
                    utrTcRev = sumLists(utrTcRev, readTc)
                    utrOtherRev = sumLists(utrOtherRev, readOther)

                    upper = min(utr.getLength(), window)
                    first = max(0, min(upper, read.startRefPos))
                    last = max(0, min(upper, read.endRefPos))
                    for offset in range(first, last):
                        coverageRev[offset] += 1
                else:
                    utrTcFwd = sumLists(utrTcFwd, readTc)
                    utrOtherFwd = sumLists(utrOtherFwd, readOther)

                    utrLength = utr.getLength()
                    lower = utrLength - window
                    first = min(utrLength, max(lower, read.startRefPos))
                    last = min(utrLength, max(lower, read.endRefPos))
                    for offset in range(first, last):
                        coverageFwd[window - (utrLength - offset)] += 1

            tcTotalFwd = sumLists(tcTotalFwd, utrTcFwd)
            otherTotalFwd = sumLists(otherTotalFwd, utrOtherFwd)
            tcTotalRev = sumLists(tcTotalRev, utrTcRev)
            otherTotalRev = sumLists(otherTotalRev, utrOtherRev)

            handledUtrs += 1
            if verbose and handledUtrs % 10000 == 0:
                print("Handled " + str(handledUtrs) + " UTRs.", file=log)

        table = open(outputCSV, "w")
        print("# slamdunk tcperutr v" + __version__, file=table)

        # Reverse-read columns are flipped before writing.
        flippedOtherRev = otherTotalRev[::-1]
        flippedTcRev = tcTotalRev[::-1]
        flippedCoverageRev = coverageRev[::-1]

        for idx in range(window):
            print(otherTotalFwd[idx], flippedOtherRev[idx], tcTotalFwd[idx], flippedTcRev[idx], coverageFwd[idx], flippedCoverageRev[idx], sep='\t', file=table)
        table.close()
    else:
        print("Skipped computing T->C per UTR position for file " + bam, file=log)

    if checkStep([outputCSV], [outputPDF], force):
        callR(getPlotter("conversion_per_read_position") + " -u -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
    else:
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Tally mismatches and read counts per read position over a whole BAM file.

    For each of the ``maxReadLength`` read positions the table holds six
    tab-separated columns: non-T->C mismatches (forward reads, reverse
    reads), T->C mismatches (forward, reverse) and the number of reads
    covering that position (forward, reverse).  The table is then plotted
    with the ``conversion_per_read_position`` script.

    Each of the two steps is skipped, with a note printed to ``log``, when
    ``checkStep`` returns a false value for it.
    """
    if checkStep([bam, referenceFile], [outputCSV], force):
        readsFwd = [0] * maxReadLength
        readsRev = [0] * maxReadLength
        tcFwd = [0] * maxReadLength
        tcRev = [0] * maxReadLength
        otherFwd = [0] * maxReadLength
        otherRev = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        bamReader = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in bamReader.getChromosomes():
            for read in bamReader.readsInChromosome(chromosome, minQual):
                readTc = [0] * maxReadLength
                readOther = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(read.direction == ReadDirection.Reverse):
                        bucket = readTc
                    else:
                        bucket = readOther
                    bucket[mismatch.readPosition] += 1

                readLength = len(read.sequence)

                if read.direction == ReadDirection.Reverse:
                    tcRev = sumLists(tcRev, readTc)
                    otherRev = sumLists(otherRev, readOther)
                    covered = readsRev
                else:
                    tcFwd = sumLists(tcFwd, readTc)
                    otherFwd = sumLists(otherFwd, readOther)
                    covered = readsFwd

                # Every position up to the read length is covered once more.
                for offset in range(readLength):
                    covered[offset] += 1

        table = open(outputCSV, "w")
        print("# slamdunk tcperreadpos v" + __version__, file=table)
        for idx in range(maxReadLength):
            print(otherFwd[idx], otherRev[idx], tcFwd[idx], tcRev[idx], readsFwd[idx], readsRev[idx], sep='\t', file=table)
        table.close()
    else:
        print("Skipped computing T->C per reads position for file " + bam, file=log)

    if checkStep([outputCSV], [outputPDF], force):
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
    else:
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
def computeTconversionsAll(
        ref,
        snpsFile,
        bam,
        outputBedgraphPlus,
        outputBedgraphPlusNew,
        outputBedgraphMinus,
        outputBedgraphMinusNew,
        conversionThreshold,
        minQual,
        is_inverse,
        log
):
    """Write per-base coverage tracks of read pairs, for all pairs and for T->C pairs.

    The BAM file must be sorted by query name; if its header does not say
    so, ``samtools sort -n`` is run into ``<bam>_sorted`` unless that file
    already exists.  Mates are paired from consecutive records: a first
    read followed by a second read with the same query name.  Unpaired
    reads, reads with an unmapped mate and duplicates are ignored.

    Each pair adds 1 to every position between the smaller start and the
    larger end of its two reads.  The strand is taken from the first read,
    or from the second read if ``is_inverse`` is true.  Pairs where either
    read is flagged ``isTcRead`` are additionally counted in the "new"
    tracks.  Before writing, all tracks are scaled to counts per million
    pairs.  Summary counts are printed to ``log``.

    Only the chromosomes of the built-in size table are supported; a pair on
    any other chromosome raises ``KeyError``.

    Args:
        ref: Reference path handed to ``SlamSeqIter``.
        snpsFile: Path handed to ``SNPtools.SNPDictionary``.
        bam: Input BAM path.
        outputBedgraphPlus: Output path, forward pairs.
        outputBedgraphPlusNew: Output path, forward T->C pairs.
        outputBedgraphMinus: Output path, reverse pairs.
        outputBedgraphMinusNew: Output path, reverse T->C pairs.
        conversionThreshold: Forwarded to ``SlamSeqIter``.
        minQual: Forwarded to ``SlamSeqIter``.
        is_inverse: Take the strand from the second read instead of the first.
        log: File-like object for the summary; also forwarded to ``run``.
    """

    def to_bed_graph(c, data, bedgraph, rn):
        # Scale to counts per million pairs.  Without any counted pair the
        # track is all zeros and dividing would only turn it into NaN.
        if rn > 0:
            data /= rn
            data *= 1000000.0
        for i, d in enumerate(data):
            print(c, i, i + 1, d, file=bedgraph)

    # Hard-coded chromosome sizes.  Presumably S. cerevisiae, judging by the
    # names chrI-chrXVI and chrM; confirm the assembly.
    chrom_sizes = [
        ('chrI', 230218),
        ('chrII', 813184),
        ('chrIII', 316620),
        ('chrIV', 1531933),
        ('chrIX', 439888),
        ('chrM', 85779),
        ('chrV', 576874),
        ('chrVI', 270161),
        ('chrVII', 1090940),
        ('chrVIII', 562643),
        ('chrX', 745751),
        ('chrXI', 666816),
        ('chrXII', 1078177),
        ('chrXIII', 924431),
        ('chrXIV', 784333),
        ('chrXV', 1091291),
        ('chrXVI', 948066),
    ]

    def empty_tracks():
        return dict((name, np.zeros(size, dtype='float32')) for name, size in chrom_sizes)

    chroms_fw = empty_tracks()
    chroms_bw = empty_tracks()
    chroms_fw_new = empty_tracks()
    chroms_bw_new = empty_tracks()

    readNumber, positiveCount, negativeCount, positiveCountNew, negativeCountNew = 0, 0, 0, 0, 0

    # Read the sort order, then release this handle before reopening.
    bamFile = pysam.AlignmentFile(bam, "rb")
    try:
        sortOrder = bamFile.header['HD']['SO']
    except KeyError:
        # No @HD line or no SO tag: treat as not sorted by query name.
        sortOrder = None
    finally:
        bamFile.close()

    if sortOrder != 'queryname':
        # Sort bam file
        sbam = replaceExtension(bam, '.bam', '_sorted')
        if not os.path.exists(sbam):
            run(
                'samtools sort -n %s -o %s' % (bam, sbam),
                log
            )
    else:
        sbam = bam

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    bamFile = pysam.AlignmentFile(sbam, "rb")
    try:
        seqIter = SlamSeqIter(bamFile, ref, snps, conversionThreshold, minQual)
        read1 = None
        read2 = None
        for read in seqIter:
            if not read.isPaired or read.unmappedMate or read.duplicate:
                continue

            if read.isSecondRead:
                read2 = read
            else:
                # A first read opens a new pair; wait for its mate.
                read1 = read
                read2 = None
                continue

            if read1 is None or read2 is None or read1.queryName != read2.queryName:
                continue

            readNumber += 1
            chrom = read1.chromosome
            # Span of the whole pair.  The start is clamped at 0 because a
            # negative start would make the slice wrap around.
            start = max(0, min(read1.startRefPos, read2.startRefPos))
            end = max(read1.endRefPos, read2.endRefPos)
            is_tc_read = read1.isTcRead or read2.isTcRead

            direction_read = read2 if is_inverse else read1
            if direction_read.direction == ReadDirection.Forward:
                positiveCount += 1
                chroms_fw[chrom][start:end] += 1
                if is_tc_read:
                    positiveCountNew += 1
                    chroms_fw_new[chrom][start:end] += 1
            else:
                negativeCount += 1
                chroms_bw[chrom][start:end] += 1
                if is_tc_read:
                    negativeCountNew += 1
                    chroms_bw_new[chrom][start:end] += 1
    finally:
        bamFile.close()

    print("Total reads: %s\n"
          "Positive reads: %s\n"
          "Positive reads new: %s\n"
          "Negative reads: %s\n"
          "Negative reads new: %s" %
          (readNumber, positiveCount, positiveCountNew, negativeCount, negativeCountNew),
          file=log)

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphPlusNew, 'w') as fileBedgraphPlusNew, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus, \
            open(outputBedgraphMinusNew, 'w') as fileBedgraphMinusNew:
        for chrom, _size in chrom_sizes:
            to_bed_graph(chrom, chroms_fw[chrom], fileBedgraphPlus, readNumber)
            to_bed_graph(chrom, chroms_bw[chrom], fileBedgraphMinus, readNumber)
            to_bed_graph(chrom, chroms_fw_new[chrom], fileBedgraphPlusNew, readNumber)
            to_bed_graph(chrom, chroms_bw_new[chrom], fileBedgraphMinusNew, readNumber)