Пример #1
0
def dumpReadInfo(referenceFile,
                 bam,
                 minQual,
                 outputCSV,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):
    """Dump every read of ``bam`` to ``outputCSV`` through a SlamSeqWriter.

    The step is skipped (and a note is written to ``log``) when
    ``checkStep([bam, referenceFile], [outputCSV], force)`` is falsy.

    Args:
        referenceFile: reference passed on to ``SlamSeqBamFile``.
        bam: BAM path passed on to ``SlamSeqBamFile``.
        minQual: accepted for interface compatibility; not used here.
        outputCSV: path handed to ``SlamSeqWriter``.
        snpsFile: path handed to ``SNPtools.SNPDictionary``.
        log: file-like object receiving the "skipped" message.
        printOnly: accepted for interface compatibility; not used here.
        verbose: accepted for interface compatibility; not used here.
        force: forwarded to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Close the writer even when opening the BAM or iterating reads fails.
        outputFile.close()
Пример #2
0
def getConversionRateFromBam(bam, ref, chromosome, start, end, strand):
    """Print the read count and mean per-read conversion rate of a region.

    For every read returned by ``readInRegion`` the rate is
    ``read.tcCount / read.tCount`` (0 when ``read.tCount`` is 0); the mean
    over all reads is printed to stdout.  When the region yields no reads
    the mean is reported as 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        bam: BAM path passed on to ``SlamSeqBamFile``.
        ref: reference passed on to ``SlamSeqBamFile``.
        chromosome, start, end, strand: region forwarded to ``readInRegion``.
    """
    testFile = SlamSeqBamFile(bam, ref, SNPtools.SNPDictionary(None))

    # NOTE(review): the literal 100 sits in the argument slot that other
    # callers in this file fill with maxReadLength -- confirm it is intended.
    readIterator = testFile.readInRegion(chromosome, start, end, strand, 100)

    sumConversionRate = 0
    readCount = 0

    for read in readIterator:
        # Reads with no T contribute a rate of 0 but still count.
        if read.tCount > 0:
            sumConversionRate += read.tcCount * 1.0 / read.tCount
        readCount += 1

    avgConversionRate = 0.0
    if readCount > 0:
        avgConversionRate = sumConversionRate / readCount

    print("Read count: " + str(readCount))
    print("Avg. conversion rate: " + str(avgConversionRate))
Пример #3
0
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual,
                             outputBAMPrefix, conversionThreshold, log):
    """Split ``bam`` into a T->C read BAM and a background read BAM.

    First pass: every chromosome is scanned through ``SlamSeqBamFile`` and
    the names of reads flagged ``isTcRead`` are collected.  Second pass: the
    reads returned by ``samFile.fetch()`` are written to
    ``<outputBAMPrefix>_TCReads.bam`` when their ``query_name`` was
    collected, otherwise to ``<outputBAMPrefix>_backgroundReads.bam``.
    Both outputs are indexed with ``pysamIndex`` afterwards.

    Args:
        referenceFile: reference FASTA path.
        snpsFile: path handed to ``SNPtools.SNPDictionary``.
        bam: input BAM path.
        minBaseQual: forwarded to ``readsInChromosome``.
        outputBAMPrefix: prefix of the two output BAM files.
        conversionThreshold: forwarded to ``readsInChromosome``.
        log: accepted for interface compatibility; not used here.
    """
    # The FASTA handle itself is never read in this function.  It is still
    # opened (so an unreadable reference fails at the same point as before)
    # but released right away instead of being leaked.
    ref = pysam.FastaFile(referenceFile)
    ref.close()

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    # Context managers make sure all three BAM handles are closed on error.
    with pysam.AlignmentFile(bam, "rb") as samFile:
        chromosomes = testFile.getChromosomes()

        with pysam.AlignmentFile(backgroundReadFileName, "wb",
                                 template=samFile) as backgroundReadFile, \
                pysam.AlignmentFile(tcReadFileName, "wb",
                                    template=samFile) as tcReadFile:

            # Only membership is needed, so a set replaces the former dict.
            tcReadNames = set()

            for chromosome in chromosomes:
                readIterator = testFile.readsInChromosome(
                    chromosome, minBaseQual, conversionThreshold)
                for read in readIterator:
                    if read.isTcRead:
                        tcReadNames.add(read.name)

            for read in samFile.fetch():
                if read.query_name in tcReadNames:
                    tcReadFile.write(read)
                else:
                    backgroundReadFile.write(read)

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)
Пример #4
0
def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual,
                              outputBedGraphPrefix, conversionThreshold,
                              coverageCutoff, log):
    """Write eight genome-wide bedGraph tracks for a BAM file.

    For every chromosome, per-position arrays of strand-specific coverage
    and T->C / A->G conversion counts are filled from the reads, then
    run-length encoded into these files (all prefixed by
    ``outputBedGraphPrefix``): ``_TC_rates``, ``_AG_rates``,
    ``_coverage_plus``, ``_coverage_minus``, ``_TC_conversions``,
    ``_AG_conversions``, ``_coverage_T`` and ``_coverage_A``
    (each followed by ``_genomewide.bedGraph``).

    Reads that are not ``isTcRead`` have their mismatches cleared before
    counting, so they contribute coverage only.  A rate is reported as 0
    for positions whose strand coverage is 0 or below ``coverageCutoff``.

    Args:
        referenceFile: reference FASTA path.
        snpsFile: path handed to ``SNPtools.SNPDictionary``.
        bam: input BAM path.
        minBaseQual: forwarded to ``readsInChromosome``.
        outputBedGraphPrefix: prefix of all output files; its basename
            (up to ``_slamdunk_mapped``) is used in the track names.
        conversionThreshold: forwarded to ``readsInChromosome``.
        coverageCutoff: minimum strand coverage for a non-zero rate.
        log: accepted for interface compatibility; not used here.
    """
    ref = pysam.FastaFile(referenceFile)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    chromosomes = testFile.getChromosomes()

    bedGraphInfo = re.sub("_slamdunk_mapped.*", "",
                          basename(outputBedGraphPrefix))
    print(bedGraphInfo)

    openedTracks = []

    def openTrack(suffix, label, description):
        # Open one output file, remember it for cleanup, write its header.
        handle = open(outputBedGraphPrefix + suffix, 'w')
        openedTracks.append(handle)
        print("track type=bedGraph name=\"" + bedGraphInfo + label +
              "\" description=\"" + description + "\"",
              file=handle)
        return handle

    try:
        fileBedGraphRatesPlus = openTrack(
            "_TC_rates_genomewide.bedGraph", " tc-conversions",
            "# T->C conversions / # reads on T per position genome-wide")
        fileBedGraphRatesMinus = openTrack(
            "_AG_rates_genomewide.bedGraph", " ag-conversions",
            "# A->G conversions / # reads on A per position genome-wide")
        fileBedGraphCoveragePlus = openTrack(
            "_coverage_plus_genomewide.bedGraph", " plus-strand coverage",
            "# Reads on plus strand genome-wide")
        fileBedGraphCoverageMinus = openTrack(
            "_coverage_minus_genomewide.bedGraph", " minus-strand coverage",
            "# Reads on minus strand genome-wide")
        fileBedGraphTCConversions = openTrack(
            "_TC_conversions_genomewide.bedGraph", " T->C conversions",
            "# T->C conversions on plus strand genome-wide")
        fileBedGraphAGConversions = openTrack(
            "_AG_conversions_genomewide.bedGraph", " A->G conversions",
            "# A->G conversions on minus strand genome-wide")
        fileBedGraphT = openTrack(
            "_coverage_T_genomewide.bedGraph", " T-coverage",
            "# Plus-strand reads on Ts genome-wide")
        fileBedGraphA = openTrack(
            "_coverage_A_genomewide.bedGraph", " A-coverage",
            "# Minus-strand reads on As genome-wide")

        for chromosome in chromosomes:

            chrLength = testFile.getChromosomeLength(chromosome)

            # Per-position counters.  T/A coverage is derived on the fly in
            # the output loop, so no chromosome-sized list is kept for it.
            tcCount = [0] * chrLength
            agCount = [0] * chrLength

            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            readIterator = testFile.readsInChromosome(chromosome, minBaseQual,
                                                      conversionThreshold)

            for read in readIterator:
                # Non-TC reads keep their coverage but lose all conversions.
                if (not read.isTcRead):
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(isReverse)
                            and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < chrLength):
                        if isReverse:
                            agCount[mismatch.referencePosition] += 1
                        else:
                            tcCount[mismatch.referencePosition] += 1

                # Clamp the read span to the chromosome instead of testing
                # every index.
                strandCoverage = coverageMinus if isReverse else coveragePlus
                for i in xrange(max(0, read.startRefPos),
                                min(chrLength, read.endRefPos)):
                    strandCoverage[i] += 1

            # Run-length encoding state: value and start of the current run
            # of each track.
            # NOTE(review): a run is only written when the value changes, so
            # the last run of every chromosome is never emitted -- confirm
            # this is acceptable.
            prevCoveragePlus = 0
            prevCoveragePlusPos = 0
            prevCoverageMinus = 0
            prevCoverageMinusPos = 0
            prevTCConversionRate = 0
            prevTCConversionRatePos = 0
            prevAGConversionRate = 0
            prevAGConversionRatePos = 0
            prevTCConversions = 0
            prevTCConversionPos = 0
            prevAGConversions = 0
            prevAGConversionPos = 0
            prevTCoverage = 0
            prevTCoveragePos = 0
            prevACoverage = 0
            prevACoveragePos = 0

            for pos in xrange(0, chrLength):
                plusDepth = coveragePlus[pos]
                minusDepth = coverageMinus[pos]

                if prevCoveragePlus != plusDepth:
                    print(chromosome + "\t" + str(prevCoveragePlusPos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevCoveragePlus),
                          file=fileBedGraphCoveragePlus)
                    prevCoveragePlus = plusDepth
                    prevCoveragePlusPos = pos
                if prevCoverageMinus != minusDepth:
                    print(chromosome + "\t" + str(prevCoverageMinusPos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevCoverageMinus),
                          file=fileBedGraphCoverageMinus)
                    prevCoverageMinus = minusDepth
                    prevCoverageMinusPos = pos

                tCoverage = 0
                aCoverage = 0

                if plusDepth > 0 or minusDepth > 0:
                    # One reference lookup serves both strands.
                    # NOTE(review): pysam's fetch() start is 0-based, so this
                    # reads the base at index pos + 1; whether that offset
                    # matches the read coordinates is not visible here.
                    base = ref.fetch(reference=chromosome,
                                     start=pos + 1,
                                     end=pos + 2).upper()
                    if plusDepth > 0 and base == "T":
                        tCoverage = plusDepth
                    if minusDepth > 0 and base == "A":
                        aCoverage = minusDepth

                if prevTCoverage != tCoverage:
                    print(chromosome + "\t" + str(prevTCoveragePos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevTCoverage),
                          file=fileBedGraphT)
                    prevTCoverage = tCoverage
                    prevTCoveragePos = pos

                if prevACoverage != aCoverage:
                    print(chromosome + "\t" + str(prevACoveragePos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevACoverage),
                          file=fileBedGraphA)
                    prevACoverage = aCoverage
                    prevACoveragePos = pos

                if prevTCConversions != tcCount[pos]:
                    print(chromosome + "\t" + str(prevTCConversionPos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevTCConversions),
                          file=fileBedGraphTCConversions)
                    prevTCConversions = tcCount[pos]
                    prevTCConversionPos = pos

                if prevAGConversions != agCount[pos]:
                    print(chromosome + "\t" + str(prevAGConversionPos + 1) +
                          "\t" + str(pos + 1) + "\t" + str(prevAGConversions),
                          file=fileBedGraphAGConversions)
                    prevAGConversions = agCount[pos]
                    prevAGConversionPos = pos

                TCconversionRate = 0
                if plusDepth > 0 and plusDepth >= coverageCutoff:
                    TCconversionRate = float(tcCount[pos]) / float(plusDepth)

                AGconversionRate = 0
                if minusDepth > 0 and minusDepth >= coverageCutoff:
                    AGconversionRate = float(agCount[pos]) / float(minusDepth)

                if prevTCConversionRate != TCconversionRate:
                    print(chromosome + "\t" +
                          str(prevTCConversionRatePos + 1) + "\t" +
                          str(pos + 1) + "\t" + str(prevTCConversionRate),
                          file=fileBedGraphRatesPlus)
                    prevTCConversionRate = TCconversionRate
                    prevTCConversionRatePos = pos

                if prevAGConversionRate != AGconversionRate:
                    print(chromosome + "\t" +
                          str(prevAGConversionRatePos + 1) + "\t" +
                          str(pos + 1) + "\t" + str(prevAGConversionRate),
                          file=fileBedGraphRatesMinus)
                    prevAGConversionRate = AGconversionRate
                    prevAGConversionRatePos = pos
    finally:
        # Release every output file and the reference, even on failure.
        for handle in openedTracks:
            handle.close()
        ref.close()
Пример #5
0
def computeTconversions(ref,
                        bed,
                        snpsFile,
                        bam,
                        maxReadLength,
                        minQual,
                        outputCSV,
                        outputBedgraphPlus,
                        outputBedgraphMinus,
                        conversionThreshold,
                        log,
                        mle=False):
    """Count T->C conversions per BED interval and write the count tables.

    For every interval of ``bed`` the overlapping reads are collected, the
    per-position coverage and T->C counts are accumulated, and one
    ``SlamSeqInterval`` line is written to ``outputCSV``.  Intervals without
    reads on their own strand are written with all-zero statistics.
    Per-position conversion rates of covered T (plus strand) / A (minus
    strand) positions are written to ``outputBedgraphPlus`` /
    ``outputBedgraphMinus``.

    With ``mle=True`` a second ``*_perread.tsv`` table holding the per-read
    T counts and T->C counts is written and handed to the
    ``compute_conversion_rate_mle`` plotter script.

    Args:
        ref: reference FASTA path.
        bed: BED file with stranded intervals.
        snpsFile: path handed to ``SNPtools.SNPDictionary``.
        bam: filtered BAM path.
        maxReadLength, minQual, conversionThreshold: forwarded to
            ``readInRegion``.
        outputCSV: path of the count table.
        outputBedgraphPlus, outputBedgraphMinus: per-strand bedGraph paths.
        log: file-like object receiving the annotation-MD5 warning.
        mle: also produce the per-read table and run the MLE script.

    Raises:
        RuntimeError: on a BAM version mismatch, an interval without strand,
            or an interval with a negative start coordinate.
    """
    referenceFile = pysam.FastaFile(ref)

    sampleInfo = getSampleInfo(bam)

    slamseqInfo = SlamSeqInfo(bam)
    readNumber = slamseqInfo.FilteredReads

    bedMD5 = md5(bed)

    def writeHeader(handle):
        # Header lines shared by the count table and the per-read table.
        print("#slamdunk v" + __version__,
              __count_version__,
              "sample info:",
              sampleInfo.Name,
              sampleInfo.ID,
              sampleInfo.Type,
              sampleInfo.Time,
              sep="\t",
              file=handle)
        print("#annotation:",
              os.path.basename(bed),
              bedMD5,
              sep="\t",
              file=handle)
        print(SlamSeqInterval.Header, file=handle)

    # Keyed by (chromosome, position, strand).  A tuple key avoids
    # re-splitting a ":"-joined string, which breaks for chromosome names
    # that themselves contain ":".
    conversionBedGraph = {}

    fileTest = None
    fileCSV = None
    try:
        if (mle):
            fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
            fileTest = open(fileNameTest, 'w')
            writeHeader(fileTest)

        fileCSV = open(outputCSV, 'w')
        writeHeader(fileCSV)

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, ref, snps)
        if not testFile.bamVersion == __bam_version__:
            raise RuntimeError("Wrong filtered BAM file version detected (" +
                               testFile.bamVersion + "). Expected version " +
                               __bam_version__ +
                               ". Please rerun slamdunk filter.")

        if slamseqInfo.AnnotationMD5 != bedMD5:
            print(
                "Warning: MD5 checksum of annotation (" + bedMD5 +
                ") does not matched MD5 in filtered BAM files (" +
                slamseqInfo.AnnotationMD5 +
                "). Most probably the annotation filed changed after the filtered BAM files were created.",
                file=log)

        # Built once instead of materialising the name list for every UTR.
        referenceNames = set(referenceFile.references)

        for utr in BedIterator(bed):
            Tcontent = 0
            # Default (all-zero) records, printed when the UTR has no reads
            # on its own strand.
            # NOTE(review): only slamSeqUtr gets its T content filled in
            # below; the default MLE record keeps 0 -- confirm intended.
            slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                         utr.strand, utr.name, Tcontent, 0, 0,
                                         0, 0, 0, 0, 0)
            slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start,
                                            utr.stop, utr.strand, utr.name,
                                            Tcontent, 0, 0, 0, 0, 0, 0, 0)
            if (not utr.hasStrand()):
                raise RuntimeError(
                    "Input BED file does not contain stranded intervals.")

            if utr.start < 0:
                # str() is required: concatenating the interval object
                # itself raised TypeError and hid this message.
                raise RuntimeError(
                    "Negativ start coordinate found. Please check the following entry in your BED file: "
                    + str(utr))

            # Retrieve reference sequence to count Ts (As on minus strand)
            if (utr.chromosome in referenceNames):
                refSeq = referenceFile.fetch(reference=utr.chromosome,
                                             start=utr.start,
                                             end=utr.stop).upper()
                if (utr.strand == "-"):
                    Tcontent = refSeq.count("A")
                else:
                    Tcontent = refSeq.count("T")

                slamSeqUtr._Tcontent = Tcontent

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual,
                                                 conversionThreshold)

            utrLength = utr.getLength()

            tcCountUtr = [0] * utrLength
            coverageUtr = [0] * utrLength

            # Per-read T counts (n) and T->C counts (k) for the MLE table.
            tInReads = []
            tcInRead = []

            countFwd = 0
            tcCountFwd = 0
            countRev = 0
            tCountRev = 0

            multiMapFwd = 0
            multiMapRev = 0

            for read in readIterator:

                # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                if (not read.isTcRead):
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                if isReverse:
                    countRev += 1
                    if read.tcCount > 0:
                        tCountRev += 1
                    if read.isMultimapper:
                        multiMapRev += 1
                else:
                    countFwd += 1
                    if read.tcCount > 0:
                        tcCountFwd += 1
                    if read.isMultimapper:
                        multiMapFwd += 1

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(isReverse)
                            and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utrLength):
                        tcCountUtr[mismatch.referencePosition] += 1

                testN = read.getTcount()
                testk = 0
                for mismatch in read.mismatches:
                    if (mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utrLength):
                        if (mismatch.isT(isReverse)):
                            testN += 1
                        if (mismatch.isTCMismatch(isReverse)):
                            testk += 1
                tInReads.append(testN)
                tcInRead.append(testk)

                # Clamp the read span to the UTR instead of testing each index.
                for i in xrange(max(0, read.startRefPos),
                                min(utrLength, read.endRefPos)):
                    coverageUtr[i] += 1

            if ((utr.strand == "+" and countFwd > 0)
                    or (utr.strand == "-" and countRev > 0)):
                tcRateUtr = [
                    x * 100.0 / y if y > 0 else 0
                    for x, y in zip(tcCountUtr, coverageUtr)
                ]

                readCount = countFwd
                tcReadCount = tcCountFwd
                multiMapCount = multiMapFwd

                if (utr.strand == "-"):
                    readCount = countRev
                    tcReadCount = tCountRev
                    multiMapCount = multiMapRev

                if ((utr.strand == "-" and countFwd > countRev)
                        or (utr.strand == "+" and countRev > countFwd)):
                    # The counts are ints; without str() this raised
                    # TypeError instead of printing the warning.
                    print(
                        "Warning: " + utr.name + " is located on the " +
                        utr.strand +
                        " strand but read counts are higher for the opposite strand (fwd: "
                        + str(countFwd) + ", rev: " + str(countRev) + ")",
                        file=sys.stderr)

                refSeq = readIterator.getRefSeq()

                # Inside this branch the strand is either "+" or "-".
                targetBase = "A" if utr.strand == "-" else "T"

                # Get number of covered Ts/As in the UTR and compute average conversion rate for all covered Ts/As
                coveredTcount = 0
                avgConversationRate = 0
                coveredPositions = 0
                # Get number of reads on T positions and number of reads with T->C conversions on T positions
                coverageOnTs = 0
                conversionsOnTs = 0

                for position in xrange(0, len(coverageUtr)):

                    if (coverageUtr[position] > 0
                            and refSeq[position] == targetBase):
                        coveredTcount += 1
                        avgConversationRate += tcRateUtr[position]

                        coverageOnTs += coverageUtr[position]
                        conversionsOnTs += tcCountUtr[position]
                        conversionBedGraph[(utr.chromosome,
                                            utr.start + position,
                                            str(utr.strand)
                                            )] = tcRateUtr[position]
                    if (coverageUtr[position] > 0):
                        coveredPositions += 1

                if (coveredTcount > 0):
                    avgConversationRate = avgConversationRate / coveredTcount
                else:
                    avgConversationRate = 0

                # reads per million mapped to the UTR
                readsCPM = 0
                if (readNumber > 0):
                    readsCPM = readCount * 1000000.0 / readNumber

                # Convert to SlamSeqInterval and print
                conversionRate = 0
                if (coverageOnTs > 0):
                    conversionRate = float(conversionsOnTs) / float(
                        coverageOnTs)
                slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start,
                                             utr.stop, utr.strand, utr.name,
                                             Tcontent, readsCPM, coverageOnTs,
                                             conversionsOnTs, conversionRate,
                                             readCount, tcReadCount,
                                             multiMapCount)
                slamSeqUtrMLE = SlamSeqInterval(
                    utr.chromosome, utr.start, utr.stop, utr.strand, utr.name,
                    Tcontent, readsCPM, coverageOnTs, conversionsOnTs,
                    conversionRate, ",".join(str(x) for x in tInReads),
                    ",".join(str(x) for x in tcInRead), multiMapCount)

            print(slamSeqUtr, file=fileCSV)
            if (mle):
                print(slamSeqUtrMLE, file=fileTest)
    finally:
        # Release all handles even when validation or counting fails.
        if fileCSV is not None:
            fileCSV.close()
        if fileTest is not None:
            fileTest.close()
        referenceFile.close()

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus:
        for (chromosome, position, strand) in conversionBedGraph:
            target = fileBedgraphPlus if strand == "+" else fileBedgraphMinus
            print(chromosome,
                  position,
                  position + 1,
                  conversionBedGraph[(chromosome, position, strand)],
                  file=target)

    if (mle):
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        # NOTE(review): unlike the other callR() uses in this file, no log
        # is passed and a shell redirect is embedded in the command string --
        # confirm callR supports both.
        callR(
            getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest +
            " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")
Пример #6
0
def computeSNPMaskedRates(ref,
                          bed,
                          snpsFile,
                          bam,
                          maxReadLength,
                          minQual,
                          coverageCutoff,
                          variantFraction,
                          outputCSV,
                          outputPDF,
                          strictTCs,
                          log,
                          printOnly=False,
                          verbose=True,
                          force=False):
    """Compare per-UTR T->C read counts with and without SNP masking.

    For every interval of ``bed`` one tab-separated line is written to
    ``outputCSV``: UTR name, number of reads, number of reads with a raw
    T>C (A>G on reverse reads) mismatch inside the UTR, number of reads
    with a mismatch accepted by ``isTCMismatch`` inside the UTR, and a 0/1
    flag telling whether those two counts differ.  The table is then
    plotted to ``outputPDF`` with the ``SNPeval`` plotter.  Each of the two
    steps is skipped when ``checkStep`` says so.

    Args:
        ref: reference passed on to ``SlamSeqBamFile``.
        bed: BED file with stranded intervals.
        snpsFile: path handed to ``SNPtools.SNPDictionary``.
        bam: BAM path.
        maxReadLength, minQual: forwarded to ``readInRegion``.
        coverageCutoff, variantFraction: forwarded to the plotter as
            ``-c`` and ``-v``.
        outputCSV: table path.
        outputPDF: plot path.
        strictTCs: when true, reads that are not ``isTcRead`` have their
            mismatches cleared before counting.
        log: file-like object for skip messages; also passed to ``callR``.
        printOnly: forwarded to ``callR`` as ``dry``.
        verbose: forwarded to ``callR``.
        force: forwarded to ``checkStep``.

    Raises:
        RuntimeError: for an interval without strand or with a negative
            start coordinate.
    """
    if (not checkStep([bam, ref], [outputCSV], force)):
        print("Skipped computing T->C per UTR with SNP masking for file " +
              bam,
              file=log)
    else:
        # The context manager closes the table even if a UTR is rejected.
        with open(outputCSV, 'w') as fileCSV:

            snps = SNPtools.SNPDictionary(snpsFile)
            snps.read()

            # Go through one chr after the other
            testFile = SlamSeqBamFile(bam, ref, snps)

            for utr in BedIterator(bed):

                if (not utr.hasStrand()):
                    raise RuntimeError(
                        "Input BED file does not contain stranded intervals.")

                if utr.start < 0:
                    # str() is required: concatenating the interval object
                    # itself raised TypeError and hid this message.
                    raise RuntimeError(
                        "Negativ start coordinate found. Please check the following entry in your BED file: "
                        + str(utr))

                readIterator = testFile.readInRegion(utr.chromosome,
                                                     utr.start, utr.stop,
                                                     utr.strand,
                                                     maxReadLength, minQual)

                utrLength = utr.getLength()

                unmaskedTCCount = 0
                maskedTCCount = 0
                readCount = 0

                for read in readIterator:

                    # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                    if (not read.isTcRead and strictTCs):
                        read.tcCount = 0
                        read.mismatches = []
                        read.conversionRates = 0.0
                        read.tcRate = 0.0

                    isReverse = read.direction == ReadDirection.Reverse

                    isTC = False
                    isTrueTC = False

                    for mismatch in read.mismatches:
                        insideUtr = (mismatch.referencePosition >= 0 and
                                     mismatch.referencePosition < utrLength)

                        if (mismatch.isTCMismatch(isReverse) and insideUtr):
                            isTrueTC = True

                        # Raw base comparison, independent of isTCMismatch().
                        if isReverse:
                            unmasked = (mismatch.referenceBase == "A"
                                        and mismatch.readBase == "G")
                        else:
                            unmasked = (mismatch.referenceBase == "T"
                                        and mismatch.readBase == "C")

                        if (unmasked and insideUtr):
                            isTC = True

                    readCount += 1

                    if (isTC):
                        unmaskedTCCount += 1

                    if (isTrueTC):
                        maskedTCCount += 1

                containsSNP = 0

                if (unmaskedTCCount != maskedTCCount):
                    containsSNP = 1

                print(utr.name + "\t" + str(readCount) + "\t" +
                      str(unmaskedTCCount) + "\t" + str(maskedTCCount) +
                      "\t" + str(containsSNP),
                      file=fileCSV)

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " +
              str(coverageCutoff) + " -v " + str(variantFraction) + " -o " +
              outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
Пример #7
0
def tcPerUtr(referenceFile,
             utrBed,
             bam,
             minQual,
             maxReadLength,
             outputCSV,
             outputPDF,
             snpsFile,
             log,
             printOnly=False,
             verbose=True,
             force=False):
    """Tabulate mismatches over a ``utrNormFactor``-wide window of each UTR.

    For "+" UTRs mismatch positions are taken from the last
    ``utrNormFactor`` positions of the interval, otherwise from the first
    ones.  Mismatch and coverage counts are kept separately for forward
    and reverse reads, and the reverse-read columns are flipped before
    writing.  The table written to ``outputCSV`` has one row per window
    position with six tab-separated columns: other mismatches (fwd, rev),
    T->C mismatches (fwd, rev) and read coverage (fwd, rev).  The table is
    then plotted to ``outputPDF`` via ``callR``.  Each of the two steps is
    skipped when ``checkStep`` says so.
    """
    if checkStep([bam, referenceFile], [outputCSV], force):

        handledUtrs = 0

        coverageFwd = [0] * utrNormFactor
        coverageRev = [0] * utrNormFactor

        tcTotalRev = [0] * utrNormFactor
        tcTotalFwd = [0] * utrNormFactor

        otherTotalRev = [0] * utrNormFactor
        otherTotalFwd = [0] * utrNormFactor

        snpDict = SNPtools.SNPDictionary(snpsFile)
        snpDict.read()

        # One UTR at a time
        bamReader = SlamSeqBamFile(bam, referenceFile, snpDict)

        for utr in BedIterator(utrBed):

            reads = bamReader.readInRegion(utr.chromosome, utr.start,
                                           utr.stop, utr.strand,
                                           maxReadLength, minQual)

            utrTcFwd = [0] * utrNormFactor
            utrOtherFwd = [0] * utrNormFactor
            utrTcRev = [0] * utrNormFactor
            utrOtherRev = [0] * utrNormFactor

            for read in reads:

                utrLen = utr.getLength()

                readTc = [0] * utrNormFactor
                readOther = [0] * utrNormFactor

                for mismatch in read.mismatches:

                    slot = mismatch.referencePosition

                    if utr.strand == "+":
                        # Keep only the last utrNormFactor positions and
                        # shift them into window coordinates.
                        if not (utrLen - utrNormFactor <= slot < utrLen):
                            continue
                        slot = utrNormFactor - (utrLen - slot)
                    elif not (0 <= slot < min(utrLen, utrNormFactor)):
                        # Keep only the first utrNormFactor positions.
                        continue

                    if mismatch.isTCMismatch(
                            read.direction == ReadDirection.Reverse):
                        readTc[slot] += 1
                    else:
                        readOther[slot] += 1

                if read.direction != ReadDirection.Reverse:

                    utrTcFwd = sumLists(utrTcFwd, readTc)
                    utrOtherFwd = sumLists(utrOtherFwd, readOther)

                    # Clamp the read span to the tail window of the UTR.
                    windowStart = utrLen - utrNormFactor
                    first = min(utrLen, max(windowStart, read.startRefPos))
                    last = min(utrLen, max(windowStart, read.endRefPos))

                    for i in range(first, last):
                        coverageFwd[utrNormFactor - (utrLen - i)] += 1

                else:

                    utrTcRev = sumLists(utrTcRev, readTc)
                    utrOtherRev = sumLists(utrOtherRev, readOther)

                    # Clamp the read span to the head window of the UTR.
                    windowEnd = min(utrLen, utrNormFactor)
                    first = max(0, min(windowEnd, read.startRefPos))
                    last = max(0, min(windowEnd, read.endRefPos))

                    for i in range(first, last):
                        coverageRev[i] += 1

            tcTotalFwd = sumLists(tcTotalFwd, utrTcFwd)
            otherTotalFwd = sumLists(otherTotalFwd, utrOtherFwd)

            tcTotalRev = sumLists(tcTotalRev, utrTcRev)
            otherTotalRev = sumLists(otherTotalRev, utrOtherRev)

            handledUtrs += 1

            if verbose and handledUtrs % 10000 == 0:
                print("Handled " + str(handledUtrs) + " UTRs.", file=log)

        # Reverse-read columns are written back to front.
        flippedOtherRev = otherTotalRev[::-1]
        flippedTcRev = tcTotalRev[::-1]
        flippedCoverageRev = coverageRev[::-1]

        table = open(outputCSV, "w")

        print("# slamdunk tcperutr v" + __version__, file=table)

        for row in range(0, utrNormFactor):
            print(otherTotalFwd[row],
                  flippedOtherRev[row],
                  tcTotalFwd[row],
                  flippedTcRev[row],
                  coverageFwd[row],
                  flippedCoverageRev[row],
                  sep='\t',
                  file=table)
        table.close()

    else:
        print("Skipped computing T->C per UTR position for file " + bam,
              file=log)

    if checkStep([outputCSV], [outputPDF], force):
        plotCommand = (getPlotter("conversion_per_read_position") +
                       " -u -i " + outputCSV + " -o " + outputPDF)
        callR(plotCommand, log, dry=printOnly, verbose=verbose)
    else:
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
Пример #8
0
def tcPerReadPos(referenceFile,
                 bam,
                 minQual,
                 maxReadLength,
                 outputCSV,
                 outputPDF,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):
    """Tabulate mismatches per read position and plot the result.

    Step 1 (skipped when checkStep reports outputCSV as up to date): every
    read of every chromosome in ``bam`` is visited and its mismatches are
    binned by ``mismatch.readPosition``, separately for forward and reverse
    reads. The table written to ``outputCSV`` has one header line followed by
    ``maxReadLength`` tab-separated rows with the columns

        other-mismatches fwd, other-mismatches rev,
        T>C mismatches fwd,   T>C mismatches rev,
        read coverage fwd,    read coverage rev

    Note that the "all*" accumulators only receive mismatches that are NOT
    classified as T>C by ``isTCMismatch``.

    Step 2 (skipped when checkStep reports outputPDF as up to date): the
    "conversion_per_read_position" plotter is invoked through callR.

    Args:
        referenceFile: reference passed to SlamSeqBamFile.
        bam: alignment file to analyse.
        minQual: forwarded to ``readsInChromosome``.
        maxReadLength: number of read positions tracked. A read (or a
            mismatch position) beyond this length raises IndexError.
        outputCSV: path of the table to write.
        outputPDF: path of the plot to produce.
        snpsFile: input for SNPtools.SNPDictionary.
        log: file-like object receiving progress messages.
        printOnly: passed to callR as ``dry``.
        verbose: passed to callR.
        force: passed to checkStep.
    """

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
    else:

        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):

                isReverse = read.direction == ReadDirection.Reverse

                # Per-read tallies, folded into the running totals below.
                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if isReverse:
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)

                    for i in range(query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)

                    for i in range(query_length):
                        totalReadCountFwd[i] += 1

        # The context manager guarantees the handle is released even if a
        # write fails half-way through the table.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)

            for i in range(maxReadLength):
                print(allPerPosFwd[i],
                      allPerPosRev[i],
                      tcPerPosFwd[i],
                      tcPerPosRev[i],
                      totalReadCountFwd[i],
                      totalReadCountRev[i],
                      sep='\t',
                      file=foTC)

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per reads position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV +
              " -o " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
Пример #9
0
def computeTconversionsAll(
        ref,
        snpsFile,
        bam,
        outputBedgraphPlus,
        outputBedgraphPlusNew,
        outputBedgraphMinus,
        outputBedgraphMinusNew,
        conversionThreshold,
        minQual,
        is_inverse,
        log,
):
    """Write per-base fragment coverage tracks for all and for T>C fragments.

    Reads are consumed in query-name order (the BAM is name-sorted with
    ``samtools sort -n`` first if its header does not already say so). A
    fragment is a first-in-pair read immediately followed by a second-in-pair
    read with the same query name; unpaired reads, reads with an unmapped
    mate and duplicates are ignored. Each fragment adds 1 to every position
    between the smallest start and the largest end of its two mates, on the
    strand given by read 1 (or read 2 when ``is_inverse`` is true). Fragments
    where either mate is flagged ``isTcRead`` are additionally counted in the
    "new" tracks.

    All four tracks are scaled to counts per million fragments and written
    as one line per base: ``chrom start start+1 value``.

    Args:
        ref: reference passed to SlamSeqIter.
        snpsFile: input for SNPtools.SNPDictionary.
        bam: alignment file to analyse.
        outputBedgraphPlus: output path, all fragments on the forward strand.
        outputBedgraphPlusNew: output path, T>C fragments, forward strand.
        outputBedgraphMinus: output path, all fragments on the reverse strand.
        outputBedgraphMinusNew: output path, T>C fragments, reverse strand.
        conversionThreshold: forwarded to SlamSeqIter.
        minQual: forwarded to SlamSeqIter.
        is_inverse: take the strand from read 2 instead of read 1.
        log: file-like object receiving the summary (also handed to ``run``).

    Raises:
        KeyError: if a fragment maps to a chromosome that is not in the
            built-in length table below.
    """
    # NOTE(review): hard-coded chromosome lengths; they look like the
    # S. cerevisiae reference - confirm before using another genome.
    chrom_sizes = (
        ('chrI', 230218),
        ('chrII', 813184),
        ('chrIII', 316620),
        ('chrIV', 1531933),
        ('chrIX', 439888),
        ('chrM', 85779),
        ('chrV', 576874),
        ('chrVI', 270161),
        ('chrVII', 1090940),
        ('chrVIII', 562643),
        ('chrX', 745751),
        ('chrXI', 666816),
        ('chrXII', 1078177),
        ('chrXIII', 924431),
        ('chrXIV', 784333),
        ('chrXV', 1091291),
        ('chrXVI', 948066),
    )

    def new_tracks():
        # One independent zeroed float32 coverage array per chromosome.
        tracks = {}
        for name, size in chrom_sizes:
            tracks[name] = np.zeros(size).astype('float32')
        return tracks

    def to_bed_graph(c, data, bedgraph, rn):
        # Scale to counts per million fragments (in place). With no fragment
        # counted the track is all zeros; skip the division so the file does
        # not end up full of NaN from 0/0.
        if rn:
            data /= rn
            data *= 1000000.0
        for i, d in enumerate(data):
            print(c, i, i + 1, d, file=bedgraph)

    chroms_fw = new_tracks()
    chroms_bw = new_tracks()
    chroms_fw_new = new_tracks()
    chroms_bw_new = new_tracks()
    readNumber, positiveCount, negativeCount, positiveCountNew, negativeCountNew = 0, 0, 0, 0, 0

    # Only the header is needed to decide whether a name-sort is required;
    # close this handle again right away.
    with pysam.AlignmentFile(bam, "rb") as headerProbe:
        nameSorted = headerProbe.header['HD']['SO'] == 'queryname'

    if nameSorted:
        sbam = bam
    else:
        # Sort bam file
        sbam = replaceExtension(bam, '.bam', '_sorted')
        if not os.path.exists(sbam):
            run(
                'samtools sort -n %s -o %s' % (bam, sbam),
                log
            )

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    with pysam.AlignmentFile(sbam, "rb") as bamFile:
        seqIter = SlamSeqIter(bamFile, ref, snps, conversionThreshold, minQual)
        read1 = None
        read2 = None
        for read in seqIter:
            if not read.isPaired or read.unmappedMate or read.duplicate:
                continue
            if read.isSecondRead:
                read2 = read
            else:
                # A first-in-pair read opens a new fragment; wait for its mate.
                read1 = read
                read2 = None
                continue
            if read1 is None or read2 is None or read1.queryName != read2.queryName:
                continue
            readNumber += 1
            chrom = read1.chromosome
            start = min(read1.startRefPos, read2.startRefPos)
            # The fragment ends at the furthest end of EITHER mate.
            end = max(read1.endRefPos, read2.endRefPos)
            is_tc_read = read1.isTcRead or read2.isTcRead
            direction_read = read2 if is_inverse else read1
            if direction_read.direction == ReadDirection.Forward:
                positiveCount += 1
                chroms_fw[chrom][start:end] += 1
                if is_tc_read:
                    positiveCountNew += 1
                    chroms_fw_new[chrom][start:end] += 1
            else:
                negativeCount += 1
                chroms_bw[chrom][start:end] += 1
                if is_tc_read:
                    negativeCountNew += 1
                    chroms_bw_new[chrom][start:end] += 1

    print("Total reads: %s\n"
          "Positive reads: %s\n"
          "Positive reads new: %s\n"
          "Negative reads: %s\n"
          "Negative reads new: %s" %
          (readNumber, positiveCount, positiveCountNew, negativeCount, negativeCountNew),
          file=log)

    # Context managers make sure all four outputs are flushed and closed even
    # if writing one of them fails.
    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphPlusNew, 'w') as fileBedgraphPlusNew, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus, \
            open(outputBedgraphMinusNew, 'w') as fileBedgraphMinusNew:
        for chrom in chroms_fw:
            to_bed_graph(chrom, chroms_fw[chrom], fileBedgraphPlus, readNumber)
            to_bed_graph(chrom, chroms_bw[chrom], fileBedgraphMinus, readNumber)
            to_bed_graph(chrom, chroms_fw_new[chrom], fileBedgraphPlusNew, readNumber)
            to_bed_graph(chrom, chroms_bw_new[chrom], fileBedgraphMinusNew, readNumber)