Python SNPtools примеры использования

Язык программирования: Python

Пространство имен/Пакет: slamdunk.utils

Класс/Тип: SNPtools

Примеров на hotexamples.com: 9

Python SNPtools - 9 примеров найдено. Это лучшие примеры Python кода для slamdunk.utils.SNPtools, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

SNPDictionary(9)

Основные методы

SNPDictionary (9)

Пример #1

Показать файл

Файл: dump.py Проект: selsarrag/slamdunk

def dumpReadInfo(referenceFile,
                 bam,
                 minQual,
                 outputCSV,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):
    """Dump every read of ``bam`` to ``outputCSV`` through a SlamSeqWriter.

    The step is skipped (with a message printed to ``log``) when
    ``checkStep`` returns a false value for the given inputs and output.

    Args:
        referenceFile: Reference passed to ``checkStep`` and SlamSeqBamFile.
        bam: BAM file whose reads are dumped.
        minQual: Accepted for interface compatibility; not used here.
        outputCSV: Destination handed to SlamSeqWriter.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        log: File-like object receiving the "skipped" message.
        printOnly: Accepted for interface compatibility; not used here.
        verbose: Accepted for interface compatibility; not used here.
        force: Forwarded to ``checkStep``.
    """
    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chromosome after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Close the writer even if reading the BAM fails half-way.
        outputFile.close()

Пример #2

Показать файл

def getConversionRateFromBam(bam, ref, chromosome, start, end, strand):
    """Print the read count and mean per-read conversion rate of a region.

    For every read returned by ``readInRegion`` the per-read rate is
    ``tcCount / tCount`` (0 when the read has ``tCount == 0``); the mean
    over all reads is printed to stdout. No SNP file is used
    (``SNPDictionary(None)``).

    Args:
        bam: BAM file handed to SlamSeqBamFile.
        ref: Reference handed to SlamSeqBamFile.
        chromosome, start, end, strand: Region forwarded to
            ``readInRegion``.
    """
    testFile = SlamSeqBamFile(bam, ref, SNPtools.SNPDictionary(None))

    sumConversionRate = 0
    readCount = 0
    # NOTE(review): 100 is presumably the maximum read length expected by
    # readInRegion -- confirm against SlamSeqBamFile.
    readIterator = testFile.readInRegion(chromosome, start, end, strand, 100)

    for read in readIterator:
        # Reads without any T contribute a rate of 0 but are still counted.
        if (read.tCount > 0):
            sumConversionRate += read.tcCount * 1.0 / read.tCount
        readCount += 1

    # An empty region used to raise ZeroDivisionError; report 0.0 instead.
    avgConversionRate = 0.0
    if (readCount > 0):
        avgConversionRate = sumConversionRate / readCount

    print("Read count: " + str(readCount))
    print("Avg. conversion rate: " + str(avgConversionRate))

Пример #3

Показать файл

Файл: tcounter.py Проект: selsarrag/slamdunk

def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual,
                             outputBAMPrefix, conversionThreshold, log):
    """Split a BAM file into T->C reads and background reads.

    A first pass over all chromosomes collects the names of reads flagged
    ``isTcRead`` by SlamSeqBamFile. A second pass over the raw BAM writes
    every read whose ``query_name`` was collected to
    ``<outputBAMPrefix>_TCReads.bam`` and all other reads to
    ``<outputBAMPrefix>_backgroundReads.bam``. Both outputs are indexed
    with ``pysamIndex`` afterwards.

    Args:
        referenceFile: Reference FASTA path.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        bam: Input BAM file.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputBAMPrefix: Prefix of the two output BAM files.
        conversionThreshold: Forwarded to ``readsInChromosome``.
        log: Accepted for interface compatibility; not used here.
    """
    # The FASTA handle is never read from in this function. It is still
    # opened (as before) so its side effects are kept, but it is now
    # closed right away instead of being leaked.
    ref = pysam.FastaFile(referenceFile)
    ref.close()

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    with pysam.AlignmentFile(bam, "rb") as samFile:

        chromosomes = testFile.getChromosomes()

        with pysam.AlignmentFile(backgroundReadFileName, "wb",
                                 template=samFile) as backgroundReadFile:
            with pysam.AlignmentFile(tcReadFileName, "wb",
                                     template=samFile) as tcReadFile:

                # Pass 1: remember the names of all T->C reads.
                tcReadNames = set()
                for chromosome in chromosomes:
                    readIterator = testFile.readsInChromosome(
                        chromosome, minBaseQual, conversionThreshold)
                    for read in readIterator:
                        if (read.isTcRead):
                            tcReadNames.add(read.name)

                # Pass 2: route every alignment by read name.
                for read in samFile.fetch():
                    if read.query_name in tcReadNames:
                        tcReadFile.write(read)
                    else:
                        backgroundReadFile.write(read)

    # Index only after both outputs have been closed.
    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)

Пример #4

Показать файл

Файл: tcounter.py Проект: selsarrag/slamdunk

def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual,
                              outputBedGraphPrefix, conversionThreshold,
                              coverageCutoff, log):
    """Write genome-wide coverage and conversion tracks as bedGraph files.

    Eight files named ``<outputBedGraphPrefix><suffix>`` are written:
    T->C / A->G conversion rates, plus / minus strand coverage, T->C /
    A->G conversion counts, and plus-strand coverage on reference Ts /
    minus-strand coverage on reference As. Conversions of reads that are
    not flagged ``isTcRead`` are discarded before counting.

    Each track is run-length encoded: a line is written only when the
    value at a position differs from the previous one, covering the run
    that just ended. The run still open when a chromosome ends is not
    written (unchanged from the original implementation).

    Args:
        referenceFile: Reference FASTA path.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        bam: Input BAM file.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputBedGraphPrefix: Prefix of all output files; its basename
            (up to ``_slamdunk_mapped``) is used as the track name.
        conversionThreshold: Forwarded to ``readsInChromosome``.
        coverageCutoff: Minimum strand coverage for a non-zero rate.
        log: Accepted for interface compatibility; not used here.
    """
    ref = pysam.FastaFile(referenceFile)
    openedFiles = []

    try:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        chromosomes = testFile.getChromosomes()

        bedGraphInfo = re.sub("_slamdunk_mapped.*", "",
                              basename(outputBedGraphPrefix))
        print(bedGraphInfo)

        def openTrack(fileSuffix, trackName, description):
            # Open one bedGraph output and write its track header line.
            handle = open(outputBedGraphPrefix + fileSuffix, 'w')
            openedFiles.append(handle)
            print("track type=bedGraph name=\"" + bedGraphInfo + trackName +
                  "\" description=\"" + description + "\"",
                  file=handle)
            return handle

        def writeRun(handle, chromosome, runStart, runEnd, value):
            # Emit one finished run; both coordinates are shifted by +1.
            print(chromosome + "\t" + str(runStart + 1) + "\t" +
                  str(runEnd + 1) + "\t" + str(value),
                  file=handle)

        fileBedGraphRatesPlus = openTrack(
            "_TC_rates_genomewide.bedGraph", " tc-conversions",
            "# T->C conversions / # reads on T per position genome-wide")
        fileBedGraphRatesMinus = openTrack(
            "_AG_rates_genomewide.bedGraph", " ag-conversions",
            "# A->G conversions / # reads on A per position genome-wide")
        fileBedGraphCoveragePlus = openTrack(
            "_coverage_plus_genomewide.bedGraph", " plus-strand coverage",
            "# Reads on plus strand genome-wide")
        fileBedGraphCoverageMinus = openTrack(
            "_coverage_minus_genomewide.bedGraph", " minus-strand coverage",
            "# Reads on minus strand genome-wide")
        fileBedGraphTCConversions = openTrack(
            "_TC_conversions_genomewide.bedGraph", " T->C conversions",
            "# T->C conversions on plus strand genome-wide")
        fileBedGraphAGConversions = openTrack(
            "_AG_conversions_genomewide.bedGraph", " A->G conversions",
            "# A->G conversions on minus strand genome-wide")
        fileBedGraphT = openTrack(
            "_coverage_T_genomewide.bedGraph", " T-coverage",
            "# Plus-strand reads on Ts genome-wide")
        fileBedGraphA = openTrack(
            "_coverage_A_genomewide.bedGraph", " A-coverage",
            "# Minus-strand reads on As genome-wide")

        for chromosome in chromosomes:

            chrLength = testFile.getChromosomeLength(chromosome)

            # Per-position counters for this chromosome. (The original also
            # allocated chromosome-sized tCoverage/aCoverage lists that were
            # overwritten by scalars before ever being read.)
            tcCount = [0] * chrLength
            agCount = [0] * chrLength

            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            readIterator = testFile.readsInChromosome(chromosome, minBaseQual,
                                                      conversionThreshold)

            for read in readIterator:
                # Discard conversions of reads that are not T->C reads
                if (not read.isTcRead):
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(isReverse)
                            and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < chrLength):
                        if isReverse:
                            agCount[mismatch.referencePosition] += 1
                        else:
                            tcCount[mismatch.referencePosition] += 1

                # Clamp the covered interval to the chromosome once instead
                # of testing every position.
                strandCoverage = coverageMinus if isReverse else coveragePlus
                for i in xrange(max(read.startRefPos, 0),
                                min(read.endRefPos, chrLength)):
                    strandCoverage[i] += 1

            # Run-length state: value and start position of the open run
            prevCoveragePlus = 0
            prevCoveragePlusPos = 0
            prevCoverageMinus = 0
            prevCoverageMinusPos = 0
            prevTCConversionRate = 0
            prevTCConversionRatePos = 0
            prevAGConversionRate = 0
            prevAGConversionRatePos = 0
            prevTCConversions = 0
            prevTCConversionPos = 0
            prevAGConversions = 0
            prevAGConversionPos = 0
            prevTCoverage = 0
            prevTCoveragePos = 0
            prevACoverage = 0
            prevACoveragePos = 0

            for pos in xrange(0, chrLength):
                plusDepth = coveragePlus[pos]
                minusDepth = coverageMinus[pos]

                if prevCoveragePlus != plusDepth:
                    writeRun(fileBedGraphCoveragePlus, chromosome,
                             prevCoveragePlusPos, pos, prevCoveragePlus)
                    prevCoveragePlus = plusDepth
                    prevCoveragePlusPos = pos
                if prevCoverageMinus != minusDepth:
                    writeRun(fileBedGraphCoverageMinus, chromosome,
                             prevCoverageMinusPos, pos, prevCoverageMinus)
                    prevCoverageMinus = minusDepth
                    prevCoverageMinusPos = pos

                tCoverage = 0
                aCoverage = 0

                if plusDepth > 0 or minusDepth > 0:
                    # One lookup serves both strands.
                    # NOTE(review): the base is fetched at pos + 1 while the
                    # counters are indexed by pos -- confirm the coordinate
                    # convention of startRefPos / referencePosition.
                    base = ref.fetch(reference=chromosome,
                                     start=pos + 1,
                                     end=pos + 2).upper()
                    if plusDepth > 0 and base == "T":
                        tCoverage = plusDepth
                    if minusDepth > 0 and base == "A":
                        aCoverage = minusDepth

                if prevTCoverage != tCoverage:
                    writeRun(fileBedGraphT, chromosome, prevTCoveragePos, pos,
                             prevTCoverage)
                    prevTCoverage = tCoverage
                    prevTCoveragePos = pos

                if prevACoverage != aCoverage:
                    writeRun(fileBedGraphA, chromosome, prevACoveragePos, pos,
                             prevACoverage)
                    prevACoverage = aCoverage
                    prevACoveragePos = pos

                if prevTCConversions != tcCount[pos]:
                    writeRun(fileBedGraphTCConversions, chromosome,
                             prevTCConversionPos, pos, prevTCConversions)
                    prevTCConversions = tcCount[pos]
                    prevTCConversionPos = pos

                if prevAGConversions != agCount[pos]:
                    writeRun(fileBedGraphAGConversions, chromosome,
                             prevAGConversionPos, pos, prevAGConversions)
                    prevAGConversions = agCount[pos]
                    prevAGConversionPos = pos

                # Rates stay 0 below the coverage cutoff
                TCconversionRate = 0
                if plusDepth > 0 and plusDepth >= coverageCutoff:
                    TCconversionRate = float(tcCount[pos]) / float(plusDepth)

                AGconversionRate = 0
                if minusDepth > 0 and minusDepth >= coverageCutoff:
                    AGconversionRate = float(agCount[pos]) / float(minusDepth)

                if prevTCConversionRate != TCconversionRate:
                    writeRun(fileBedGraphRatesPlus, chromosome,
                             prevTCConversionRatePos, pos,
                             prevTCConversionRate)
                    prevTCConversionRate = TCconversionRate
                    prevTCConversionRatePos = pos

                if prevAGConversionRate != AGconversionRate:
                    writeRun(fileBedGraphRatesMinus, chromosome,
                             prevAGConversionRatePos, pos,
                             prevAGConversionRate)
                    prevAGConversionRate = AGconversionRate
                    prevAGConversionRatePos = pos
    finally:
        # Release every handle, also when processing fails half-way.
        for handle in openedFiles:
            handle.close()
        ref.close()

Пример #5

Показать файл

Файл: tcounter.py Проект: selsarrag/slamdunk

def computeTconversions(ref,
                        bed,
                        snpsFile,
                        bam,
                        maxReadLength,
                        minQual,
                        outputCSV,
                        outputBedgraphPlus,
                        outputBedgraphMinus,
                        conversionThreshold,
                        log,
                        mle=False):
    """Count T->C conversions per BED interval and write the result tables.

    For every interval in ``bed`` one SlamSeqInterval line is written to
    ``outputCSV``. Intervals without reads on their own strand keep the
    zero-initialised counts. Per-position conversion rates of covered
    T (plus strand) / A (minus strand) positions are written to
    ``outputBedgraphPlus`` / ``outputBedgraphMinus``. With ``mle`` a
    per-read table (``_perread``) is written as well and handed to the
    ``compute_conversion_rate_mle`` plotter script.

    Args:
        ref: Reference FASTA path.
        bed: BED file with stranded intervals.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        bam: Filtered BAM file.
        maxReadLength: Forwarded to ``readInRegion``.
        minQual: Forwarded to ``readInRegion``.
        outputCSV: Path of the per-interval table.
        outputBedgraphPlus: Path of the plus-strand rate track.
        outputBedgraphMinus: Path of the minus-strand rate track.
        conversionThreshold: Forwarded to ``readInRegion``.
        log: File-like object receiving the MD5 mismatch warning.
        mle: Also write the per-read table and run the MLE script.

    Raises:
        RuntimeError: If the BAM version differs from ``__bam_version__``,
            an interval has no strand, or an interval start is negative.
    """
    referenceFile = pysam.FastaFile(ref)

    fileTest = None
    fileCSV = None
    conversionBedGraph = {}

    try:
        sampleInfo = getSampleInfo(bam)

        slamseqInfo = SlamSeqInfo(bam)
        readNumber = slamseqInfo.FilteredReads

        # Computed once; reused for the headers and the consistency check.
        bedMD5 = md5(bed)

        def writeHeader(handle):
            # Header lines shared by the per-interval and per-read tables.
            print("#slamdunk v" + __version__,
                  __count_version__,
                  "sample info:",
                  sampleInfo.Name,
                  sampleInfo.ID,
                  sampleInfo.Type,
                  sampleInfo.Time,
                  sep="\t",
                  file=handle)
            print("#annotation:",
                  os.path.basename(bed),
                  bedMD5,
                  sep="\t",
                  file=handle)
            print(SlamSeqInterval.Header, file=handle)

        if (mle):
            fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
            fileTest = open(fileNameTest, 'w')
            writeHeader(fileTest)

        fileCSV = open(outputCSV, 'w')
        writeHeader(fileCSV)

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, ref, snps)
        if not testFile.bamVersion == __bam_version__:
            raise RuntimeError("Wrong filtered BAM file version detected (" +
                               testFile.bamVersion + "). Expected version " +
                               __bam_version__ +
                               ". Please rerun slamdunk filter.")

        if slamseqInfo.AnnotationMD5 != bedMD5:
            print(
                "Warning: MD5 checksum of annotation (" + bedMD5 +
                ") does not matched MD5 in filtered BAM files (" +
                slamseqInfo.AnnotationMD5 +
                "). Most probably the annotation filed changed after the filtered BAM files were created.",
                file=log)

        # Build the lookup once instead of a list per interval.
        referenceNames = set(referenceFile.references)

        for utr in BedIterator(bed):
            Tcontent = 0
            slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                         utr.strand, utr.name, Tcontent, 0, 0,
                                         0, 0, 0, 0, 0)
            slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start,
                                            utr.stop, utr.strand, utr.name,
                                            Tcontent, 0, 0, 0, 0, 0, 0, 0)
            if (not utr.hasStrand()):
                raise RuntimeError(
                    "Input BED file does not contain stranded intervals.")

            if utr.start < 0:
                # str() is required: concatenating the entry object itself
                # raised a TypeError that masked this message.
                raise RuntimeError(
                    "Negativ start coordinate found. Please check the following entry in your BED file: "
                    + str(utr))

            # Retrieve reference sequence to count the Ts (As on minus)
            if (utr.chromosome in referenceNames):
                refSeq = referenceFile.fetch(reference=utr.chromosome,
                                             start=utr.start,
                                             end=utr.stop).upper()
                if (utr.strand == "-"):
                    Tcontent = refSeq.count("A")
                else:
                    Tcontent = refSeq.count("T")

                # NOTE(review): only the non-MLE placeholder is updated, so
                # an interval without reads reports Tcontent 0 in the
                # per-read table -- confirm whether that is intended.
                slamSeqUtr._Tcontent = Tcontent

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual,
                                                 conversionThreshold)

            # assumes getLength() is a side-effect free accessor, so it is
            # evaluated once per interval rather than once per mismatch
            utrLength = utr.getLength()

            tcCountUtr = [0] * utrLength
            coverageUtr = [0] * utrLength

            tInReads = []
            tcInRead = []

            countFwd = 0
            tcCountFwd = 0
            countRev = 0
            tCountRev = 0

            multiMapFwd = 0
            multiMapRev = 0

            for read in readIterator:

                # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                if (not read.isTcRead):
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                if (isReverse):
                    countRev += 1
                    if read.tcCount > 0:
                        tCountRev += 1
                    if read.isMultimapper:
                        multiMapRev += 1
                else:
                    countFwd += 1
                    if read.tcCount > 0:
                        tcCountFwd += 1
                    if read.isMultimapper:
                        multiMapFwd += 1

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(isReverse)
                            and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utrLength):
                        tcCountUtr[mismatch.referencePosition] += 1

                # Per-read T count (n) and conversion count (k)
                testN = read.getTcount()
                testk = 0
                for mismatch in read.mismatches:
                    if (mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utrLength):
                        if (mismatch.isT(isReverse)):
                            testN += 1
                        if (mismatch.isTCMismatch(isReverse)):
                            testk += 1
                tInReads.append(testN)
                tcInRead.append(testk)

                for i in xrange(read.startRefPos, read.endRefPos):
                    if (i >= 0 and i < utrLength):
                        coverageUtr[i] += 1

            if ((utr.strand == "+" and countFwd > 0)
                    or (utr.strand == "-" and countRev > 0)):
                tcRateUtr = [
                    x * 100.0 / y if y > 0 else 0
                    for x, y in zip(tcCountUtr, coverageUtr)
                ]

                readCount = countFwd
                tcReadCount = tcCountFwd
                multiMapCount = multiMapFwd

                if (utr.strand == "-"):
                    readCount = countRev
                    tcReadCount = tCountRev
                    multiMapCount = multiMapRev

                if ((utr.strand == "-" and countFwd > countRev)
                        or (utr.strand == "+" and countRev > countFwd)):
                    # The counts are ints; without str() this warning
                    # crashed with a TypeError instead of being printed.
                    print(
                        "Warning: " + utr.name + " is located on the " +
                        utr.strand +
                        " strand but read counts are higher for the opposite strand (fwd: "
                        + str(countFwd) + ", rev: " + str(countRev) + ")",
                        file=sys.stderr)

                refSeq = readIterator.getRefSeq()

                # Number of reads on T positions and number of reads with
                # T->C conversions on T positions (A / A->G on minus strand)
                coverageOnTs = 0
                conversionsOnTs = 0

                for position in xrange(0, len(coverageUtr)):

                    if (coverageUtr[position] > 0 and
                        ((utr.strand == "+" and refSeq[position] == "T") or
                         (utr.strand == "-" and refSeq[position] == "A"))):
                        coverageOnTs += coverageUtr[position]
                        conversionsOnTs += tcCountUtr[position]
                        conversionBedGraph[utr.chromosome + ":" +
                                           str(utr.start + position) + ":" +
                                           str(utr.strand)] = tcRateUtr[
                                               position]

                # reads per million mapped to the UTR
                readsCPM = 0
                if (readNumber > 0):
                    readsCPM = readCount * 1000000.0 / readNumber

                # Convert to SlamSeqInterval and print
                conversionRate = 0
                if (coverageOnTs > 0):
                    conversionRate = float(conversionsOnTs) / float(
                        coverageOnTs)
                slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start,
                                             utr.stop, utr.strand, utr.name,
                                             Tcontent, readsCPM, coverageOnTs,
                                             conversionsOnTs, conversionRate,
                                             readCount, tcReadCount,
                                             multiMapCount)
                slamSeqUtrMLE = SlamSeqInterval(
                    utr.chromosome, utr.start, utr.stop, utr.strand, utr.name,
                    Tcontent, readsCPM, coverageOnTs, conversionsOnTs,
                    conversionRate, ",".join(str(x) for x in tInReads),
                    ",".join(str(x) for x in tcInRead), multiMapCount)

            print(slamSeqUtr, file=fileCSV)
            if (mle):
                print(slamSeqUtrMLE, file=fileTest)
    finally:
        # Close the tables before the plotter reads them, and also when
        # processing fails half-way.
        if fileCSV is not None:
            fileCSV.close()
        if fileTest is not None:
            fileTest.close()
        referenceFile.close()

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus:
        with open(outputBedgraphMinus, 'w') as fileBedgraphMinus:
            for position in conversionBedGraph:
                # Split from the right so chromosome names that contain
                # ":" are kept intact.
                positionData = position.rsplit(":", 2)
                if (positionData[2] == "+"):
                    target = fileBedgraphPlus
                else:
                    target = fileBedgraphMinus
                print(positionData[0],
                      positionData[1],
                      int(positionData[1]) + 1,
                      conversionBedGraph[position],
                      file=target)

    if (mle):
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        # NOTE(review): "&>" is a bash-only redirection and no log is passed
        # to callR here, unlike the other callers -- confirm both.
        callR(
            getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest +
            " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")

Пример #6

Показать файл

def computeSNPMaskedRates(ref,
                          bed,
                          snpsFile,
                          bam,
                          maxReadLength,
                          minQual,
                          coverageCutoff,
                          variantFraction,
                          outputCSV,
                          outputPDF,
                          strictTCs,
                          log,
                          printOnly=False,
                          verbose=True,
                          force=False):
    """Compare per-interval T->C read counts with and without SNP masking.

    For every interval in ``bed`` one tab-separated line is written to
    ``outputCSV``: name, read count, number of reads with a raw T>C (A>G
    for reverse reads) mismatch inside the interval, number of reads with
    a mismatch accepted by ``isTCMismatch`` inside the interval, and a
    0/1 flag that is 1 when those two counts differ. Afterwards the
    ``SNPeval`` plotter is run on the table. Either step is skipped when
    ``checkStep`` returns a false value for it.

    Args:
        ref: Reference passed to ``checkStep`` and SlamSeqBamFile.
        bed: BED file with stranded intervals.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        bam: Input BAM file.
        maxReadLength: Forwarded to ``readInRegion``.
        minQual: Forwarded to ``readInRegion``.
        coverageCutoff: Passed to the plotter as ``-c``.
        variantFraction: Passed to the plotter as ``-v``.
        outputCSV: Path of the table.
        outputPDF: Path of the plot.
        strictTCs: If true, mismatches of reads not flagged ``isTcRead``
            are discarded.
        log: File-like object for messages; also passed to ``callR``.
        printOnly: Forwarded to ``callR`` as ``dry``.
        verbose: Forwarded to ``callR``.
        force: Forwarded to ``checkStep``.

    Raises:
        RuntimeError: If an interval has no strand or a negative start.
    """
    if (not checkStep([bam, ref], [outputCSV], force)):
        print("Skipped computing T->C per UTR with SNP masking for file " +
              bam,
              file=log)
    else:
        # "with" closes the table even when an interval is rejected.
        with open(outputCSV, 'w') as fileCSV:

            snps = SNPtools.SNPDictionary(snpsFile)
            snps.read()

            # Go through one chr after the other
            testFile = SlamSeqBamFile(bam, ref, snps)

            for utr in BedIterator(bed):

                if (not utr.hasStrand()):
                    raise RuntimeError(
                        "Input BED file does not contain stranded intervals.")

                if utr.start < 0:
                    # str() is required: concatenating the entry object
                    # itself raised a TypeError that masked this message.
                    raise RuntimeError(
                        "Negativ start coordinate found. Please check the following entry in your BED file: "
                        + str(utr))

                readIterator = testFile.readInRegion(utr.chromosome,
                                                     utr.start, utr.stop,
                                                     utr.strand,
                                                     maxReadLength, minQual)

                # assumes getLength() is a side-effect free accessor
                utrLength = utr.getLength()

                unmaskedTCCount = 0
                maskedTCCount = 0
                readCount = 0

                for read in readIterator:

                    # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                    if (not read.isTcRead and strictTCs):
                        read.tcCount = 0
                        read.mismatches = []
                        read.conversionRates = 0.0
                        read.tcRate = 0.0

                    isReverse = read.direction == ReadDirection.Reverse

                    isTC = False
                    isTrueTC = False

                    for mismatch in read.mismatches:
                        insideUtr = (mismatch.referencePosition >= 0 and
                                     mismatch.referencePosition < utrLength)

                        # Conversion as judged by isTCMismatch
                        # (presumably SNP-aware -- confirm in the class).
                        if (mismatch.isTCMismatch(isReverse) and insideUtr):
                            isTrueTC = True

                        # Conversion judged from the raw bases only
                        if isReverse:
                            unmasked = (mismatch.referenceBase == "A"
                                        and mismatch.readBase == "G")
                        else:
                            unmasked = (mismatch.referenceBase == "T"
                                        and mismatch.readBase == "C")

                        if (unmasked and insideUtr):
                            isTC = True

                    readCount += 1

                    if (isTC):
                        unmaskedTCCount += 1

                    if (isTrueTC):
                        maskedTCCount += 1

                containsSNP = 0

                if (unmaskedTCCount != maskedTCCount):
                    containsSNP = 1

                print(utr.name + "\t" + str(readCount) + "\t" +
                      str(unmaskedTCCount) + "\t" + str(maskedTCCount) +
                      "\t" + str(containsSNP),
                      file=fileCSV)

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " +
              str(coverageCutoff) + " -v " + str(variantFraction) + " -o " +
              outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)

Пример #7

Показать файл

def tcPerUtr(referenceFile,
             utrBed,
             bam,
             minQual,
             maxReadLength,
             outputCSV,
             outputPDF,
             snpsFile,
             log,
             printOnly=False,
             verbose=True,
             force=False):
    """Tabulate mismatches per normalised UTR position and plot them.

    All counters have ``utrNormFactor`` entries. For "+" intervals
    mismatch positions are taken from the last ``utrNormFactor`` bases of
    the interval; for all other intervals from the first
    ``utrNormFactor`` bases. Coverage is accumulated by read direction in
    the same way. The table written to ``outputCSV`` has one line per
    position with: other mismatches fwd, other mismatches rev, T->C
    mismatches fwd, T->C mismatches rev, coverage fwd, coverage rev; the
    reverse columns are written in reversed order. Afterwards the
    ``conversion_per_read_position`` plotter is run. Either step is
    skipped when ``checkStep`` returns a false value for it.

    Args:
        referenceFile: Reference passed to ``checkStep`` and SlamSeqBamFile.
        utrBed: BED file with the intervals.
        bam: Input BAM file.
        minQual: Forwarded to ``readInRegion``.
        maxReadLength: Forwarded to ``readInRegion``.
        outputCSV: Path of the table.
        outputPDF: Path of the plot.
        snpsFile: Input for ``SNPtools.SNPDictionary``.
        log: File-like object for messages; also passed to ``callR``.
        printOnly: Forwarded to ``callR`` as ``dry``.
        verbose: Enables progress messages; forwarded to ``callR``.
        force: Forwarded to ``checkStep``.
    """
    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per UTR position for file " + bam,
              file=log)
    else:

        counter = 0

        totalUtrCountFwd = [0] * utrNormFactor
        totalUtrCountRev = [0] * utrNormFactor

        tcPerPosRev = [0] * utrNormFactor
        tcPerPosFwd = [0] * utrNormFactor

        allPerPosRev = [0] * utrNormFactor
        allPerPosFwd = [0] * utrNormFactor

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one utr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for utr in BedIterator(utrBed):

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual)

            # assumes getLength() is a side-effect free accessor, so it is
            # evaluated once per interval rather than once per mismatch
            utrLength = utr.getLength()
            # First index of the window covering the last utrNormFactor
            # bases, and end of the window covering the first ones
            tailStart = utrLength - utrNormFactor
            headEnd = min(utrLength, utrNormFactor)

            tcForwardCounts = [0] * utrNormFactor
            mutForwardCounts = [0] * utrNormFactor
            tcReverseCounts = [0] * utrNormFactor
            mutReverseCounts = [0] * utrNormFactor

            for read in readIterator:

                tcCounts = [0] * utrNormFactor
                mutCounts = [0] * utrNormFactor

                isReverse = read.direction == ReadDirection.Reverse

                for mismatch in read.mismatches:

                    mismatchPos = mismatch.referencePosition

                    if (utr.strand == "+"):
                        if (mismatchPos >= tailStart
                                and mismatchPos < utrLength):
                            # Shift into the normalised window
                            mismatchPos = utrNormFactor - (utrLength -
                                                           mismatchPos)

                            if (mismatch.isTCMismatch(isReverse)):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1
                    else:
                        if (mismatchPos >= 0 and mismatchPos < headEnd):
                            if (mismatch.isTCMismatch(isReverse)):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1

                if (isReverse):

                    tcReverseCounts = sumLists(tcReverseCounts, tcCounts)
                    mutReverseCounts = sumLists(mutReverseCounts, mutCounts)

                    # Clamp the read to the first utrNormFactor bases
                    start = max(0, min(headEnd, read.startRefPos))
                    end = max(0, min(headEnd, read.endRefPos))

                    for i in range(start, end):
                        totalUtrCountRev[i] += 1

                else:

                    tcForwardCounts = sumLists(tcForwardCounts, tcCounts)
                    mutForwardCounts = sumLists(mutForwardCounts, mutCounts)

                    # Clamp the read to the last utrNormFactor bases
                    start = min(utrLength, max(tailStart, read.startRefPos))
                    end = min(utrLength, max(tailStart, read.endRefPos))

                    for i in range(start, end):
                        normPos = utrNormFactor - (utrLength - i)
                        totalUtrCountFwd[normPos] += 1

            tcPerPosFwd = sumLists(tcPerPosFwd, tcForwardCounts)
            allPerPosFwd = sumLists(allPerPosFwd, mutForwardCounts)

            tcPerPosRev = sumLists(tcPerPosRev, tcReverseCounts)
            allPerPosRev = sumLists(allPerPosRev, mutReverseCounts)

            counter += 1

            if (verbose and counter % 10000 == 0):
                print("Handled " + str(counter) + " UTRs.", file=log)

        reverseAllPerPosRev = allPerPosRev[::-1]
        reverseTcPerPosRev = tcPerPosRev[::-1]
        reverseTotalUtrCountRev = totalUtrCountRev[::-1]

        # "with" closes the table even if writing fails.
        with open(outputCSV, "w") as foTC:

            print("# slamdunk tcperutr v" + __version__, file=foTC)

            for i in range(0, utrNormFactor):
                print(allPerPosFwd[i],
                      reverseAllPerPosRev[i],
                      tcPerPosFwd[i],
                      reverseTcPerPosRev[i],
                      totalUtrCountFwd[i],
                      reverseTotalUtrCountRev[i],
                      sep='\t',
                      file=foTC)

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -u -i " +
              outputCSV + " -o " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)

Пример #8

Показать файл

def tcPerReadPos(referenceFile,
                 bam,
                 minQual,
                 maxReadLength,
                 outputCSV,
                 outputPDF,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):
    """Tabulate mismatches per read position and hand the table to the plotter.

    Step 1 (skipped when ``checkStep([bam, referenceFile], [outputCSV],
    force)`` is false) walks every read of every chromosome in ``bam`` and
    writes ``outputCSV``: a ``# slamdunk tcperreadpos v<version>`` header
    followed by ``maxReadLength`` tab-separated rows, one per read position,
    with the columns

        non-T>C mismatches fwd, non-T>C mismatches rev,
        T>C mismatches fwd,     T>C mismatches rev,
        reads covering fwd,     reads covering rev

    Step 2 (skipped when ``checkStep([outputCSV], [outputPDF], force)`` is
    false) calls the ``conversion_per_read_position`` plotter through
    ``callR`` to produce ``outputPDF``.

    Args:
        referenceFile: reference passed on to ``SlamSeqBamFile``.
        bam: path of the alignment file; also used in log messages.
        minQual: forwarded to ``readsInChromosome``.
        maxReadLength: number of read positions tracked. A mismatch position
            or a read length at or beyond this value raises ``IndexError``.
        outputCSV: path of the table written by step 1.
        outputPDF: path of the plot produced by step 2.
        snpsFile: input for ``SNPtools.SNPDictionary``.
        log: file-like object receiving progress messages.
        printOnly: forwarded to ``callR`` as ``dry``.
        verbose: forwarded to ``callR``.
        force: forwarded to ``checkStep``.
    """
    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
    else:

        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):
                isReverse = read.direction == ReadDirection.Reverse

                # Per-read tallies, indexed by position within the read.
                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(isReverse)):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if (isReverse):
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)

                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)

                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        # The context manager guarantees the CSV is flushed and closed even
        # if one of the writes raises.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)

            for i in range(0, maxReadLength):
                print(allPerPosFwd[i],
                      allPerPosRev[i],
                      tcPerPosFwd[i],
                      tcPerPosRev[i],
                      totalReadCountFwd[i],
                      totalReadCountRev[i],
                      sep='\t',
                      file=foTC)

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per reads position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV +
              " -o " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)

Пример #9

Показать файл

Файл: tcounter.py Проект: leoTiez/slamdunk

def computeTconversionsAll(
        ref,
        snpsFile,
        bam,
        outputBedgraphPlus,
        outputBedgraphPlusNew,
        outputBedgraphMinus,
        outputBedgraphMinusNew,
        conversionThreshold,
        minQual,
        is_inverse,
        log,
):
    """Write per-base, per-strand fragment coverage of paired reads as bedgraphs.

    Reads are consumed from a queryname-sorted BAM (``bam`` itself when its
    header says ``SO:queryname``; otherwise a ``samtools sort -n`` copy that
    is created unless it already exists). Consecutive first/second mates with
    the same query name form a fragment spanning from the smaller start to
    the larger end of the two mates. Each fragment adds 1 to every covered
    base of the strand track chosen by the direction of read 1 (or read 2
    when ``is_inverse`` is true), and additionally to the matching "new"
    track when either mate has ``isTcRead`` set.

    All four tracks are scaled to counts per million counted fragments and
    written with one ``chrom start end value`` line per base.

    Args:
        ref: reference passed on to ``SlamSeqIter``.
        snpsFile: input for ``SNPtools.SNPDictionary``.
        bam: path of the input alignment file.
        outputBedgraphPlus: output path, forward strand, all fragments.
        outputBedgraphPlusNew: output path, forward strand, T>C fragments.
        outputBedgraphMinus: output path, reverse strand, all fragments.
        outputBedgraphMinusNew: output path, reverse strand, T>C fragments.
        conversionThreshold: forwarded to ``SlamSeqIter``.
        minQual: forwarded to ``SlamSeqIter``.
        is_inverse: take the strand from read 2 instead of read 1.
        log: file-like object receiving the summary and ``run`` output.

    Raises:
        KeyError: if a fragment maps to a chromosome missing from the
            hard-coded size table below, or the BAM header lacks ``HD``/``SO``.
    """
    def to_bed_graph(c, data, bedgraph, rn):
        # Normalise in place to counts per million fragments. With no counted
        # fragments every value is already 0, so skip the 0/0 division that
        # would otherwise fill the track with NaN.
        if rn:
            data /= rn
            data *= 1000000.0
        for i, d in enumerate(data):
            print(c, i, i + 1, d, file=bedgraph)

    # NOTE(review): chromosome names and lengths are hard-coded (they look
    # like a yeast assembly); reads on any other contig raise KeyError.
    chrom_sizes = {
        'chrI': 230218,
        'chrII': 813184,
        'chrIII': 316620,
        'chrIV': 1531933,
        'chrIX': 439888,
        'chrM': 85779,
        'chrV': 576874,
        'chrVI': 270161,
        'chrVII': 1090940,
        'chrVIII': 562643,
        'chrX': 745751,
        'chrXI': 666816,
        'chrXII': 1078177,
        'chrXIII': 924431,
        'chrXIV': 784333,
        'chrXV': 1091291,
        'chrXVI': 948066,
    }

    def empty_tracks():
        # One independent zeroed float32 array per chromosome.
        return {
            name: np.zeros(size).astype('float32')
            for name, size in chrom_sizes.items()
        }

    chroms_fw = empty_tracks()
    chroms_bw = empty_tracks()
    chroms_fw_new = empty_tracks()
    chroms_bw_new = empty_tracks()
    readNumber, positiveCount, negativeCount, positiveCountNew, negativeCountNew = 0, 0, 0, 0, 0

    # Only the header is needed from this first handle; close it right away.
    headerProbe = pysam.AlignmentFile(bam, "rb")
    try:
        name_sorted = headerProbe.header['HD']['SO'] == 'queryname'
    finally:
        headerProbe.close()

    if not name_sorted:
        # Sort bam file
        sbam = replaceExtension(bam, '.bam', '_sorted')
        if not os.path.exists(sbam):
            run(
                'samtools sort -n %s -o %s' % (bam, sbam),
                log
            )
    else:
        sbam = bam

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    bamFile = pysam.AlignmentFile(sbam, "rb")
    try:
        seqIter = SlamSeqIter(bamFile, ref, snps, conversionThreshold, minQual)
        read1 = None
        read2 = None
        for read in seqIter:
            if not read.isPaired or read.unmappedMate or read.duplicate:
                continue
            if read.isSecondRead:
                read2 = read
            else:
                # A first mate starts a new candidate pair.
                read1 = read
                read2 = None
                continue
            if read1 is None or read2 is None or read1.queryName != read2.queryName:
                continue
            readNumber += 1
            chrom = read1.chromosome
            start = min(read1.startRefPos, read2.startRefPos)
            # The fragment ends at the furthest end of EITHER mate (the
            # previous code compared read2 with itself and ignored read1).
            end = max(read1.endRefPos, read2.endRefPos)
            is_tc_read = read1.isTcRead or read2.isTcRead
            direction_read = read1 if not is_inverse else read2
            if direction_read.direction == ReadDirection.Forward:
                positiveCount += 1
                chroms_fw[chrom][start:end] += 1
                if is_tc_read:
                    positiveCountNew += 1
                    chroms_fw_new[chrom][start:end] += 1
            else:
                negativeCount += 1
                chroms_bw[chrom][start:end] += 1
                if is_tc_read:
                    negativeCountNew += 1
                    chroms_bw_new[chrom][start:end] += 1
    finally:
        bamFile.close()

    print("Total reads: %s\n"
          "Positive reads: %s\n"
          "Positive reads new: %s\n"
          "Negative reads: %s\n"
          "Negative reads new: %s" %
          (readNumber, positiveCount, positiveCountNew, negativeCount, negativeCountNew),
          file=log)

    # Context managers close all four outputs even if a write fails.
    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphPlusNew, 'w') as fileBedgraphPlusNew, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus, \
            open(outputBedgraphMinusNew, 'w') as fileBedgraphMinusNew:
        for chrom in chroms_fw:
            to_bed_graph(chrom, chroms_fw[chrom], fileBedgraphPlus, readNumber)
            to_bed_graph(chrom, chroms_bw[chrom], fileBedgraphMinus, readNumber)
            to_bed_graph(chrom, chroms_fw_new[chrom], fileBedgraphPlusNew, readNumber)
            to_bed_graph(chrom, chroms_bw_new[chrom], fileBedgraphMinusNew, readNumber)