Example #1
def statsComputeOverallRates(referenceFile,
                             bam,
                             minBaseQual,
                             outputCSV,
                             outputPDF,
                             log,
                             printOnly=False,
                             verbose=True,
                             force=False):

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Init
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25
        tcCount = [0] * 100

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minBaseQual)

            for read in readIterator:

                # Compute rates for current read
                rates = read.conversionRates
                # Get T -> C conversions for current read
                tc = read.tcCount
                tcCount[tc] += 1

                # Add rates from read to total rates
                if (read.direction == ReadDirection.Reverse):
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")
        print("# slamdunk rates v" + __version__, file=fo)
        printRates(totalRatesFwd, totalRatesRev, fo)
        fo.close()

    if (not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:

        #f = tempfile.NamedTemporaryFile(delete=False)
        #print(removeExtension(basename(bam)), outputCSV, sep='\t', file=f)
        #f.close()

        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV +
              " -n " + removeExtension(os.path.basename(bam)) + " -O " +
              outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
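
Throughout these examples, per-read count vectors are folded into running totals with sumLists, whose definition is not shown. Judging from its use on equal-length count lists, it is element-wise addition; a minimal sketch under that assumption:

def sumLists(a, b):
    # Assumed behavior: element-wise sum of two equal-length count lists.
    return [x + y for x, y in zip(a, b)]
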
Example #2
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2", outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False):

    if(quantseqMapping is True):
        parameter = "--no-progress"
            
    if(trim5p > 0):
        parameter = parameter + " -5 " + str(trim5p)
    
    if(maxPolyA > -1):
        parameter = parameter + " --max-polya " + str(maxPolyA)
    
    if(endtoendMapping is True):
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "

    if(sampleId is not None):
        parameter = parameter + " --rg-id " + str(sampleId)
        if(sampleName != ""):
            parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)
    
    if(topn > 1):
        parameter = parameter + " -n " + str(topn) + " --strata "
        
    if(checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force)):
        if outputSAM.endswith(".sam"):
            # Output SAM
            run(getBinary("ngm") + " -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly)
        else:
            # Output BAM directly
            run(getBinary("ngm") + " -b -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly)        
    else:
        print("Skipped mapping for " + inputBAM, file=log)
Example #3
def dumpReadInfo(referenceFile,
                 bam,
                 minQual,
                 outputCSV,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
    else:

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        outputFile = SlamSeqWriter(outputCSV)

        #Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome)
            for read in readIterator:
                outputFile.write(read)

        outputFile.close()
Example #4
def SNPs(
        inputBAM,
        outputSNP,
        referenceFile,
        minVarFreq,
        minCov,
        minQual,
        log,
        printOnly=False,
        verbose=True,
        force=False
):
    files = [os.path.expanduser(p) for p in [inputBAM, referenceFile]]
    if checkStep(files, [os.path.expanduser(outputSNP)], force):
        fileSNP = open(outputSNP, "w")

        mpileupCmd = "samtools mpileup -B -A -f %s %s" % (referenceFile, inputBAM)
        if verbose:
            print(mpileupCmd, file=log)
        if not printOnly:
            mpileup = subprocess.Popen(mpileupCmd, shell=True, stdout=subprocess.PIPE, stderr=log)

        varscanCmd = "varscan mpileup2snp --strand-filter 0 --output-vcf " \
                     "--min-var-freq %s --min-coverage %s --variants 1" % (str(minVarFreq), str(minCov),)
        if verbose:
            print(varscanCmd, file=log)
        if not printOnly:
            varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout, stdout=fileSNP, stderr=log)
            varscan.wait()

        fileSNP.close()
    else:
        print("Skipping SNP calling", file=log)
Example #5
def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
     
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Init
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25
        tcCount = [0] * 100
         
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
         
        chromosomes = testFile.getChromosomes()
         
        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minBaseQual)
                 
            for read in readIterator:
                 
                # Compute rates for current read
                rates = read.conversionRates
                # Get T -> C conversions for current read
                tc = read.tcCount
                tcCount[tc] += 1
                 
                # Add rates from read to total rates
                if(read.direction == ReadDirection.Reverse):
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)
              
        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")
        print("# slamdunk rates v" + __version__, file=fo)
        printRates(totalRatesFwd, totalRatesRev, fo)
        fo.close()
     
    if(not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:

        #f = tempfile.NamedTemporaryFile(delete=False)
        #print(removeExtension(basename(bam)), outputCSV, sep='\t', file=f)
        #f.close()
             
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " + removeExtension(os.path.basename(bam)) + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
Example #6
def Map(inputBAM,
        inputReference,
        outputSAM,
        log,
        quantseqMapping,
        endtoendMapping,
        threads=1,
        parameter="--no-progress --slam-seq 2",
        outputSuffix="_ngm_slamdunk",
        trim5p=0,
        maxPolyA=-1,
        topn=1,
        sampleId=None,
        sampleName="NA",
        sampleType="NA",
        sampleTime=0,
        printOnly=False,
        verbose=True,
        force=False,
        isPaired=False):
    if quantseqMapping:
        parameter = "--no-progress"

    if trim5p > 0:
        parameter = parameter + " -5 " + str(trim5p)

    if maxPolyA > -1:
        parameter = parameter + " --max-polya " + str(maxPolyA)

    if endtoendMapping:
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "

    if sampleId is not None:
        parameter = parameter + " --rg-id " + str(sampleId)
        if sampleName != "":
            parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(
                sampleTime)

    if topn > 1:
        parameter = parameter + " -n " + str(topn) + " --strata "

    files = [inputReference]
    if isPaired:
        files.extend(inputBAM)
    else:
        files.append(inputBAM)
    files = [os.path.expanduser(p) for p in files]
    if checkStep(files, [replaceExtension(outputSAM, ".bam")], force):
        cmd = "ngm %s -r %s %s -t %s %s -o %s" % (
            "" if outputSAM.endswith(".sam") else "-b", files[0],
            "-q %s" % files[1] if not isPaired else "-1 %s -2 %s" %
            (files[1], files[2]), threads, parameter, outputSAM)
        run(cmd, log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped mapping for " +
              inputBAM if not isPaired else inputBAM[0],
              file=log)
Example #7
def SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual, log, printOnly=False, verbose=True, force=False):
    if(checkStep([inputBAM, referenceFile], [outputSNP], force)):
        fileSNP = open(outputSNP, 'w')
        
        mpileupCmd = getBinary("samtools") + " mpileup -B -A -f " + referenceFile + " " + inputBAM
        if(verbose):
            print(mpileupCmd, file=log)
        if(not printOnly):
            mpileup = subprocess.Popen(mpileupCmd, shell=True, stdout=subprocess.PIPE, stderr=log)
            
        varscanCmd = "java -jar " + getBinary("VarScan.v2.4.1.jar") + " mpileup2snp  --strand-filter 0 --output-vcf --min-var-freq " + str(minVarFreq) + " --min-coverage " + str(minCov) + " --variants 1"
        if(verbose):
            print(varscanCmd, file=log)
        if(not printOnly):
            varscan = subprocess.Popen(varscanCmd, shell=True, stdin=mpileup.stdout, stdout=fileSNP, stderr=log)
            varscan.wait()
        
        fileSNP.close()
    else:
        print("Skipping SNP calling", file=log)    
Example #8
def sort(inputSAM,
         outputBAM,
         log,
         threads=1,
         keepSam=True,
         dry=False,
         verbose=True):

    if (files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"])):
        runSam2bam(inputSAM,
                   outputBAM,
                   log,
                   False,
                   False,
                   not keepSam,
                   threads=threads,
                   dry=dry,
                   verbose=verbose)
    else:
        print("Skipped sorting for " + inputSAM, file=log)
Example #9
def sort(inputSAM,
         outputBAM,
         log,
         threads=1,
         keepSam=True,
         dry=False,
         verbose=True,
         isPaired=False):
    if files_exist(inputSAM) and checkStep([inputSAM], [outputBAM + ".bai"]):
        runSam2bam(inputSAM,
                   outputBAM,
                   log,
                   index=False,
                   sort="name" if isPaired else None,
                   delinFile=not keepSam,
                   onlyProperPaired=True,
                   threads=threads,
                   dry=dry,
                   verbose=verbose)
    else:
        print("Skipped sorting for " + inputSAM, file=log)
Example #10
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
                
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()
    
        outputFile = SlamSeqWriter(outputCSV)
        
        #Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        
        chromosomes = testFile.getChromosomes()
        
        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome)
            for read in readIterator:
                outputFile.write(read)

        outputFile.close()
Example #11
def Dedup(inputBAM,
          outputBAM,
          tcMutations,
          log,
          printOnly=False,
          verbose=True,
          force=False):

    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""

        duplicateBuffer = {}

        for read in samfile:

            # Duplicate key parts: CIGAR string, position, and sequence
            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if (read.has_tag("TC")):
                tcflag = read.get_tag("TC")
            else:
                tcflag = 0

            if (tcflag >= tcMutations):

                if (chr != prevChr or start != prevStart):

                    if (prevChr != ""):
                        for curSeq in duplicateBuffer:
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][
                                        curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                    outfile.write(readEntry)
                        duplicateBuffer.clear()

                if seq not in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if flag not in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                if len(duplicateBuffer[seq][flag]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                    outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        print("Retained " + str(retainedReads) + " of " + str(processedReads) +
              " reads (",
              file=log,
              end="")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),
              file=log,
              end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
Example #12
def computeSNPMaskedRates(ref,
                          bed,
                          snpsFile,
                          bam,
                          maxReadLength,
                          minQual,
                          coverageCutoff,
                          variantFraction,
                          outputCSV,
                          outputPDF,
                          strictTCs,
                          log,
                          printOnly=False,
                          verbose=True,
                          force=False):

    if (not checkStep([bam, ref], [outputCSV], force)):
        print("Skipped computing T->C per UTR with SNP masking for file " +
              bam,
              file=log)
    else:
        fileCSV = open(outputCSV, 'w')

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        #Go through one chr after the other
        testFile = SlamSeqBamFile(bam, ref, snps)

        progress = 0
        for utr in BedIterator(bed):

            if (not utr.hasStrand()):
                raise RuntimeError(
                    "Input BED file does not contain stranded intervals.")

            if utr.start < 0:
                raise RuntimeError(
                    "Negative start coordinate found. Please check the "
                    "following entry in your BED file: " + str(utr))

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual)

            unmaskedTCCount = 0
            maskedTCCount = 0
            readCount = 0

            for read in readIterator:

                # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
                if (not read.isTcRead and strictTCs):
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isTC = False
                isTrueTC = False

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(
                            read.direction == ReadDirection.Reverse)
                            and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utr.getLength()):
                        isTrueTC = True

                    unmasked = False
                    if (read.direction == ReadDirection.Reverse
                            and mismatch.referenceBase == "A"
                            and mismatch.readBase == "G"):
                        unmasked = True
                    elif (read.direction != ReadDirection.Reverse
                          and mismatch.referenceBase == "T"
                          and mismatch.readBase == "C"):
                        unmasked = True

                    if (unmasked and mismatch.referencePosition >= 0
                            and mismatch.referencePosition < utr.getLength()):
                        isTC = True

                readCount += 1

                if (isTC):
                    unmaskedTCCount += 1

                if (isTrueTC):
                    maskedTCCount += 1

            containsSNP = 0

            if (unmaskedTCCount != maskedTCCount):
                containsSNP = 1

            print(utr.name + "\t" + str(readCount) + "\t" +
                  str(unmaskedTCCount) + "\t" + str(maskedTCCount) + "\t" +
                  str(containsSNP),
                  file=fileCSV)

            progress += 1

        fileCSV.close()

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " +
              str(coverageCutoff) + " -v " + str(variantFraction) + " -o " +
              outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
Example #13
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False,
           paired=False):
    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
         nmFiltered, multimapper) = 0, 0, 0, 0, 0, 0, 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)
        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)
            if paired:
                read1 = None
                read2 = None
            for read in infile:
                if paired:
                    if not read.is_paired or read.mate_is_unmapped or read.is_duplicate:
                        unmappedReads += 1
                        continue
                    if read.is_read2:
                        read2 = read
                    else:
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    else:
                        mappedReads += 1

                if not paired:
                    if read.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read.get_tag("NM")):
                        nmFiltered += 1
                        continue

                    filteredReads += 1
                    outfile.write(read)
                else:
                    if read1 is None or read2 is None:
                        continue
                    if read1.query_name != read2.query_name:
                        continue

                    if read1.mapping_quality < MQ and read2.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read1.get_tag("XI")) < minIdentity and float(
                            read2.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read1.get_tag("NM")) and -1 < NM < int(
                            read2.get_tag("NM")):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read1)
                    outfile.write(read2)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < 0\t0", file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered,
             multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                  minIdentity, NM, MQ, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

        slamDunkPG = {
            "ID": "slamdunk",
            "PN": "slamdunk filter v" + __version__,
            "VN": __bam_version__
        }
        if "PG" in inFileBamHeader:
            inFileBamHeader["PG"].append(slamDunkPG)
        else:
            inFileBamHeader["PG"] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, paired=False, verbose=verbose)
        if not paired:
            pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #14
def tcPerUtr(referenceFile,
             utrBed,
             bam,
             minQual,
             maxReadLength,
             outputCSV,
             outputPDF,
             snpsFile,
             log,
             printOnly=False,
             verbose=True,
             force=False):

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per UTR position for file " + bam,
              file=log)
    else:

        counter = 0

        totalUtrCountFwd = [0] * utrNormFactor
        totalUtrCountRev = [0] * utrNormFactor

        tcPerPosRev = [0] * utrNormFactor
        tcPerPosFwd = [0] * utrNormFactor

        allPerPosRev = [0] * utrNormFactor
        allPerPosFwd = [0] * utrNormFactor

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one utr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for utr in BedIterator(utrBed):

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual)

            tcForwardCounts = [0] * utrNormFactor
            mutForwardCounts = [0] * utrNormFactor
            tcReverseCounts = [0] * utrNormFactor
            mutReverseCounts = [0] * utrNormFactor

            for read in readIterator:

                tcCounts = [0] * utrNormFactor
                mutCounts = [0] * utrNormFactor

                for mismatch in read.mismatches:

                    mismatchPos = mismatch.referencePosition

                    # mismatchPos = read.startRefPos

                    if (utr.strand == "+"):

                        # New try for UTRs (remove + 1)
                        if (mismatchPos >= (utr.getLength() - utrNormFactor)
                                and mismatchPos < utr.getLength()):
                            # if (mismatchPos >= (utr.getLength() - utrNormFactor) and mismatchPos < utr.getLength() + 1) :
                            mismatchPos = utrNormFactor - (utr.getLength() -
                                                           mismatchPos)

                            if (mismatch.isTCMismatch(
                                    read.direction == ReadDirection.Reverse)):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1
                    else:

                        if (mismatchPos >= 0 and mismatchPos < min(
                                utr.getLength(), utrNormFactor)):
                            if (mismatch.isTCMismatch(
                                    read.direction == ReadDirection.Reverse)):
                                tcCounts[mismatchPos] += 1
                            else:
                                mutCounts[mismatchPos] += 1

                if (read.direction == ReadDirection.Reverse):

                    tcReverseCounts = sumLists(tcReverseCounts, tcCounts)
                    mutReverseCounts = sumLists(mutReverseCounts, mutCounts)

                    start = max(
                        0,
                        min(min(utr.getLength(), utrNormFactor),
                            read.startRefPos))
                    end = max(
                        0,
                        min(min(utr.getLength(), utrNormFactor),
                            read.endRefPos))

                    for i in range(start, end):

                        totalUtrCountRev[i] += 1

                else:

                    tcForwardCounts = sumLists(tcForwardCounts, tcCounts)
                    mutForwardCounts = sumLists(mutForwardCounts, mutCounts)

                    start = min(
                        utr.getLength(),
                        max(utr.getLength() - utrNormFactor, read.startRefPos))
                    end = min(
                        utr.getLength(),
                        max(utr.getLength() - utrNormFactor, read.endRefPos))

                    for i in range(start, end):
                        normPos = utrNormFactor - (utr.getLength() - i)
                        totalUtrCountFwd[normPos] += 1

            tcPerPosFwd = sumLists(tcPerPosFwd, tcForwardCounts)
            allPerPosFwd = sumLists(allPerPosFwd, mutForwardCounts)

            tcPerPosRev = sumLists(tcPerPosRev, tcReverseCounts)
            allPerPosRev = sumLists(allPerPosRev, mutReverseCounts)

            counter += 1

            if (verbose and counter % 10000 == 0):
                print("Handled " + str(counter) + " UTRs.", file=log)

        foTC = open(outputCSV, "w")

        print("# slamdunk tcperutr v" + __version__, file=foTC)

        reverseAllPerPosRev = allPerPosRev[::-1]
        reverseTcPerPosRev = tcPerPosRev[::-1]
        reverseTotalUtrCountRev = totalUtrCountRev[::-1]

        for i in range(0, utrNormFactor):
            print(allPerPosFwd[i],
                  reverseAllPerPosRev[i],
                  tcPerPosFwd[i],
                  reverseTcPerPosRev[i],
                  totalUtrCountFwd[i],
                  reverseTotalUtrCountRev[i],
                  sep='\t',
                  file=foTC)
        foTC.close()

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -u -i " +
              outputCSV + " -o " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
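
On the plus strand, mismatch positions are folded into a fixed window of the last utrNormFactor bases via utrNormFactor - (utr.getLength() - mismatchPos), so output position 0 lies utrNormFactor bases upstream of the UTR end. A small worked check, with an illustrative utrNormFactor (the real value is module-level and not shown here):

utrNormFactor = 200   # illustrative value, assumed for this check
utrLength = 1500      # hypothetical plus-strand UTR length

def foldPlusStrand(pos):
    # Maps [utrLength - utrNormFactor, utrLength) onto [0, utrNormFactor).
    return utrNormFactor - (utrLength - pos)

assert foldPlusStrand(utrLength - utrNormFactor) == 0      # window start
assert foldPlusStrand(utrLength - 1) == utrNormFactor - 1  # last base
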
Example #15
def tcPerReadPos(referenceFile,
                 bam,
                 minQual,
                 maxReadLength,
                 outputCSV,
                 outputPDF,
                 snpsFile,
                 log,
                 printOnly=False,
                 verbose=True,
                 force=False):

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam,
              file=log)
    else:

        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minQual)

            for read in readIterator:

                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if (mismatch.isTCMismatch(
                            read.direction == ReadDirection.Reverse)):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if (read.direction == ReadDirection.Reverse):
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)

                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)

                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        foTC = open(outputCSV, "w")

        print("# slamdunk tcperreadpos v" + __version__, file=foTC)

        for i in range(0, maxReadLength):
            print(allPerPosFwd[i],
                  allPerPosRev[i],
                  tcPerPosFwd[i],
                  tcPerPosRev[i],
                  totalReadCountFwd[i],
                  totalReadCountRev[i],
                  sep='\t',
                  file=foTC)
        foTC.close()

    if (not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per reads position plot for file " + bam,
              file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV +
              " -o " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
Example #16
def statsComputeOverallRatesPerUTR(referenceFile, bam, minBaseQual, strictTCs, outputCSV, outputPDF, utrBed, maxReadLength, log, printOnly=False, verbose=True, force=False):
    
    sampleInfo = getSampleInfo(bam)
    
    slamseqInfo = SlamSeqInfo(bam)
    
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
    
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
        
        # UTR stats for MultiQC
        utrStats = dict()
        
        plotConversions = ['A>T', 'A>G', 'A>C',
                           'C>A', 'C>G', 'C>T',
                           'G>A', 'G>C', 'G>T',
                           'T>A', 'T>G', 'T>C',
        ]
        
        for conversion in plotConversions:
            utrStats[conversion] = list()
            
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
                        
        for utr in BedIterator(utrBed):
                                         
            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minBaseQual)
            
            # Init
            totalRates = [0] * 25
            
            readCount = 0
            for read in readIterator:
                
                # In strict mode, skip non-TC reads that still carry conversions
                if not (not read.isTcRead and strictTCs and read.tcCount > 0):

                    # Compute rates for current read
                    rates = read.conversionRates

                    # Add rates from read to total rates
                    totalRates = sumLists(totalRates, rates)
                    readCount += 1
                    
            print(utr.name, utr.chromosome, utr.start, utr.stop, utr.strand, readCount, "\t".join(str(x) for x in totalRates), sep="\t", file=f)
            
            # Process rates for MultiQC
            # Copied directly, too lazy to do it properly now
            
            utrDict = {}

            # The 25-element rate vector is a flattened 5x5 matrix over
            # A,C,G,T,N (row = reference base, column = read base)
            A_A, A_C, A_G, A_T = totalRates[0:4]
            C_A, C_C, C_G, C_T = totalRates[5:9]
            G_A, G_C, G_G, G_T = totalRates[10:14]
            T_A, T_C, T_G, T_T = totalRates[15:19]

            conversionSum = sum(totalRates[0:4] + totalRates[5:9] +
                                totalRates[10:14] + totalRates[15:19])
            
            if utr.strand == "-":
                    
                A_A, T_T = T_T,A_A
                G_G, C_C = C_C,G_G
                A_C, T_G = T_G, A_C
                A_G, T_C = T_C, A_G
                A_T, T_A = T_A, A_T
                C_A, G_T = G_T, C_A
                C_G, G_C = G_C, C_G
                C_T, G_A = G_A, C_T
            
            if conversionSum > 0:
                        
                Asum = A_A + A_C + A_G + A_T
                Csum = C_A + C_C + C_G + C_T
                Gsum = G_A + G_C + G_G + G_T
                Tsum = T_A + T_C + T_G + T_T
                 
                if Asum > 0 :
                    A_T = A_T / float(Asum) * 100
                    A_G = A_G / float(Asum) * 100
                    A_C = A_C / float(Asum) * 100
                else :
                    A_T = 0
                    A_G = 0
                    A_C = 0
                if Csum > 0:
                    C_A = C_A / float(Csum) * 100
                    C_G = C_G / float(Csum) * 100
                    C_T = C_T / float(Csum) * 100
                else :
                    C_A = 0
                    C_G = 0
                    C_T = 0
                if Gsum > 0:
                    G_A = G_A / float(Gsum) * 100
                    G_C = G_C / float(Gsum) * 100
                    G_T = G_T / float(Gsum) * 100
                else :
                    G_A = 0
                    G_C = 0
                    G_T = 0
                if Tsum > 0:
                    T_A = T_A / float(Tsum) * 100
                    T_G = T_G / float(Tsum) * 100
                    T_C = T_C / float(Tsum) * 100
                else :
                    T_A = 0
                    T_G = 0
                    T_C = 0
                   
                utrStats['A>T'].append(A_T)
                utrStats['A>G'].append(A_G)
                utrStats['A>C'].append(A_C)
                
                utrStats['C>A'].append(C_A)
                utrStats['C>G'].append(C_G)
                utrStats['C>T'].append(C_T)
                
                utrStats['G>A'].append(G_A)
                utrStats['G>T'].append(G_T)
                utrStats['G>C'].append(G_C)
                
                utrStats['T>A'].append(T_A)
                utrStats['T>G'].append(T_G)
                utrStats['T>C'].append(T_C)        
                
        f.close()
        
        fo = open(outputCSV, "w")
        
        print("# slamdunk utrrates v" + __version__, file=fo)
        
        print("# Median-Conversions=",end="",file=fo)
        
        first = True
        for conversion in plotConversions:
            if (not first) :
                print(',',file=fo, end="")
            else :
                first = False
            print(conversion + ":" + str(np.median(utrStats[conversion])),file=fo, end="")
        print(file=fo) 
        
        print("Name", "Chr", "Start", "End", "Strand", "ReadCount", sep="\t", end="\t", file=fo)
        for i in range(0, 5):
            for j in range(0, 5):
                print(toBase[i].upper() + "_" + toBase[j].upper(), end="", file=fo)
                if(i != 4 or j != 4):
                    print("\t", end="", file=fo)
        print(file=fo)
        
        with open(f.name, "r") as valueFile:
            fo.write(valueFile.read())
        
        fo.close()
                
    if(not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing global rate pdfs for file " + bam, file=log)
    else:
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(sampleInfo.Name, outputCSV, sep='\t', file=f)
        f.close()
              
        callR(getPlotter("globalRatePlotter") + " -f " + f.name + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
Example #17
def statsComputeTCContext(referenceFile,
                          bam,
                          minBaseQual,
                          outputCSV,
                          outputPDF,
                          log,
                          printOnly=False,
                          verbose=True,
                          force=False):

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Init
        # combinations = ["AT","CT","GT","TT","NT","AA","CA","GA","TA","NA"]
        frontCombinations = ["AT", "CT", "GT", "TT", "NT"]
        backCombinations = ["TA", "TC", "TG", "TT", "TN"]

        counts = {}
        counts['5prime'] = {}
        counts['3prime'] = {}
        counts['5prime']['fwd'] = {}
        counts['5prime']['rev'] = {}
        counts['3prime']['fwd'] = {}
        counts['3prime']['rev'] = {}

        for combination in frontCombinations:
            counts['5prime']['fwd'][combination] = 0
            counts['5prime']['rev'][combination] = 0

        for combination in backCombinations:
            counts['3prime']['fwd'][combination] = 0
            counts['3prime']['rev'][combination] = 0

        bamFile = pysam.AlignmentFile(bam, "rb")

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        chromosomes = testFile.getChromosomes()

        for chromosome in chromosomes:

            for read in bamFile.fetch(region=chromosome):

                i = 0
                while i < len(read.query_sequence):
                    if (read.query_sequence[i] == "T" and not read.is_reverse):
                        frontContext = None
                        backContext = None
                        if (i > 0):
                            frontContext = read.query_sequence[i - 1]
                        if (i < (len(read.query_sequence) - 1)):
                            backContext = read.query_sequence[i + 1]

                        if (frontContext is not None):
                            counts['5prime']['fwd'][frontContext + "T"] += 1
                        if (backContext is not None):
                            counts['3prime']['fwd']["T" + backContext] += 1

                    if (read.query_sequence[i] == "A" and read.is_reverse):
                        frontContext = None
                        backContext = None
                        if (i > 0):
                            backContext = read.query_sequence[i - 1]
                        if (i < (len(read.query_sequence) - 1)):
                            frontContext = read.query_sequence[i + 1]

                        if (frontContext is not None):
                            counts['5prime']['rev'][complement(frontContext +
                                                               "A")] += 1
                        if (backContext is not None):
                            counts['3prime']['rev'][complement(
                                "A" + backContext)] += 1

                    i += 1

        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")

        print("\t".join(frontCombinations), file=fo)

        frontFwdLine = ""
        frontRevLine = ""
        backFwdLine = ""
        backRevLine = ""

        for combination in frontCombinations:
            frontFwdLine += str(counts['5prime']['fwd'][combination]) + "\t"
            frontRevLine += str(counts['5prime']['rev'][combination]) + "\t"

        print(frontFwdLine.rstrip(), file=fo)
        print(frontRevLine.rstrip(), file=fo)

        print("\t".join(backCombinations), file=fo)

        for combination in backCombinations:
            backFwdLine += str(counts['3prime']['fwd'][combination]) + "\t"
            backRevLine += str(counts['3prime']['rev'][combination]) + "\t"

        print(backFwdLine.rstrip(), file=fo)
        print(backRevLine.rstrip(), file=fo)

        fo.close()

    if (not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(removeExtension(os.path.basename(bam)),
              outputCSV,
              sep='\t',
              file=f)
        f.close()

        callR(getPlotter("compute_context_TC_rates") + " -f " + f.name +
              " -O " + outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
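
For reverse reads the surrounding context is complemented before counting, so complement(frontContext + "A") must land in the same AT/CT/GT/TT/NT key space that forward reads use. complement is not shown; a sketch assuming it is a plain base-wise complement:

_COMPLEMENT = str.maketrans("ACGTN", "TGCAN")

def complement(seq):
    # Base-wise complement, e.g. complement("CA") == "GT".
    return seq.translate(_COMPLEMENT)

assert complement("CA") == "GT"  # a valid 5' key for the reverse counts
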
Example #18
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1, printOnly=False, verbose=True, force=False):
    if(printOnly or checkStep([inputBAM], [outputBAM], force)):
        
        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0
        
        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0
        
        infile = pysam.AlignmentFile(inputBAM, "rb")    
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)
        
        # Default filtering without bed
        if (bed is None):
            
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".",file=log)
            
            for read in infile:
                
                if(not read.is_secondary and not read.is_supplementary):
                    if(read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1
                
                if(read.is_unmapped):
                    continue
                if(read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if(float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if(NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue
                
                if(not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1
                    
                outfile.write(read)
                
            print("Criterion\tFiltered reads",file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered),file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered),file=log)
            print("MM\t0",file=log)
        else :
            # Multimap retention strategy filtering when bed is supplied
            
            random.seed(1)
            
            print("#Bed-file supplied. Running multimap retention filtering strategy on " + inputBAM + ".",file=log)
            
            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log)
            #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log)
        
        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed is not None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else :
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""
            
            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)
            #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}"        
        
        slamDunkPG = { 'ID': 'slamdunk', 'PN': 'slamdunk filter v' + __version__, 'VN': __bam_version__ }
        if('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [ slamDunkPG ]
        
        infile.close()
        outfile.close()
        
        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)
        
        pysamIndex(outputBAM)
        #pysamFlagstat(outputBAM)
        #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly)
    
    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #19
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False):
    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0

        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if (bed is None):

            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)

            for read in infile:

                if (not read.is_secondary and not read.is_supplementary):
                    if (read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if (read.is_unmapped):
                    continue
                if (read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if (float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if (NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue

                if (not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),
                  file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied

            random.seed(1)

            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)

            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(
                infile, outfile, bed, minIdentity, NM, log)
            #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if ('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed is not None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)
            #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}"

        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if ('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
        #pysamFlagstat(outputBAM)
        #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly)

    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #20
def computeSNPMaskedRates (ref, bed, snpsFile, bam, maxReadLength, minQual, coverageCutoff, variantFraction, outputCSV, outputPDF, strictTCs, log, printOnly=False, verbose=True, force=False):
    
    if(not checkStep([bam, ref], [outputCSV], force)):
        print("Skipped computing T->C per UTR with SNP masking for file " + bam, file=log)
    else:    
        fileCSV = open(outputCSV,'w')
        
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()
        
        #Go through one chr after the other
        testFile = SlamSeqBamFile(bam, ref, snps)
                                 
        progress = 0
        for utr in BedIterator(bed):
            
            if(not utr.hasStrand()):
                raise RuntimeError("Input BED file does not contain stranded intervals.")
            
            if utr.start < 0:
                raise RuntimeError("Negative start coordinate found. Please check the following entry in your BED file: " + str(utr))
    
            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minQual)
            
            unmaskedTCCount = 0
            maskedTCCount = 0
            readCount = 0
            
            for read in readIterator:
                
                # Strict mode: clear all conversion info for non-TC reads
                # (reads with < 2 T>C conversions)
                if not read.isTcRead and strictTCs:
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = [0] * 25
                    read.tcRate = 0.0
                    
                isTC = False
                isTrueTC = False
                
                for mismatch in read.mismatches:
                    if(mismatch.isTCMismatch(read.direction == ReadDirection.Reverse) and mismatch.referencePosition >= 0 and mismatch.referencePosition < utr.getLength()):
                        isTrueTC = True
                    
                    unmasked = False
                    if (read.direction == ReadDirection.Reverse and mismatch.referenceBase == "A" and mismatch.readBase == "G"):
                        unmasked = True
                    elif (read.direction != ReadDirection.Reverse and mismatch.referenceBase == "T" and mismatch.readBase == "C") :
                        unmasked = True
                        
                    if (unmasked and mismatch.referencePosition >= 0 and mismatch.referencePosition < utr.getLength()) :
                        isTC = True
                        
                readCount += 1
                
                if (isTC) :
                    unmaskedTCCount += 1
                    
                if (isTrueTC) :
                    maskedTCCount += 1
            
            containsSNP = 0
            
            if (unmaskedTCCount != maskedTCCount) :
                containsSNP = 1
                
            print(utr.name + "\t" + str(readCount) + "\t" + str(unmaskedTCCount) + "\t" + str(maskedTCCount) + "\t" + str(containsSNP), file=fileCSV)
                   
            progress += 1
            
        fileCSV.close()
    
    if(not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
    else: 
        callR(getPlotter("SNPeval") + " -i " + outputCSV + " -c " + str(coverageCutoff) + " -v " + str(variantFraction) + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)            
Example #22
0
def tcPerUtr(referenceFile, utrBed, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
        
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per UTR position for file " + bam, file=log)
    else:
    
        counter = 0
            
        totalUtrCountFwd = [0] * utrNormFactor
        totalUtrCountRev = [0] * utrNormFactor
        
        tcPerPosRev = [0] * utrNormFactor
        tcPerPosFwd = [0] * utrNormFactor
         
        allPerPosRev = [0] * utrNormFactor
        allPerPosFwd = [0] * utrNormFactor
        
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()
        
        # Go through one utr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        
        for utr in BedIterator(utrBed):
                                         
            readIterator = testFile.readInRegion(utr.chromosome, utr.start, utr.stop, utr.strand, maxReadLength, minQual)
            
            tcForwardCounts = [0] * utrNormFactor
            mutForwardCounts = [0] * utrNormFactor
            tcReverseCounts = [0] * utrNormFactor
            mutReverseCounts = [0] * utrNormFactor
            
            for read in readIterator:
                
                tcCounts = [0] * utrNormFactor
                mutCounts = [0] * utrNormFactor
                
                for mismatch in read.mismatches:
                             
                    mismatchPos = mismatch.referencePosition

                    # mismatchPos = read.startRefPos
                        
                    if (utr.strand == "+") :
                                                
                        # Keep only mismatches in the last utrNormFactor bases of the UTR
                        if (mismatchPos >= (utr.getLength() - utrNormFactor) and mismatchPos < utr.getLength()) :
                            # Map the reference position into the normalized window [0, utrNormFactor)
                            mismatchPos = utrNormFactor - (utr.getLength() - mismatchPos)
                            
                            if(mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)):
                                tcCounts[mismatchPos] += 1
                            else :
                                mutCounts[mismatchPos] += 1                    
                    else :
                        
                        if (mismatchPos >= 0 and mismatchPos < min(utr.getLength(), utrNormFactor)) :
                            if(mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)):
                                tcCounts[mismatchPos] += 1
                            else :
                                mutCounts[mismatchPos] += 1
                            
                if(read.direction == ReadDirection.Reverse):
                    
                    tcReverseCounts = sumLists(tcReverseCounts, tcCounts)
                    mutReverseCounts = sumLists(mutReverseCounts, mutCounts)
                    
                    start = max(0, min(min(utr.getLength(), utrNormFactor), read.startRefPos))
                    end = max(0, min(min(utr.getLength(), utrNormFactor), read.endRefPos))
                    
                    for i in range(start, end):
                        
                        totalUtrCountRev[i] += 1
                     
                else:
                            
                    tcForwardCounts = sumLists(tcForwardCounts, tcCounts)
                    mutForwardCounts = sumLists(mutForwardCounts, mutCounts)
                    
                    start = min(utr.getLength(), max(utr.getLength() - utrNormFactor, read.startRefPos))
                    end = min(utr.getLength(), max(utr.getLength() - utrNormFactor, read.endRefPos))
                
                    for i in range(start, end):
                        normPos = utrNormFactor - (utr.getLength() - i)
                        totalUtrCountFwd[normPos] += 1                 
                        
            tcPerPosFwd = sumLists(tcPerPosFwd, tcForwardCounts)
            allPerPosFwd = sumLists(allPerPosFwd, mutForwardCounts)
             
            tcPerPosRev = sumLists(tcPerPosRev, tcReverseCounts)
            allPerPosRev = sumLists(allPerPosRev, mutReverseCounts)
            
            counter += 1
            
            if (verbose and counter % 10000 == 0) :
                print("Handled " + str(counter) + " UTRs.", file=log)
    
        foTC = open(outputCSV, "w")
        
        print("# slamdunk tcperutr v" + __version__, file=foTC)
        
        reverseAllPerPosRev = allPerPosRev[::-1]
        reverseTcPerPosRev = tcPerPosRev[::-1]
        reverseTotalUtrCountRev = totalUtrCountRev[::-1]

        for i in range(0, utrNormFactor):
            print(allPerPosFwd[i], reverseAllPerPosRev[i], tcPerPosFwd[i], reverseTcPerPosRev[i], totalUtrCountFwd[i], reverseTotalUtrCountRev[i], sep='\t', file=foTC)
        foTC.close()
       
    if(not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per UTR position plot for file " + bam, file=log)
    else: 
        callR(getPlotter("conversion_per_read_position") + " -u -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
Example #23
0
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
        
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength
        
        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength
        
        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()
        
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        
        chromosomes = testFile.getChromosomes()
        
        for chromosome in chromosomes:
            readIterator = testFile.readsInChromosome(chromosome, minQual)
                
            for read in readIterator:
                
                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength
                
                for mismatch in read.mismatches:
                    if(mismatch.isTCMismatch(read.direction == ReadDirection.Reverse)):
                        tcCounts[mismatch.readPosition] += 1
                    else :
                        mutCounts[mismatch.readPosition] += 1
                        
                
                query_length = len(read.sequence)
                if(read.direction == ReadDirection.Reverse):
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    
                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    
                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1
                        

        foTC = open(outputCSV, "w")
        
        print("# slamdunk tcperreadpos v" + __version__, file=foTC)
        
        for i in range(0, maxReadLength):
            print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i], totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)
        foTC.close()
       
    if(not checkStep([outputCSV], [outputPDF], force)):
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
    else: 
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
Example #24
0
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose = True, force=False):
    
    if(printOnly or checkStep([inputBAM], [outputBAM], force)):
        
        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)
        
        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""
        
        duplicateBuffer = {}
        
        for read in samfile:
            
            cigar = read.cigarstring
            chrom = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if read.has_tag("TC"):
                tcflag = read.get_tag("TC")
            else:
                tcflag = 0
            
            if tcflag >= tcMutations:
                
                # New position reached: flush the reads buffered at the previous one
                if chrom != prevChr or start != prevStart:
                                
                    if prevChr != "":
                        for curSeq in duplicateBuffer:
                            for curCigar in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curCigar]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                    outfile.write(readEntry)
                        duplicateBuffer.clear()
                
                # Reads sharing position, sequence and CIGAR are duplicates;
                # only the first occurrence stays unflagged
                if seq not in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if cigar not in duplicateBuffer[seq]:
                    duplicateBuffer[seq][cigar] = list()
                if len(duplicateBuffer[seq][cigar]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][cigar].append(read)
                 
                prevChr = chrom
                prevStart = start
            
                processedReads += 1
            
        for seq in duplicateBuffer:
            for cigar in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][cigar]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                    outfile.write(readEntry)
        duplicateBuffer.clear()
        
        outfile.close()
        samfile.close()
                
        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (", file=log, end = "")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),file=log,end="")
        print(" compression rate)", file=log)
        
        pysamIndex(outputBAM)
        
    else:
        print("Skipped deduplication for " + inputBAM, file=log)
Example #25
0
def statsComputeOverallRatesPerUTR(referenceFile,
                                   bam,
                                   minBaseQual,
                                   strictTCs,
                                   outputCSV,
                                   outputPDF,
                                   utrBed,
                                   maxReadLength,
                                   log,
                                   printOnly=False,
                                   verbose=True,
                                   force=False):

    sampleInfo = getSampleInfo(bam)

    slamseqInfo = SlamSeqInfo(bam)

    if (not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        # UTR stats for MultiQC
        utrStats = dict()

        plotConversions = [
            'A>T',
            'A>G',
            'A>C',
            'C>A',
            'C>G',
            'C>T',
            'G>A',
            'G>C',
            'G>T',
            'T>A',
            'T>G',
            'T>C',
        ]

        for conversion in plotConversions:
            utrStats[conversion] = list()

        f = tempfile.NamedTemporaryFile(mode="w", delete=False)

        for utr in BedIterator(utrBed):

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minBaseQual)

            # Init
            totalRates = [0] * 25

            readCount = 0
            for read in readIterator:

                # In strict mode, skip reads that carry T->C mismatches but
                # were not classified as TC reads
                if not read.isTcRead and strictTCs and read.tcCount > 0:
                    continue

                # Compute rates for current read
                rates = read.conversionRates

                # Add rates from read to total rates
                totalRates = sumLists(totalRates, rates)
                readCount += 1

            print(utr.name,
                  utr.chromosome,
                  utr.start,
                  utr.stop,
                  utr.strand,
                  readCount,
                  "\t".join(str(x) for x in totalRates),
                  sep="\t",
                  file=f)

            # Aggregate per-UTR rates for MultiQC

            A_A = totalRates[0]
            A_C = totalRates[1]
            A_G = totalRates[2]
            A_T = totalRates[3]

            C_A = totalRates[5]
            C_C = totalRates[6]
            C_G = totalRates[7]
            C_T = totalRates[8]

            G_A = totalRates[10]
            G_C = totalRates[11]
            G_G = totalRates[12]
            G_T = totalRates[13]

            T_A = totalRates[15]
            T_C = totalRates[16]
            T_G = totalRates[17]
            T_T = totalRates[18]

            # Sum of all observed base->base counts; decides whether this
            # UTR has any data worth normalizing
            conversionSum = (A_A + A_C + A_G + A_T + C_A + C_C + C_G + C_T +
                             G_A + G_C + G_G + G_T + T_A + T_C + T_G + T_T)

            if utr.strand == "-":

                A_A, T_T = T_T, A_A
                G_G, C_C = C_C, G_G
                A_C, T_G = T_G, A_C
                A_G, T_C = T_C, A_G
                A_T, T_A = T_A, A_T
                C_A, G_T = G_T, C_A
                C_G, G_C = G_C, C_G
                C_T, G_A = G_A, C_T

            if conversionSum > 0:

                Asum = A_A + A_C + A_G + A_T
                Csum = C_A + C_C + C_G + C_T
                Gsum = G_A + G_C + G_G + G_T
                Tsum = T_A + T_C + T_G + T_T

                if Asum > 0:
                    A_T = A_T / float(Asum) * 100
                    A_G = A_G / float(Asum) * 100
                    A_C = A_C / float(Asum) * 100
                else:
                    A_T = 0
                    A_G = 0
                    A_C = 0
                if Csum > 0:
                    C_A = C_A / float(Csum) * 100
                    C_G = C_G / float(Csum) * 100
                    C_T = C_T / float(Csum) * 100
                else:
                    C_A = 0
                    C_G = 0
                    C_T = 0
                if Gsum > 0:
                    G_A = G_A / float(Gsum) * 100
                    G_C = G_C / float(Gsum) * 100
                    G_T = G_T / float(Gsum) * 100
                else:
                    G_A = 0
                    G_C = 0
                    G_T = 0
                if Tsum > 0:
                    T_A = T_A / float(Tsum) * 100
                    T_G = T_G / float(Tsum) * 100
                    T_C = T_C / float(Tsum) * 100
                else:
                    T_A = 0
                    T_G = 0
                    T_C = 0

                utrStats['A>T'].append(A_T)
                utrStats['A>G'].append(A_G)
                utrStats['A>C'].append(A_C)

                utrStats['C>A'].append(C_A)
                utrStats['C>G'].append(C_G)
                utrStats['C>T'].append(C_T)

                utrStats['G>A'].append(G_A)
                utrStats['G>T'].append(G_T)
                utrStats['G>C'].append(G_C)

                utrStats['T>A'].append(T_A)
                utrStats['T>G'].append(T_G)
                utrStats['T>C'].append(T_C)

        f.close()

        fo = open(outputCSV, "w")

        print("# slamdunk utrrates v" + __version__, file=fo)

        print("# Median-Conversions=", end="", file=fo)

        first = True
        for conversion in plotConversions:
            if (not first):
                print(',', file=fo, end="")
            else:
                first = False
            print(conversion + ":" + str(np.median(utrStats[conversion])),
                  file=fo,
                  end="")
        print(file=fo)

        print("Name",
              "Chr",
              "Start",
              "End",
              "Strand",
              "ReadCount",
              sep="\t",
              end="\t",
              file=fo)
        for i in range(0, 5):
            for j in range(0, 5):
                print(toBase[i].upper() + "_" + toBase[j].upper(),
                      end="",
                      file=fo)
                if (i != 4 or j != 4):
                    print("\t", end="", file=fo)
        print(file=fo)

        with open(f.name, "rb") as valueFile:
            fo.write(valueFile.read())

        fo.close()

    if (not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing global rate pdfs for file " + bam, file=log)
    else:
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(sampleInfo.Name, outputCSV, sep='\t', file=f)
        f.close()

        callR(getPlotter("globalRatePlotter") + " -f " + f.name + " -O " +
              outputPDF,
              log,
              dry=printOnly,
              verbose=verbose)
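
The indices used above (A_A = totalRates[0], T_C = totalRates[16], and so on) imply that totalRates is a flat 5x5 matrix over A, C, G, T, N, reference base by read base. A minimal sketch of that layout, assuming row-major order, which the indices above are consistent with:

bases = ["A", "C", "G", "T", "N"]

# Row-major index into the flat 5x5 conversion vector (reference -> read base)
def rateIndex(refBase, readBase):
    return bases.index(refBase) * 5 + bases.index(readBase)

assert rateIndex("A", "A") == 0   # A_A = totalRates[0]
assert rateIndex("T", "C") == 16  # T_C = totalRates[16]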
Example #26
0
def statsComputeTCContext(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
     
    if(not checkStep([bam, referenceFile], [outputCSV], force)):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Init
        # combinations = ["AT","CT","GT","TT","NT","AA","CA","GA","TA","NA"]
        frontCombinations = ["AT", "CT", "GT", "TT", "NT"]
        backCombinations = ["TA", "TC", "TG", "TT", "TN"]
         
        counts = {}
        counts['5prime'] = {}
        counts['3prime'] = {}
        counts['5prime']['fwd'] = {}
        counts['5prime']['rev'] = {}
        counts['3prime']['fwd'] = {}
        counts['3prime']['rev'] = {}
         
        for combination in frontCombinations :
            counts['5prime']['fwd'][combination] = 0
            counts['5prime']['rev'][combination] = 0
             
        for combination in backCombinations:
            counts['3prime']['fwd'][combination] = 0
            counts['3prime']['rev'][combination] = 0
             
        bamFile = pysam.AlignmentFile(bam, "rb")
         
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
         
        chromosomes = testFile.getChromosomes()
         
        for chromosome in chromosomes:
                 
            for read in bamFile.fetch(region=chromosome):
                 
                for i, base in enumerate(read.query_sequence):
                    # Forward reads: record the bases flanking each T
                    if base == "T" and not read.is_reverse:
                        frontContext = None
                        backContext = None
                        if i > 0:
                            frontContext = read.query_sequence[i - 1]
                        if i < len(read.query_sequence) - 1:
                            backContext = read.query_sequence[i + 1]
                         
                        if frontContext is not None:
                            counts['5prime']['fwd'][frontContext + "T"] += 1
                        if backContext is not None:
                            counts['3prime']['fwd']["T" + backContext] += 1
                             
                    # Reverse reads: a genomic T reads as A, the neighbors swap,
                    # and the context is complemented back to the T strand
                    if base == "A" and read.is_reverse:
                        frontContext = None
                        backContext = None
                        if i > 0:
                            backContext = read.query_sequence[i - 1]
                        if i < len(read.query_sequence) - 1:
                            frontContext = read.query_sequence[i + 1]
                         
                        if frontContext is not None:
                            counts['5prime']['rev'][complement(frontContext + "A")] += 1
                        if backContext is not None:
                            counts['3prime']['rev'][complement("A" + backContext)] += 1
         
        bamFile.close()

        # Print rates in correct format for plotting
        fo = open(outputCSV, "w")
         
        print("\t".join(frontCombinations), file=fo)
         
        frontFwdLine = ""
        frontRevLine = ""
        backFwdLine = ""
        backRevLine = ""
         
        for combination in frontCombinations :
            frontFwdLine += str(counts['5prime']['fwd'][combination]) + "\t"
            frontRevLine += str(counts['5prime']['rev'][combination]) + "\t"
         
        print(frontFwdLine.rstrip(), file=fo)
        print(frontRevLine.rstrip(), file=fo)
         
        print("\t".join(backCombinations), file=fo)
 
        for combination in backCombinations :
            backFwdLine += str(counts['3prime']['fwd'][combination]) + "\t"
            backRevLine += str(counts['3prime']['rev'][combination]) + "\t"
 
        print(backFwdLine.rstrip(), file=fo)
        print(backRevLine.rstrip(), file=fo)
         
        fo.close()
     
    if(not checkStep([bam, referenceFile], [outputPDF], force)):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)
        print(removeExtension(os.path.basename(bam)), outputCSV, sep='\t', file=f)
        f.close()
         
        callR(getPlotter("compute_context_TC_rates") + " -f " + f.name + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)