Example #1
def runCollapse(tid, tcount, outputDirectory) :
    outputTCOUNT = os.path.join(outputDirectory, replaceExtension(basename(tcount), ".csv", "_collapsed"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(tcount), ".log", "_collapsed"))
    log = getLogFile(outputLOG)
    tcounter.collapse(tcount, outputTCOUNT, log)
    closeLogFile(log)
    stepFinished()
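
Every example on this page builds its output paths with a replaceExtension helper from slamdunk's utility module. Its real implementation is not shown here, so the following is only a minimal sketch of the assumed behavior:

import os

def replaceExtension(path, newExtension, suffix=""):
    # Assumed behavior: drop the current extension, insert an optional
    # suffix, then append the new extension, e.g.
    # replaceExtension("sample.bam", ".csv", "_collapsed") -> "sample_collapsed.csv"
    base, _ = os.path.splitext(path)
    return base + suffix + newExtension
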
Example #2
def runSNPeval(tid, bam, ref, bed, maxLength, minQual, coverageCutoff,
               variantFraction, strictTCs, outputDirectory, snpDirectory):

    outputCSV = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".csv", "_SNPeval"))
    outputPDF = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".pdf", "_SNPeval"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".log", "_SNPeval"))

    if (not os.path.isdir(snpDirectory)):
        print("SNP directory does not exists. Abort.")
        sys.exit(0)

    inputSNP = os.path.join(snpDirectory,
                            replaceExtension(basename(bam), ".vcf", "_snp"))

    if (maxLength == None):
        maxLength = estimateMaxReadLength(bam)
    if (maxLength < 0):
        print(
            "Could not reliable estimate maximum read length. Please specify --max-read-length parameter."
        )
        sys.exit(0)

    log = getLogFile(outputLOG)

    print("Using " + str(maxLength) + " as maximum read length.", file=log)

    stats.computeSNPMaskedRates(ref, bed, inputSNP, bam, maxLength, minQual,
                                coverageCutoff, variantFraction, outputCSV,
                                outputPDF, strictTCs, log)
    closeLogFile(log)
    stepFinished()
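
Several examples fall back to estimateMaxReadLength(bam) and treat a negative return value as failure; the error message in the runCount example below suggests the estimate is refused when sampled read lengths differ by more than 10 bp. A hypothetical sketch of such a helper (not the project's actual implementation):

import pysam

def estimateMaxReadLength(bam, sampleSize=1000):
    # Sample read lengths from the BAM; return a padded maximum, or -1
    # when the spread is too large to trust (assumed 10 bp tolerance).
    lengths = []
    with pysam.AlignmentFile(bam, "rb") as bamFile:
        for i, read in enumerate(bamFile.fetch(until_eof=True)):
            if read.query_length:
                lengths.append(read.query_length)
            if i >= sampleSize:
                break
    if not lengths or max(lengths) - min(lengths) > 10:
        return -1
    return max(lengths) + 10
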
Example #3
def runTcPerUtr(tid, bam, referenceFile, bed, minMQ, maxReadLength,
                outputDirectory, snpDirectory, vcfFile):
    outputCSV = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".csv", "_tcperutr"))
    outputPDF = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".pdf", "_tcperutr"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".log", "_tcperutr"))

    if (vcfFile != None):
        inputSNP = vcfFile
    elif (snpDirectory != None):
        inputSNP = os.path.join(
            snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None

    if (maxReadLength == None):
        maxReadLength = estimateMaxReadLength(bam)
    if (maxReadLength < 0):
        print(
            "Could not reliable estimate maximum read length. Please specify --max-read-length parameter."
        )
        sys.exit(0)

    log = getLogFile(outputLOG)

    print("Using " + str(maxReadLength) + " as maximum read length.", file=log)

    stats.tcPerUtr(referenceFile, bed, bam, minMQ, maxReadLength, outputCSV,
                   outputPDF, inputSNP, log, False, True, True)

    closeLogFile(log)
    stepFinished()
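
The getLogFile/closeLogFile pair used throughout behaves like a thin wrapper around an open text file, which is why the log handle can be passed directly to print(..., file=log). A minimal stand-in, assuming no extra bookkeeping happens inside:

def getLogFile(path):
    # Open the per-step log for writing; print(..., file=log) then works as-is.
    return open(path, "w")

def closeLogFile(log):
    log.close()
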
Example #4
def runCount(tid, bam, ref, bed, maxLength, minQual, conversionThreshold,
             outputDirectory, snpDirectory):
    outputCSV = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".tsv", "_tcount"))
    outputBedgraphPlus = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".bedgraph", "_tcount_plus"))
    outputBedgraphMinus = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".bedgraph", "_tcount_mins"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".log", "_tcount"))
    if (snpDirectory != None):
        inputSNP = os.path.join(
            snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None

    if (maxLength == None):
        maxLength = estimateMaxReadLength(bam)
    if (maxLength < 0):
        print(
            "Difference between minimum and maximum read length is > 10. Please specify --max-read-length parameter."
        )
        sys.exit(0)

    log = getLogFile(outputLOG)

    print("Using " + str(maxLength) + " as maximum read length.", file=log)

    tcounter.computeTconversions(ref, bed, inputSNP, bam, maxLength, minQual,
                                 outputCSV, outputBedgraphPlus,
                                 outputBedgraphMinus, conversionThreshold, log)
    closeLogFile(log)
    stepFinished()
    return outputCSV
Example #5
def runStatsRatesUTR(tid, bam, referenceFile, minMQ, strictTCs,
                     outputDirectory, utrFile, maxReadLength):
    outputCSV = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".csv", "_mutationrates_utr"))
    outputPDF = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".pdf", "_mutationrates_utr"))
    outputLOG = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".log", "_mutationrates_utr"))

    if (maxReadLength == None):
        maxReadLength = estimateMaxReadLength(bam)
    if (maxReadLength < 0):
        print(
            "Could not reliable estimate maximum read length. Please specify --max-read-length parameter."
        )
        sys.exit(0)

    log = getLogFile(outputLOG)

    print("Using " + str(maxReadLength) + " as maximum read length.", file=log)

    stats.statsComputeOverallRatesPerUTR(referenceFile, bam, minMQ, strictTCs,
                                         outputCSV, outputPDF, utrFile,
                                         maxReadLength, log)
    closeLogFile(log)
    stepFinished()
Example #6
def reads(outputDirectory, bed, sampleName, readLength, readNumber, readCoverage,
          seqError, pulseTimePoint, chaseTimePoint, conversionRate, sampleInfo,
          labeledTranscripts=-1.0):
    message("Simulating read sample: " + sampleName)

    bed12File = replaceExtension(bed, ".bed12")
    bed12FastaFile = replaceExtension(bed, ".fa")
    explvFile = replaceExtension(bed, ".eplv")

    bedReads = os.path.join(outputDirectory, sampleName + "_reads_tmp.bed")
    faReads = os.path.join(outputDirectory, sampleName + "_reads_tmp.fa")

    totalUTRlength = simulator.getTotalUtrLength(bed12File)

    if(readNumber == 0):
        readNumber = (totalUTRlength / readLength) * readCoverage
        readNumber = int(readNumber * (random.uniform(-0.2, 0.2) + 1))

    #message("Simulating " + str(readNumber) + " reads with sequencing error of " + str(seqError))
    simulator.simulateReads(bed12File, bed12FastaFile, explvFile, bedReads, faReads, readLength, readNumber, seqError)

    bamReadsWithTC = os.path.join(outputDirectory, sampleName + "_reads.bam")
    utrSummary = os.path.join(outputDirectory, sampleName + "_utrsummary.tsv")

    simulator.addTcConversions(bed, faReads, bamReadsWithTC, pulseTimePoint, chaseTimePoint, utrSummary, conversionRate, readNumber, sampleInfo, labeledTranscripts)

    os.unlink(faReads)
    os.unlink(bedReads)
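
When readNumber is 0, the simulator derives it from the target coverage and jitters it by up to ±20%. A worked example with invented numbers:

import random

totalUTRlength = 2_000_000   # total bp across all UTRs (hypothetical)
readLength = 100
readCoverage = 30
readNumber = (totalUTRlength / readLength) * readCoverage      # 600000.0 at exact coverage
readNumber = int(readNumber * (random.uniform(-0.2, 0.2) + 1)) # anywhere in 480000..720000
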
Example #7
def runDedup(tid, bam, outputDirectory, tcMutations) :
    outputBAM = os.path.join(outputDirectory, replaceExtension(basename(bam), ".bam", "_dedup"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bam), ".log", "_dedup"))
    log = getLogFile(outputLOG)
    deduplicator.Dedup(bam, outputBAM, tcMutations, log)
    closeLogFile(log)
    stepFinished()
Example #8
def runHalfLifes(bams, timepoints, outputDirectory) :
    outputCSV = os.path.join(outputDirectory, replaceExtension(basename(bams[0]), ".tsv", "_halflifes"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bams[0]), ".log", "_halflifes"))
    log = getLogFile(outputLOG)
    stats.halflifes(",".join(bams), outputCSV, timepoints, log)
    closeLogFile(log)
    stepFinished()
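
A hypothetical call for a four-point time series; the file names are invented, and the comma-separated timepoint string is an assumption about what stats.halflifes expects:

bams = ["t0_tcount.bam", "t15_tcount.bam", "t30_tcount.bam", "t60_tcount.bam"]
runHalfLifes(bams, "0,15,30,60", "halflife_out")
# Output paths derive from the first BAM:
# halflife_out/t0_tcount_halflifes.tsv and halflife_out/t0_tcount_halflifes.log
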
Example #9
def readSummary(filteredFiles, countDirectory, outputFile, log, printOnly=False, verbose=True, force=False):
    
    # Print sort by ID
    contentDict = {}
    
    tsvFile = open(outputFile, "w")
    
    print("# slamdunk summary v" + __version__, file=tsvFile)

    if (countDirectory != None) :
        # Text mode: print() writes strings, and NamedTemporaryFile
        # defaults to binary, which would raise a TypeError.
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)

    for bam in filteredFiles:
        slamseqInfo = SlamSeqInfo(bam)
        sampleInfo = getSampleInfo(bam)
        
        if (countDirectory != None) :
            
            countedReads = 0
            
            countFile = os.path.join(countDirectory, replaceExtension(os.path.basename(bam), ".tsv", "_tcount"))
            if not os.path.exists(countFile):
                print("TCount directory does not seem to contain tcount file for:\t" + countFile)
            else :
                print(sampleInfo.Name, countFile, sep='\t', file=f)
                countedReads = sumCounts(countFile)
            
            if(int(sampleInfo.ID) in contentDict):
                ID = len(contentDict) + 1
            else:
                ID = sampleInfo.ID
        
            contentDict[int(ID)] = "\t".join([bam, sampleInfo.Name, sampleInfo.Type, sampleInfo.Time, str(slamseqInfo.SequencedReads), str(slamseqInfo.MappedReads), str(slamseqInfo.DedupReads), str(slamseqInfo.MQFilteredReads), str(slamseqInfo.IdFilteredReads), str(slamseqInfo.NmFilteredReads), str(slamseqInfo.MultimapperReads), str(slamseqInfo.FilteredReads), str(countedReads), slamseqInfo.AnnotationName])
        
        else :
            
            if(int(sampleInfo.ID) in contentDict):
                ID = len(contentDict) + 1
            else:
                ID = sampleInfo.ID
        
            contentDict[int(ID)] = "\t".join([bam, sampleInfo.Name, sampleInfo.Type, sampleInfo.Time, str(slamseqInfo.SequencedReads), str(slamseqInfo.MappedReads), str(slamseqInfo.DedupReads), str(slamseqInfo.MQFilteredReads), str(slamseqInfo.IdFilteredReads), str(slamseqInfo.NmFilteredReads), str(slamseqInfo.MultimapperReads), str(slamseqInfo.FilteredReads), slamseqInfo.AnnotationName])
          
    if (countDirectory != None) :
        
        f.close()
        
        callR(getPlotter("PCAPlotter") + " -f " + f.name + " -O " + replaceExtension(outputFile, ".pdf", "_PCA") + " -P " + replaceExtension(outputFile, ".txt", "_PCA"), log, dry=printOnly, verbose=verbose)
        
        print("FileName", "SampleName", "SampleType", "SampleTime", "Sequenced", "Mapped", "Deduplicated", "MQ-Filtered", "Identity-Filtered", "NM-Filtered", "Multimap-Filtered", "Retained", "Counted", "Annotation", sep="\t", file=tsvFile)
        
        
    else :
        print("FileName", "SampleName", "SampleType", "SampleTime", "Sequenced", "Mapped", "Deduplicated", "MQ-Filtered", "Identity-Filtered", "NM-Filtered", "Multimap-Filtered", "Retained", "Annotation", sep="\t", file=tsvFile)
            
    for key in sorted(contentDict):
        print(contentDict[key], file=tsvFile)
        
    tsvFile.close()
Example #10
def runStatsTCContext(tid, bam, referenceFile, minMQ, outputDirectory) :
    outputCSV = os.path.join(outputDirectory, replaceExtension(basename(bam), ".csv", "_tccontext"))
    outputPDF = os.path.join(outputDirectory, replaceExtension(basename(bam), ".pdf", "_tccontext"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bam), ".log", "_tccontext"))
    log = getLogFile(outputLOG)
    stats.statsComputeTCContext(referenceFile, bam, minMQ, outputCSV, outputPDF, log)
    closeLogFile(log)
    stepFinished()
Example #11
def runFilter(tid, bam, bed, mq, minIdentity, maxNM, outputDirectory):
    outputBAM = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".bam", "_filtered"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".log", "_filtered"))
    filter.Filter(bam, outputBAM, getLogFile(outputLOG), bed, mq, minIdentity,
                  maxNM, printOnly, verbose)
    stepFinished()
Example #12
def runMap(tid, inputBAMrun1, inputBAMrun2, referenceFile, threads, trim5p,
           maxPolyA, quantseqMapping, endtoendMapping, topn, sampleDescription,
           outputDirectory, skipSAM):
    if skipSAM:
        outputSAM = os.path.join(
            outputDirectory,
            replaceExtension(basename(inputBAMrun1), ".bam",
                             "_slamdunk_mapped"))
    else:
        outputSAM = os.path.join(
            outputDirectory,
            replaceExtension(basename(inputBAMrun1), ".sam",
                             "_slamdunk_mapped"))
    outputLOG = os.path.join(
        outputDirectory,
        replaceExtension(basename(inputBAMrun1), ".log", "_slamdunk_mapped"))

    #sampleName = "sample_" + str(tid)
    sampleName = replaceExtension(basename(inputBAMrun1), ".bam", "")
    sampleType = "NA"
    sampleTime = "-1"
    if (sampleDescription != ""):
        sampleDescriptions = sampleDescription.split(":")
        if (len(sampleDescriptions) >= 1):
            sampleName = sampleDescriptions[0]
        if (len(sampleDescriptions) >= 2):
            typeDict = {
                'p': 'pulse',
                'c': 'chase',
                'pulse': 'pulse',
                'chase': 'chase',
                '': 'NA'
            }
            if sampleDescriptions[1] in typeDict:
                sampleType = typeDict[sampleDescriptions[1]]
            else:
                sampleType = sampleDescriptions[1]
        if (len(sampleDescriptions) >= 3):
            sampleTime = sampleDescriptions[2]

    mapper.Map(inputBAMrun1,
               inputBAMrun2,
               referenceFile,
               outputSAM,
               getLogFile(outputLOG),
               quantseqMapping,
               endtoendMapping,
               threads=threads,
               trim5p=trim5p,
               maxPolyA=maxPolyA,
               topn=topn,
               sampleId=tid,
               sampleName=sampleName,
               sampleType=sampleType,
               sampleTime=sampleTime,
               printOnly=printOnly,
               verbose=verbose)
    stepFinished()
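
A hypothetical call showing how the colon-separated sampleDescription is unpacked (paths and values invented for illustration):

runMap(tid=0,
       inputBAMrun1="mESC_rep1.fq.gz",
       inputBAMrun2=None,
       referenceFile="genome.fa",
       threads=4, trim5p=12, maxPolyA=4,
       quantseqMapping=False, endtoendMapping=False, topn=1,
       sampleDescription="mESC_rep1:p:60",
       outputDirectory="map_out", skipSAM=False)
# -> sampleName="mESC_rep1", sampleType="pulse" (via typeDict), sampleTime="60"
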
Example #13
    def Utrs(outputDirectory, bed, referenceFasta, readLength, polyALength, snpRate):
        message("Simulating UTRs")
        createDir(outputDirectory)
        bed12 = os.path.join(outputDirectory, replaceExtension(basename(bed), ".bed12", "_utrs"))
        bed12Fasta = os.path.join(outputDirectory, replaceExtension(basename(bed), ".fa", "_utrs"))
        explv = os.path.join(outputDirectory, replaceExtension(basename(bed), ".eplv", "_utrs"))
        vcfFile = os.path.join(outputDirectory, replaceExtension(basename(bed), ".vcf", "_utrs"))

        totalUTRlength = simulator.prepareUTRs(bed, bed12, bed12Fasta, referenceFasta, readLength, polyALength, explv, snpRate, vcfFile)
Example #14
def runSnp(tid, referenceFile, minCov, minVarFreq, minQual, inputBAM,
           outputDirectory):
    outputSNP = os.path.join(
        outputDirectory, replaceExtension(basename(inputBAM), ".vcf", "_snp"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(inputBAM), ".log", "_snp"))
    snps.SNPs(inputBAM, outputSNP, referenceFile, minVarFreq, minCov, minQual,
              getLogFile(outputLOG), printOnly, verbose, False)
    stepFinished()
Example #15
def runReadSeparator(tid, bam, ref, minQual, conversionThreshold, outputDirectory, snpDirectory) :
    outputBAM = os.path.join(outputDirectory, replaceExtension(basename(bam), "", ""))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bam), ".log", "_read_separator"))
    if(snpDirectory != None):
        inputSNP = os.path.join(snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None

    log = getLogFile(outputLOG)

    tcounter.genomewideReadSeparation(ref, inputSNP, bam, minQual, outputBAM, conversionThreshold, log)
    closeLogFile(log)
    stepFinished()
Example #16
def runPositionalRates(tid, bam, ref, minQual, conversionThreshold, coverageCutoff, outputDirectory, snpDirectory) :
    outputBedGraphPrefix = os.path.join(outputDirectory, replaceExtension(basename(bam), "", "_positional_rates"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bam), ".log", "_positional_rates"))
    if(snpDirectory != None):
        inputSNP = os.path.join(snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None

    log = getLogFile(outputLOG)

    tcounter.genomewideConversionRates(ref, inputSNP, bam, minQual, outputBedGraphPrefix, conversionThreshold, coverageCutoff, log)
    closeLogFile(log)
    stepFinished()
Example #17
def runSam2Bam(tid, bam, threads, outputDirectory):
    inputSAM = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".sam", "_slamdunk_mapped"))
    outputBAM = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".bam", "_slamdunk_mapped"))
    outputLOG = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".log", "_slamdunk_mapped"))
    mapper.sort(inputSAM, outputBAM, getLogFile(outputLOG), threads, False,
                printOnly, verbose)
    stepFinished()
Example #18
def runDumpReadInfo(tid, bam, referenceFile, minMQ, outputDirectory, snpDirectory):
    outputCSV = os.path.join(outputDirectory, replaceExtension(basename(bam), ".sdunk", "_readinfo"))
    outputLOG = os.path.join(outputDirectory, replaceExtension(basename(bam), ".log", "_readinfo"))
    if(snpDirectory != None):
        inputSNP = os.path.join(snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None
    log = getLogFile(outputLOG)

    dump.dumpReadInfo(referenceFile, bam, minMQ, outputCSV, inputSNP, log)

    closeLogFile(log)
    stepFinished()
Example #19
File: mapper.py Project: pforai/slamdunk
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2" , outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False):

    if(quantseqMapping is True) :
        parameter = "--no-progress"
            
    if(trim5p > 0):
        parameter = parameter + " -5 " + str(trim5p)
    
    if(maxPolyA > -1):
        parameter = parameter + " --max-polya " + str(maxPolyA)
    
    if(endtoendMapping is True):
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "

    if(sampleId != None):    
        parameter = parameter + " --rg-id " + str(sampleId)
        if(sampleName != ""):
            parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)
    
    if(topn > 1):
        parameter = parameter + " -n " + str(topn) + " --strata "
        
    if(checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force)):
        if outputSAM.endswith(".sam"):
            # Output SAM
            run(getBinary("ngm") + " -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly)
        else:
            # Output BAM directly
            run(getBinary("ngm") + " -b -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly)        
    else:
        print("Skipped mapping for " + inputBAM, file=log)
Example #20
def turnOver(outputDirectory, bed, minHalfLife, maxHalfLife, skipTurnover=False):
    message("Simulating turnover")
    createDir(outputDirectory)
    turnoverBed = os.path.join(outputDirectory, replaceExtension(basename(bed), ".bed", "_utrs"))
    if not skipTurnover:
        simulator.simulateTurnOver(bed, turnoverBed, minHalfLife, maxHalfLife)
    else:
        copyfile(bed, turnoverBed)
Example #21
def Map(inputBAM,
        inputReference,
        outputSAM,
        log,
        quantseqMapping,
        endtoendMapping,
        threads=1,
        parameter="--no-progress --slam-seq 2",
        outputSuffix="_ngm_slamdunk",
        trim5p=0,
        maxPolyA=-1,
        topn=1,
        sampleId=None,
        sampleName="NA",
        sampleType="NA",
        sampleTime=0,
        printOnly=False,
        verbose=True,
        force=False,
        isPaired=False):
    if quantseqMapping:
        parameter = "--no-progress"

    if trim5p > 0:
        parameter = parameter + " -5 " + str(trim5p)

    if maxPolyA > -1:
        parameter = parameter + " --max-polya " + str(maxPolyA)

    if endtoendMapping:
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "

    if sampleId is not None:
        parameter = parameter + " --rg-id " + str(sampleId)
        if sampleName != "":
            parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(
                sampleTime)

    if topn > 1:
        parameter = parameter + " -n " + str(topn) + " --strata "

    files = [inputReference]
    if not isPaired:
        files.append(inputBAM)
    else:
        files.extend(inputBAM)
    files = [os.path.expanduser(p) for p in files]
    if checkStep(files, [replaceExtension(outputSAM, ".bam")], force):
        cmd = "ngm %s -r %s %s -t %s %s -o %s" % (
            "" if outputSAM.endswith(".sam") else "-b", files[0],
            "-q %s" % files[1] if not isPaired else "-1 %s -2 %s" %
            (files[1], files[2]), threads, parameter, outputSAM)
        run(cmd, log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped mapping for " +
              inputBAM if not isPaired else inputBAM[0],
              file=log)
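
A hypothetical paired-end call: inputBAM becomes a two-element list and is expanded to ngm's -1/-2 arguments (paths invented; checkStep still decides whether the mapping actually runs):

log = open("map.log", "w")
Map(["reads_R1.fq.gz", "reads_R2.fq.gz"], "genome.fa", "sample.sam", log,
    quantseqMapping=False, endtoendMapping=False,
    threads=4, isPaired=True)
log.close()
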
Example #22
def processFilter(bam, mq, identity, nm, bed, paired, outputDirectory, n):
    dunkPath = os.path.join(outputDirectory, "filter")
    createDir(dunkPath)
    message("Running slamDunk filter for %s files (%s threads)" %
            (len(bam), n))
    _ = Parallel(n_jobs=n, verbose=verbose)(
        delayed(runFilter)(bam[tid], bed, mq, identity, nm, paired, dunkPath)
        for tid in range(0, len(bam)))
    dunkFinished()
    return dunkPath, [
        os.path.join(dunkPath,
                     replaceExtension(basename(b), ".bam", "_filtered"))
        for b in bam
    ]
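
The Parallel/delayed idiom from joblib used here runs one runFilter call per BAM across n worker processes. A self-contained illustration of the same pattern:

from joblib import Parallel, delayed

def square(x):
    return x * x

# One task per input, fanned across 4 workers; results keep input order.
results = Parallel(n_jobs=4)(delayed(square)(i) for i in range(8))
assert results == [0, 1, 4, 9, 16, 25, 36, 49]
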
Example #23
def computeTconversions(ref,
                        bed,
                        snpsFile,
                        bam,
                        maxReadLength,
                        minQual,
                        outputCSV,
                        outputBedgraphPlus,
                        outputBedgraphMinus,
                        conversionThreshold,
                        log,
                        mle=False):

    referenceFile = pysam.FastaFile(ref)

    sampleInfo = getSampleInfo(bam)

    slamseqInfo = SlamSeqInfo(bam)
    #readNumber = slamseqInfo.MappedReads
    readNumber = slamseqInfo.FilteredReads

    bedMD5 = md5(bed)

    if (mle):
        fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
        fileTest = open(fileNameTest, 'w')
        print("#slamdunk v" + __version__,
              __count_version__,
              "sample info:",
              sampleInfo.Name,
              sampleInfo.ID,
              sampleInfo.Type,
              sampleInfo.Time,
              sep="\t",
              file=fileTest)
        print("#annotation:",
              os.path.basename(bed),
              bedMD5,
              sep="\t",
              file=fileTest)
        #print("utr", "n", "k", file=fileTest)
        print(SlamSeqInterval.Header, file=fileTest)

    fileCSV = open(outputCSV, 'w')
    print("#slamdunk v" + __version__,
          __count_version__,
          "sample info:",
          sampleInfo.Name,
          sampleInfo.ID,
          sampleInfo.Type,
          sampleInfo.Time,
          sep="\t",
          file=fileCSV)
    print("#annotation:",
          os.path.basename(bed),
          bedMD5,
          sep="\t",
          file=fileCSV)
    print(SlamSeqInterval.Header, file=fileCSV)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    #Go through one chr after the other
    testFile = SlamSeqBamFile(bam, ref, snps)
    if not testFile.bamVersion == __bam_version__:
        raise RuntimeError("Wrong filtered BAM file version detected (" +
                           testFile.bamVersion + "). Expected version " +
                           __bam_version__ + ". Please rerun slamdunk filter.")

    bedMD5 = md5(bed)
    if slamseqInfo.AnnotationMD5 != bedMD5:
        print(
            "Warning: MD5 checksum of annotation (" + bedMD5 +
            ") does not match MD5 in filtered BAM files (" +
            slamseqInfo.AnnotationMD5 +
            "). Most probably the annotation file changed after the filtered BAM files were created.",
            file=log)

    conversionBedGraph = {}

    for utr in BedIterator(bed):
        Tcontent = 0
        slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                     utr.strand, utr.name, Tcontent, 0, 0, 0,
                                     0, 0, 0, 0)
        slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                        utr.strand, utr.name, Tcontent, 0, 0,
                                        0, 0, 0, 0, 0)
        if (not utr.hasStrand()):
            raise RuntimeError(
                "Input BED file does not contain stranded intervals.")

        if utr.start < 0:
            raise RuntimeError(
                "Negative start coordinate found. Please check the following entry in your BED file: "
                + str(utr))
        # Retreive reference sequence
        region = utr.chromosome + ":" + str(utr.start + 1) + "-" + str(
            utr.stop)

        if (utr.chromosome in list(referenceFile.references)):
            #print(refRegion,file=sys.stderr)
            # pysam-0.15.0.1
            #refSeq = referenceFile.fetch(region=region).upper()
            refSeq = referenceFile.fetch(reference=utr.chromosome,
                                         start=utr.start,
                                         end=utr.stop).upper()
            if (utr.strand == "-"):
                #refSeq = complement(refSeq[::-1])
                Tcontent = refSeq.count("A")
            else:
                Tcontent = refSeq.count("T")

            slamSeqUtr._Tcontent = Tcontent

        readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                             utr.stop, utr.strand,
                                             maxReadLength, minQual,
                                             conversionThreshold)

        tcCountUtr = [0] * utr.getLength()
        coverageUtr = [0] * utr.getLength()

        tInReads = []
        tcInRead = []

        countFwd = 0
        tcCountFwd = 0
        countRev = 0
        tCountRev = 0

        multiMapFwd = 0
        multiMapRev = 0

        for read in readIterator:

            # Overwrite any conversions for non-TC reads (reads with < 2 TC conversions)
            if (not read.isTcRead):
                read.tcCount = 0
                read.mismatches = []
                read.conversionRates = 0.0
                read.tcRate = 0.0

            if (read.direction == ReadDirection.Reverse):
                countRev += 1
                if read.tcCount > 0:
                    tCountRev += 1
                if read.isMultimapper:
                    multiMapRev += 1
            else:
                countFwd += 1
                if read.tcCount > 0:
                    tcCountFwd += 1
                if read.isMultimapper:
                    multiMapFwd += 1

            for mismatch in read.mismatches:
                if (mismatch.isTCMismatch(
                        read.direction == ReadDirection.Reverse)
                        and mismatch.referencePosition >= 0
                        and mismatch.referencePosition < utr.getLength()):
                    tcCountUtr[mismatch.referencePosition] += 1

            testN = read.getTcount()
            testk = 0
            for mismatch in read.mismatches:
                if (mismatch.referencePosition >= 0
                        and mismatch.referencePosition < utr.getLength()):
                    if (mismatch.isT(read.direction == ReadDirection.Reverse)):
                        testN += 1
                    if (mismatch.isTCMismatch(
                            read.direction == ReadDirection.Reverse)):
                        testk += 1
            #print(utr.name, read.name, read.direction, testN, testk, read.sequence, sep="\t")
            tInReads.append(testN)
            tcInRead.append(testk)
            #print(utr.name, testN, testk, sep="\t", file=fileTest)

            for i in range(read.startRefPos, read.endRefPos):
                if (i >= 0 and i < utr.getLength()):
                    coverageUtr[i] += 1

        if ((utr.strand == "+" and countFwd > 0)
                or (utr.strand == "-" and countRev > 0)):
            tcRateUtr = [
                x * 100.0 / y if y > 0 else 0
                for x, y in zip(tcCountUtr, coverageUtr)
            ]

            readCount = countFwd
            tcReadCount = tcCountFwd
            multiMapCount = multiMapFwd

            if (utr.strand == "-"):
                readCount = countRev
                tcReadCount = tCountRev
                multiMapCount = multiMapRev

            if ((utr.strand == "-" and countFwd > countRev)
                    or (utr.strand == "+" and countRev > countFwd)):
                print(
                    "Warning: " + utr.name + " is located on the " +
                    utr.strand +
                    " strand but read counts are higher for the opposite strand (fwd: "
                    + str(countFwd) + ", rev: " + str(countRev) + ")",
                    file=sys.stderr)

            refSeq = readIterator.getRefSeq()

            # Get number of covered Ts/As in the UTR and compute average conversion rate for all covered Ts/As
            coveredTcount = 0
            avgConversionRate = 0
            coveredPositions = 0
            # Get number of reads on T positions and number of reads with T->C conversions on T positions
            coverageOnTs = 0
            conversionsOnTs = 0

            for position in range(0, len(coverageUtr)):

                if (coverageUtr[position] > 0
                        and ((utr.strand == "+" and refSeq[position] == "T") or
                             (utr.strand == "-" and refSeq[position] == "A"))):
                    coveredTcount += 1
                    avgConversionRate += tcRateUtr[position]

                    coverageOnTs += coverageUtr[position]
                    conversionsOnTs += tcCountUtr[position]
                    conversionBedGraph[utr.chromosome + ":" +
                                       str(utr.start + position) + ":" +
                                       str(utr.strand)] = tcRateUtr[position]
                if (coverageUtr[position] > 0):
                    coveredPositions += 1

            if (coveredTcount > 0):
                avgConversionRate = avgConversionRate / coveredTcount
            else:
                avgConversionRate = 0

            # reads per million mapped to the UTR
            readsCPM = 0
            if (readNumber > 0):
                readsCPM = readCount * 1000000.0 / readNumber

            # Convert to SlamSeqInterval and print
            conversionRate = 0
            if (coverageOnTs > 0):
                conversionRate = float(conversionsOnTs) / float(coverageOnTs)
            slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                         utr.strand, utr.name, Tcontent,
                                         readsCPM, coverageOnTs,
                                         conversionsOnTs, conversionRate,
                                         readCount, tcReadCount, multiMapCount)
            slamSeqUtrMLE = SlamSeqInterval(
                utr.chromosome, utr.start, utr.stop, utr.strand, utr.name,
                Tcontent, readsCPM, coverageOnTs, conversionsOnTs,
                conversionRate, ",".join(str(x) for x in tInReads),
                ",".join(str(x) for x in tcInRead), multiMapCount)

        print(slamSeqUtr, file=fileCSV)
        if (mle):
            print(slamSeqUtrMLE, file=fileTest)

    fileCSV.close()
    if (mle):
        fileTest.close()

    fileBedgraphPlus = open(outputBedgraphPlus, 'w')
    fileBedgraphMinus = open(outputBedgraphMinus, 'w')

    for position in conversionBedGraph:
        positionData = position.split(":")
        if (positionData[2] == "+"):
            print(positionData[0],
                  positionData[1],
                  int(positionData[1]) + 1,
                  conversionBedGraph[position],
                  file=fileBedgraphPlus)
        else:
            print(positionData[0],
                  positionData[1],
                  int(positionData[1]) + 1,
                  conversionBedGraph[position],
                  file=fileBedgraphMinus)

    fileBedgraphPlus.close()
    fileBedgraphMinus.close()

    if (mle):
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        callR(
            getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest +
            " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")
Example #24
def readSummary(filteredFiles,
                countDirectory,
                outputFile,
                log,
                printOnly=False,
                verbose=True,
                force=False):

    # Print sort by ID
    contentDict = {}

    tsvFile = open(outputFile, "w")

    print("# slamdunk summary v" + __version__, file=tsvFile)

    if (countDirectory != None):
        # Text mode: NamedTemporaryFile defaults to binary, which would
        # make print(..., file=f) raise a TypeError.
        f = tempfile.NamedTemporaryFile(mode="w", delete=False)

    for bam in filteredFiles:
        slamseqInfo = SlamSeqInfo(bam)
        sampleInfo = getSampleInfo(bam)

        if (countDirectory != None):

            countedReads = 0

            countFile = os.path.join(
                countDirectory,
                replaceExtension(os.path.basename(bam), ".tsv", "_tcount"))
            if not os.path.exists(countFile):
                print(
                    "TCount directory does not seem to contain tcount file for:\t"
                    + countFile)
            else:
                print(sampleInfo.Name, countFile, sep='\t', file=f)
                countedReads = sumCounts(countFile)

            if (int(sampleInfo.ID) in contentDict):  # keys are stored as ints
                ID = len(contentDict) + 1
            else:
                ID = sampleInfo.ID

            contentDict[int(ID)] = "\t".join([
                bam, sampleInfo.Name, sampleInfo.Type, sampleInfo.Time,
                str(slamseqInfo.SequencedReads),
                str(slamseqInfo.MappedReads),
                str(slamseqInfo.DedupReads),
                str(slamseqInfo.MQFilteredReads),
                str(slamseqInfo.IdFilteredReads),
                str(slamseqInfo.NmFilteredReads),
                str(slamseqInfo.MultimapperReads),
                str(slamseqInfo.FilteredReads),
                str(countedReads), slamseqInfo.AnnotationName
            ])

        else:

            if (int(sampleInfo.ID) in contentDict):  # keys are stored as ints
                ID = len(contentDict) + 1
            else:
                ID = sampleInfo.ID

            contentDict[int(ID)] = "\t".join([
                bam, sampleInfo.Name, sampleInfo.Type, sampleInfo.Time,
                str(slamseqInfo.SequencedReads),
                str(slamseqInfo.MappedReads),
                str(slamseqInfo.DedupReads),
                str(slamseqInfo.MQFilteredReads),
                str(slamseqInfo.IdFilteredReads),
                str(slamseqInfo.NmFilteredReads),
                str(slamseqInfo.MultimapperReads),
                str(slamseqInfo.FilteredReads), slamseqInfo.AnnotationName
            ])

    if (countDirectory != None):

        f.close()

        callR(getPlotter("PCAPlotter") + " -f " + f.name + " -O " +
              replaceExtension(outputFile, ".pdf", "_PCA") + " -P " +
              replaceExtension(outputFile, ".txt", "_PCA"),
              log,
              dry=printOnly,
              verbose=verbose)

        print("FileName",
              "SampleName",
              "SampleType",
              "SampleTime",
              "Sequenced",
              "Mapped",
              "Deduplicated",
              "MQ-Filtered",
              "Identity-Filtered",
              "NM-Filtered",
              "Multimap-Filtered",
              "Retained",
              "Counted",
              "Annotation",
              sep="\t",
              file=tsvFile)

    else:
        print("FileName",
              "SampleName",
              "SampleType",
              "SampleTime",
              "Sequenced",
              "Mapped",
              "Deduplicated",
              "MQ-Filtered",
              "Identity-Filtered",
              "NM-Filtered",
              "Multimap-Filtered",
              "Retained",
              "Annotation",
              sep="\t",
              file=tsvFile)

    for key in sorted(contentDict):
        print(contentDict[key], file=tsvFile)

    tsvFile.close()
Example #25
def runCount(
    bam,
    ref,
    bed,
    maxLength,
    minQual,
    conversionThreshold,
    is_inverse,
    outputDirectory,
    snpDirectory,
    vcfFile,
):
    outputCSV = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".tsv", "_tcount"))
    outputBedgraphPlus = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".bedgraph", "_tcount_plus"))
    outputBedgraphMinus = os.path.join(
        outputDirectory,
        replaceExtension(basename(bam), ".bedgraph", "_tcount_mins"))
    outputLOG = os.path.join(
        outputDirectory, replaceExtension(basename(bam), ".log", "_tcount"))

    if vcfFile is not None:
        inputSNP = vcfFile
    elif snpDirectory is not None:
        inputSNP = os.path.join(
            snpDirectory, replaceExtension(basename(bam), ".vcf", "_snp"))
    else:
        inputSNP = None

    if maxLength is None:
        maxLength = estimateMaxReadLength(bam)
    if maxLength < 0:
        print("Difference between minimum and maximum read length is > 10. "
              "Please specify --max-read-length parameter.")
        sys.exit(0)

    log = getLogFile(outputLOG)
    print("Using " + str(maxLength) + " as maximum read length.", file=log)
    if bed is not None:
        message("Bed file detected.")
        tcounter.computeTconversions(ref, bed, inputSNP, bam, maxLength,
                                     minQual, outputCSV, outputBedgraphPlus,
                                     outputBedgraphMinus, conversionThreshold,
                                     log)
    else:
        message("No bed file passed. Count w.r.t. the full genome.")
        outputBedgraphPlusNew = os.path.join(
            outputDirectory,
            replaceExtension(basename(bam), ".bedgraph", "_tcount_plus_new"))
        outputBedgraphMinusNew = os.path.join(
            outputDirectory,
            replaceExtension(basename(bam), ".bedgraph", "_tcount_mins_new"))
        tcounter.computeTconversionsAll(ref, inputSNP, bam, outputBedgraphPlus,
                                        outputBedgraphPlusNew,
                                        outputBedgraphMinus,
                                        outputBedgraphMinusNew,
                                        conversionThreshold, minQual,
                                        is_inverse, log)
    closeLogFile(log)
    stepFinished()
    return outputCSV
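
For reference, each line the bedgraph writers emit is a standard four-column bedGraph interval (chromosome, 0-based start, exclusive end, value), one position per line, e.g.:

chrI 1041 1042 12.5
chrI 1042 1043 0.0
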
Example #26
def runAll(args):
    message("slamdunk all")

    if args.sampleIndex > -1:
        sec = random.randrange(200, 2000) / 1000.0
        message("Waiting " + str(sec) + " seconds")
        sleep(sec)

    # Setup slamdunk run folder

    outputDirectory = args.outputDir
    createDir(outputDirectory)

    n = args.threads
    referenceFile = args.referenceFile

    # Run mapper dunk

    dunkPath = os.path.join(outputDirectory, "map")
    createDir(dunkPath)

    samples, samplesInfos = getSamples(args.files, runOnly=args.sampleIndex)

    message("Running slamDunk map for " + str(len(samples)) + " files (" +
            str(n) + " threads)")

    for i in range(0, len(samples)):
        bam = samples[i]
        sampleInfo = samplesInfos[i]
        tid = i
        if args.sampleIndex > -1:
            tid = args.sampleIndex
        runMap(tid, bam, referenceFile, n, args.trim5, args.maxPolyA,
               args.quantseq, args.endtoend, args.topn, sampleInfo, dunkPath,
               args.skipSAM)

    dunkFinished()

    if (not args.skipSAM):
        message("Running slamDunk sam2bam for " + str(len(samples)) +
                " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=1, verbose=verbose)(
            delayed(runSam2Bam)(tid, samples[tid], n, dunkPath)
            for tid in range(0, len(samples)))
        dunkFinished()

    dunkbufferIn = []

    for file in samples:
        dunkbufferIn.append(
            os.path.join(
                dunkPath,
                replaceExtension(basename(file), ".bam", "_slamdunk_mapped")))

    # Run filter dunk

    bed = args.bed

    if args.filterbed:
        bed = args.filterbed
        args.multimap = True

    if (not args.multimap):
        bed = None

    dunkPath = os.path.join(outputDirectory, "filter")
    createDir(dunkPath)

    message("Running slamDunk filter for " + str(len(samples)) + " files (" +
            str(n) + " threads)")
    results = Parallel(n_jobs=n, verbose=verbose)(
        delayed(runFilter)(tid, dunkbufferIn[tid], bed, args.mq, args.identity,
                           args.nm, dunkPath)
        for tid in range(0, len(samples)))

    dunkFinished()

    # Collect filtered BAM paths for the next dunk

    dunkbufferOut = []

    for file in dunkbufferIn:
        dunkbufferOut.append(
            os.path.join(dunkPath,
                         replaceExtension(basename(file), ".bam",
                                          "_filtered")))

    dunkbufferIn = dunkbufferOut

    dunkbufferOut = []

    dunkFinished()

    # Run snps dunk

    dunkPath = os.path.join(outputDirectory, "snp")
    createDir(dunkPath)

    minCov = args.cov
    minVarFreq = args.var

    snpThread = n
    if (snpThread > 1):
        snpThread = int(snpThread / 2)

    #if (args.minQual == 0) :
    #    snpqual = 13
    #else :
    snpqual = args.minQual

    message("Running slamDunk SNP for " + str(len(samples)) + " files (" +
            str(snpThread) + " threads)")
    results = Parallel(n_jobs=snpThread, verbose=verbose)(
        delayed(runSnp)(tid, referenceFile, minCov, minVarFreq, snpqual,
                        dunkbufferIn[tid], dunkPath)
        for tid in range(0, len(samples)))

    dunkFinished()

    # Run count dunk

    dunkPath = os.path.join(outputDirectory, "count")
    createDir(dunkPath)

    snpDirectory = os.path.join(outputDirectory, "snp")

    message("Running slamDunk tcount for " + str(len(samples)) + " files (" +
            str(n) + " threads)")
    results = Parallel(n_jobs=n, verbose=verbose)(
        delayed(runCount)(tid, dunkbufferIn[tid], referenceFile, args.bed,
                          args.maxLength, args.minQual,
                          args.conversionThreshold, dunkPath, snpDirectory)
        for tid in range(0, len(samples)))

    dunkFinished()
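
Assuming the argument parser shown in the final example, a typical end-to-end invocation of this pipeline might look like the following (the all subparser is truncated below, so the exact flags are an assumption based on the map/count subcommands):

slamdunk all -r genome.fa -b utrs_3p.bed -o results -t 8 sample1.fq.gz sample2.fq.gz
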
Example #27
    def prepareBed(outputDirectory, bed, readLength):

        createDir(outputDirectory)
        slamSimBed = os.path.join(outputDirectory, replaceExtension(basename(bed), ".bed", "_original"))
        simulator.prepareBED(bed, slamSimBed, readLength)
Example #28
def computeTconversionsAll(
        ref,
        snpsFile,
        bam,
        outputBedgraphPlus,
        outputBedgraphPlusNew,
        outputBedgraphMinus,
        outputBedgraphMinusNew,
        conversionThreshold,
        minQual,
        is_inverse,
        log,
):
    def to_bed_graph(c, data, bedgraph, rn):
        # Normalize to reads per million, then write one bedGraph line per position.
        data /= rn
        data *= 1000000.0
        for i, d in enumerate(data):
            print(c, i, i + 1, d, file=bedgraph)

    chroms_fw = {
        'chrI': np.zeros(230218).astype('float32'),
        'chrII': np.zeros(813184).astype('float32'),
        'chrIII': np.zeros(316620).astype('float32'),
        'chrIV': np.zeros(1531933).astype('float32'),
        'chrIX': np.zeros(439888).astype('float32'),
        'chrM': np.zeros(85779).astype('float32'),
        'chrV': np.zeros(576874).astype('float32'),
        'chrVI': np.zeros(270161).astype('float32'),
        'chrVII': np.zeros(1090940).astype('float32'),
        'chrVIII': np.zeros(562643).astype('float32'),
        'chrX': np.zeros(745751).astype('float32'),
        'chrXI': np.zeros(666816).astype('float32'),
        'chrXII': np.zeros(1078177).astype('float32'),
        'chrXIII': np.zeros(924431).astype('float32'),
        'chrXIV': np.zeros(784333).astype('float32'),
        'chrXV': np.zeros(1091291).astype('float32'),
        'chrXVI': np.zeros(948066).astype('float32')
    }
    chroms_bw = copy.deepcopy(chroms_fw)
    chroms_fw_new = copy.deepcopy(chroms_fw)
    chroms_bw_new = copy.deepcopy(chroms_fw)
    readNumber, positiveCount, negativeCount, positiveCountNew, negativeCountNew = 0, 0, 0, 0, 0
    bamFile = pysam.AlignmentFile(bam, "rb")
    if bamFile.header['HD']['SO'] != 'queryname':
        # Sort bam file
        sbam = replaceExtension(bam, '.bam', '_sorted')
        if not os.path.exists(sbam):
            run(
                'samtools sort -n %s -o %s' % (bam, sbam),
                log
            )
    else:
        sbam = bam

    bamFile = pysam.AlignmentFile(sbam, "rb")
    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    seqIter = SlamSeqIter(bamFile, ref, snps, conversionThreshold, minQual)
    read1 = None
    read2 = None
    for read in seqIter:
        if not read.isPaired or read.unmappedMate or read.duplicate:
            continue
        if read.isSecondRead:
            read2 = read
        else:
            read1 = read
            read2 = None
            continue
        if read1 is None or read2 is None or read1.queryName != read2.queryName:
            continue
        readNumber += 1
        chrom = read1.chromosome
        start = np.minimum(read1.startRefPos, read2.startRefPos)
        end = np.maximum(read1.endRefPos, read2.endRefPos)
        is_tc_read = read1.isTcRead or read2.isTcRead
        direction_read = read1 if not is_inverse else read2
        if direction_read.direction == ReadDirection.Forward:
            positiveCount += 1
            chroms_fw[chrom][start:end] += 1
            if is_tc_read:
                positiveCountNew += 1
                chroms_fw_new[chrom][start:end] += 1
        else:
            negativeCount += 1
            chroms_bw[chrom][start:end] += 1
            if is_tc_read:
                negativeCountNew += 1
                chroms_bw_new[chrom][start:end] += 1

    print("Total reads: %s\n"
          "Positive reads: %s\n"
          "Positive reads new: %s\n"
          "Negative reads: %s\n"
          "Negative reads new: %s" %
          (readNumber, positiveCount, positiveCountNew, negativeCount, negativeCountNew),
          file=log)
    fileBedgraphPlus = open(outputBedgraphPlus, 'w')
    fileBedgraphPlusNew = open(outputBedgraphPlusNew, 'w')
    fileBedgraphMinus = open(outputBedgraphMinus, 'w')
    fileBedgraphMinusNew = open(outputBedgraphMinusNew, 'w')
    for chrom in chroms_fw.keys():
        to_bed_graph(chrom, chroms_fw[chrom], fileBedgraphPlus, readNumber)
        to_bed_graph(chrom, chroms_bw[chrom], fileBedgraphMinus, readNumber)
        to_bed_graph(chrom, chroms_fw_new[chrom], fileBedgraphPlusNew, readNumber)
        to_bed_graph(chrom, chroms_bw_new[chrom], fileBedgraphMinusNew, readNumber)

    fileBedgraphPlus.close()
    fileBedgraphPlusNew.close()
    fileBedgraphMinus.close()
    fileBedgraphMinusNew.close()
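
to_bed_graph normalizes each per-position coverage track to reads per million before writing. The arithmetic in isolation:

import numpy as np

data = np.array([0, 3, 3, 0], dtype="float32")
readNumber = 1500000
data /= readNumber
data *= 1000000.0
# -> array([0., 2., 2., 0.], dtype=float32): reads per million at each position
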
Example #29
def run():
    ########################################################################
    # Argument parsing
    ########################################################################

    # Info
    usage = "SLAMdunk software for analyzing SLAM-seq data"

    # Main Parsers
    parser = ArgumentParser(description=usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    # Initialize Subparsers
    subparsers = parser.add_subparsers(help="", dest="command")

    # map command

    mapparser = subparsers.add_parser(
        'map',
        help='Map SLAM-seq read data',
        formatter_class=ArgumentDefaultsHelpFormatter)
    mapparser.add_argument(
        'files',
        action='store',
        help=
        'Single csv/tsv file (recommended) containing all sample files and sample info or a list of all sample BAM/FASTA(gz)/FASTQ(gz) files',
        nargs="+")
    mapparser.add_argument("-r",
                           "--reference",
                           type=str,
                           required=True,
                           dest="referenceFile",
                           default=SUPPRESS,
                           help="Reference fasta file")
    mapparser.add_argument("-o",
                           "--outputDir",
                           type=str,
                           required=True,
                           dest="outputDir",
                           default=SUPPRESS,
                           help="Output directory for mapped BAM files.")
    mapparser.add_argument(
        "-5",
        "--trim-5p",
        type=int,
        required=False,
        dest="trim5",
        default=12,
        help="Number of bp removed from 5' end of all reads.")
    mapparser.add_argument("-n",
                           "--topn",
                           type=int,
                           required=False,
                           dest="topn",
                           default=1,
                           help="Max. number of alignments to report per read")
    mapparser.add_argument("-a",
                           "--max-polya",
                           type=int,
                           required=False,
                           dest="maxPolyA",
                           default=4,
                           help="Max number of As at the 3' end of a read.")
    mapparser.add_argument("-t",
                           "--threads",
                           type=int,
                           required=False,
                           dest="threads",
                           default=1,
                           help="Thread number")
    mapparser.add_argument(
        "-q",
        "--quantseq",
        dest="quantseq",
        action='store_true',
        required=False,
        help="Run plain Quantseq alignment without SLAM-seq scoring")
    mapparser.add_argument(
        '-e',
        "--endtoend",
        action='store_true',
        dest="endtoend",
        help="Use a end to end alignment algorithm for mapping.")
    mapparser.add_argument(
        '-sn',
        "--sampleName",
        type=str,
        dest="sampleName",
        required=False,
        help="Use this sample name for all supplied samples")
    mapparser.add_argument(
        '-sy',
        "--sampleType",
        type=str,
        dest="sampleType",
        required=False,
        default="pulse",
        help="Use this sample type for all supplied samples")
    mapparser.add_argument(
        '-st',
        "--sampleTime",
        type=int,
        dest="sampleTime",
        required=False,
        default=0,
        help="Use this sample time for all supplied samples")
    mapparser.add_argument(
        "-i",
        "--sample-index",
        type=int,
        required=False,
        default=-1,
        dest="sampleIndex",
        help=
        "Run analysis only for sample <i>. Use for distributing slamdunk analysis on a cluster (index is 1-based)."
    )
    mapparser.add_argument(
        '-ss',
        "--skip-sam",
        action='store_true',
        dest="skipSAM",
        help="Output BAM while mapping. Slower but, uses less hard disk.")

    # filter command

    filterparser = subparsers.add_parser('filter',
                                         help='Filter SLAM-seq aligned data')
    filterparser.add_argument('bam',
                              action='store',
                              help='Bam file(s)',
                              nargs="+")
    filterparser.add_argument("-o",
                              "--outputDir",
                              type=str,
                              required=True,
                              dest="outputDir",
                              help="Output directory for mapped BAM files.")
    filterparser.add_argument("-b",
                              "--bed",
                              type=str,
                              required=False,
                              dest="bed",
                              help="BED file, overrides MQ filter to 0")
    filterparser.add_argument(
        "-mq",
        "--min-mq",
        type=int,
        required=False,
        default=2,
        dest="mq",
        help="Minimum mapping quality (default: %(default)d)")
    filterparser.add_argument(
        "-mi",
        "--min-identity",
        type=float,
        required=False,
        default=0.95,
        dest="identity",
        help="Minimum alignment identity (default: %(default)s)")
    filterparser.add_argument(
        "-nm",
        "--max-nm",
        type=int,
        required=False,
        default=-1,
        dest="nm",
        help="Maximum NM for alignments (default: %(default)d)")
    filterparser.add_argument("-t",
                              "--threads",
                              type=int,
                              required=False,
                              dest="threads",
                              default=1,
                              help="Thread number (default: %(default)d)")

    # snp command

    snpparser = subparsers.add_parser(
        'snp',
        help='Call SNPs on SLAM-seq aligned data',
        formatter_class=ArgumentDefaultsHelpFormatter)
    snpparser.add_argument('bam',
                           action='store',
                           help='Bam file(s)',
                           nargs="+")
    snpparser.add_argument("-o",
                           "--outputDir",
                           type=str,
                           required=True,
                           dest="outputDir",
                           default=SUPPRESS,
                           help="Output directory for mapped BAM files.")
    snpparser.add_argument("-r",
                           "--reference",
                           required=True,
                           dest="fasta",
                           type=str,
                           default=SUPPRESS,
                           help="Reference fasta file")
    snpparser.add_argument("-c",
                           "--min-coverage",
                           required=False,
                           dest="cov",
                           type=int,
                           help="Minimimum coverage to call variant",
                           default=10)
    #snpparser.add_argument("-q", "--min-base-qual", type=int, default=13, required=False, dest="minQual", help="Min base quality for T -> C conversions (default: %(default)d)")
    snpparser.add_argument("-f",
                           "--var-fraction",
                           required=False,
                           dest="var",
                           type=float,
                           help="Minimimum variant fraction to call variant",
                           default=0.8)
    snpparser.add_argument("-t",
                           "--threads",
                           type=int,
                           required=False,
                           default=1,
                           dest="threads",
                           help="Thread number")

    # count command

    countparser = subparsers.add_parser(
        'count', help='Count T/C conversions in SLAM-seq aligned data')
    countparser.add_argument('bam',
                             action='store',
                             help='Bam file(s)',
                             nargs="+")
    countparser.add_argument("-o",
                             "--outputDir",
                             type=str,
                             required=True,
                             dest="outputDir",
                             default=SUPPRESS,
                             help="Output directory for mapped BAM files.")
    countparser.add_argument("-s",
                             "--snp-directory",
                             type=str,
                             required=False,
                             dest="snpDir",
                             default=SUPPRESS,
                             help="Directory containing SNP files.")
    countparser.add_argument("-v",
                             "--vcf",
                             type=str,
                             required=False,
                             dest="vcfFile",
                             default=SUPPRESS,
                             help="Externally provided custom variant file.")
    countparser.add_argument("-r",
                             "--reference",
                             type=str,
                             required=True,
                             dest="ref",
                             default=SUPPRESS,
                             help="Reference fasta file")
    countparser.add_argument("-b",
                             "--bed",
                             type=str,
                             required=True,
                             dest="bed",
                             default=SUPPRESS,
                             help="BED file")
    countparser.add_argument(
        "-c",
        "--conversion-threshold",
        type=int,
        dest="conversionThreshold",
        required=False,
        default=1,
        help=
        "Number of T>C conversions required to count read as T>C read (default: %(default)d)"
    )
    countparser.add_argument("-l",
                             "--max-read-length",
                             type=int,
                             required=False,
                             dest="maxLength",
                             help="Max read length in BAM file")
    countparser.add_argument(
        "-q",
        "--min-base-qual",
        type=int,
        default=27,
        required=False,
        dest="minQual",
        help="Min base quality for T -> C conversions (default: %(default)d)")
    countparser.add_argument("-t",
                             "--threads",
                             type=int,
                             required=False,
                             default=1,
                             dest="threads",
                             help="Thread number (default: %(default)d)")

    # all command

    allparser = subparsers.add_parser('all',
                                      help='Run entire SLAMdunk analysis')
    allparser.add_argument(
        'files',
        action='store',
        help=
        'Single csv/tsv file (recommended) containing all sample files and sample info or a list of all sample BAM/FASTA(gz)/FASTQ(gz) files',
        nargs="+")
    allparser.add_argument("-r",
                           "--reference",
                           type=str,
                           required=True,
                           dest="referenceFile",
                           help="Reference fasta file")
    allparser.add_argument("-b",
                           "--bed",
                           type=str,
                           required=True,
                           dest="bed",
                           help="BED file with 3'UTR coordinates")
    allparser.add_argument(
        "-fb",
        "--filterbed",
        type=str,
        required=False,
        dest="filterbed",
        help=
        "BED file with 3'UTR coordinates to filter multimappers (activates -m)"
    )
    allparser.add_argument(
        "-v",
        "--vcf",
        type=str,
        required=False,
        dest="vcfFile",
        default=SUPPRESS,
        help="Skip SNP step and provide custom variant file.")
    allparser.add_argument("-o",
                           "--outputDir",
                           type=str,
                           required=True,
                           dest="outputDir",
                           help="Output directory for slamdunk run.")
    allparser.add_argument(
        "-5",
        "--trim-5p",
        type=int,
        required=False,
        dest="trim5",
        default=12,
        help=
        "Number of bp removed from 5' end of all reads (default: %(default)s)")
    allparser.add_argument(
        "-a",
        "--max-polya",
        type=int,
        required=False,
        dest="maxPolyA",
        default=4,
        help="Max number of As at the 3' end of a read (default: %(default)s)")
    allparser.add_argument(
        "-n",
        "--topn",
        type=int,
        required=False,
        dest="topn",
        default=1,
        help=
        "Max. number of alignments to report per read (default: %(default)s)")
    allparser.add_argument("-t",
                           "--threads",
                           type=int,
                           required=False,
                           default=1,
                           dest="threads",
                           help="Thread number (default: %(default)s)")
    allparser.add_argument(
        "-q",
        "--quantseq",
        dest="quantseq",
        action='store_true',
        required=False,
        help="Run plain Quantseq alignment without SLAM-seq scoring")
    allparser.add_argument(
        '-e',
        "--endtoend",
        action='store_true',
        dest="endtoend",
        help="Use a end to end alignment algorithm for mapping.")
    allparser.add_argument(
        '-m',
        "--multimap",
        action='store_true',
        dest="multimap",
        help="Use reference to resolve multimappers (requires -n > 1).")
    allparser.add_argument(
        "-mq",
        "--min-mq",
        type=int,
        required=False,
        default=2,
        dest="mq",
        help="Minimum mapping quality (default: %(default)s)")
    allparser.add_argument(
        "-mi",
        "--min-identity",
        type=float,
        required=False,
        default=0.95,
        dest="identity",
        help="Minimum alignment identity (default: %(default)s)")
    allparser.add_argument(
        "-nm",
        "--max-nm",
        type=int,
        required=False,
        default=-1,
        dest="nm",
        help="Maximum NM for alignments (default: %(default)s)")
    allparser.add_argument(
        "-mc",
        "--min-coverage",
        required=False,
        dest="cov",
        type=int,
        help="Minimimum coverage to call variant (default: %(default)s)",
        default=10)
    allparser.add_argument(
        "-mv",
        "--var-fraction",
        required=False,
        dest="var",
        type=float,
        help=
        "Minimimum variant fraction to call variant (default: %(default)s)",
        default=0.8)
    allparser.add_argument(
        "-c",
        "--conversion-threshold",
        type=int,
        dest="conversionThreshold",
        required=False,
        default=1,
        help=
        "Number of T>C conversions required to count read as T>C read (default: %(default)d)"
    )
    allparser.add_argument("-rl",
                           "--max-read-length",
                           type=int,
                           required=False,
                           dest="maxLength",
                           help="Max read length in BAM file")
    allparser.add_argument(
        "-mbq",
        "--min-base-qual",
        type=int,
        default=27,
        required=False,
        dest="minQual",
        help="Min base quality for T -> C conversions (default: %(default)d)")
    allparser.add_argument(
        '-sn',
        "--sampleName",
        type=str,
        dest="sampleName",
        required=False,
        help="Use this sample name for all supplied samples")
    allparser.add_argument(
        '-sy',
        "--sampleType",
        type=str,
        dest="sampleType",
        required=False,
        default="pulse",
        help="Use this sample type for all supplied samples")
    allparser.add_argument(
        '-st',
        "--sampleTime",
        type=int,
        dest="sampleTime",
        required=False,
        default=0,
        help="Use this sample time for all supplied samples")
    allparser.add_argument(
        "-i",
        "--sample-index",
        type=int,
        required=False,
        default=-1,
        dest="sampleIndex",
        help=
        "Run analysis only for sample <i>. Use for distributing slamdunk analysis on a cluster (index is 1-based)."
    )
    allparser.add_argument(
        "-ss",
        "--skip-sam",
        action='store_true',
        dest="skipSAM",
        help="Output BAM while mapping. Slower but, uses less hard disk.")

    args = parser.parse_args()

    ########################################################################
    # Routine selection
    ########################################################################

    command = args.command

    if (command == "map"):
        mapper.checkNextGenMapVersion()

        outputDirectory = args.outputDir

        if args.sampleIndex > -1:
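            # One sample per cluster job: stagger start-up by a short random
            # delay (0-2 s), presumably to avoid races between concurrent jobs.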
            sec = random.randrange(0, 2000) / 1000
            message("Waiting " + str(sec) + " seconds")
            sleep(sec)

        createDir(outputDirectory)
        n = args.threads
        referenceFile = args.referenceFile

        samples, samplesInfos = getSamples(args.files,
                                           runOnly=args.sampleIndex)

        message("Running slamDunk map for " + str(len(samples)) + " files (" +
                str(n) + " threads)")
        for i in range(0, len(samples)):
            bam = samples[i]

            if not args.sampleName or len(samples) > 1:
                sampleName = replaceExtension(basename(bam), "", "")
            else:
                sampleName = args.sampleName

            sampleInfo = samplesInfos[i]
            if sampleInfo == "":
                sampleInfo = sampleName + ":" + args.sampleType + ":" + str(
                    args.sampleTime)
            tid = i
            if args.sampleIndex > -1:
                tid = args.sampleIndex
            runMap(tid, bam, referenceFile, n, args.trim5, args.maxPolyA,
                   args.quantseq, args.endtoend, args.topn, sampleInfo,
                   outputDirectory, args.skipSAM)

        dunkFinished()

        if not args.skipSAM:
            message("Running slamDunk sam2bam for " + str(len(samples)) +
                    " files (" + str(n) + " threads)")
            results = Parallel(n_jobs=1, verbose=verbose)(
                delayed(runSam2Bam)(tid, samples[tid], n, outputDirectory)
                for tid in range(0, len(samples)))
            dunkFinished()

    elif (command == "filter"):
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        message("Running slamDunk filter for " + str(len(args.bam)) +
                " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(
            delayed(runFilter)(tid, args.bam[tid], args.bed, args.mq,
                               args.identity, args.nm, outputDirectory)
            for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "snp"):
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        fasta = args.fasta
        minCov = args.cov
        minVarFreq = args.var
        #minQual = args.minQual
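        # Base quality is hard-coded for now; the corresponding CLI option is disabled above.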
        minQual = 15
        n = args.threads
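        # Halve the worker count so each SNP-calling job can use roughly two
        # threads (the reason is an assumption; it is not stated in this file).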
        if (n > 1):
            n = int(n / 2)
        message("Running slamDunk SNP for " + str(len(args.bam)) + " files (" +
                str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(
            delayed(runSnp)(tid, fasta, minCov, minVarFreq, minQual,
                            args.bam[tid], outputDirectory)
            for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "count"):
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        if "snpDir" in args:
            snpDirectory = args.snpDir
        else:
            snpDirectory = None
        if "vcfFile" in args:
            vcfFile = args.vcfFile
        else:
            vcfFile = None
        n = args.threads
        message("Running slamDunk tcount for " + str(len(args.bam)) +
                " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runCount)(
            tid, args.bam[tid], args.ref, args.bed, args.maxLength,
            args.minQual, args.conversionThreshold, outputDirectory,
            snpDirectory, vcfFile) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "all"):
        runAll(args)
        dunkFinished()

    else:
        parser.error("Too few arguments.")
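
For reference, the wiring above is the standard argparse subparsers pattern: one sub-parser per command, dispatch on dest="command". A minimal, self-contained sketch of that pattern (all names here are illustrative, not slamdunk's):

from argparse import ArgumentParser

parser = ArgumentParser(description="demo")
subparsers = parser.add_subparsers(help="", dest="command")

# One sub-parser per command, mirroring the structure above.
countparser = subparsers.add_parser('count', help='demo count command')
countparser.add_argument('bam', nargs='+', help='BAM file(s)')
countparser.add_argument('-t', '--threads', type=int, default=1, dest='threads')

# parse_args also accepts an explicit argv list, which is handy for testing.
args = parser.parse_args(['count', '-t', '4', 'a.bam', 'b.bam'])
assert args.command == 'count' and args.threads == 4
print(args.bam)  # ['a.bam', 'b.bam']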
Example #30
def runSummary(bam, outputFile, countDirectory):

    outputLog = replaceExtension(outputFile, ".log")
    stats.readSummary(bam, countDirectory, outputFile, getLogFile(outputLog))
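
The replaceExtension and getLogFile helpers used throughout these examples come from elsewhere in the package and are not shown. A plausible minimal sketch of replaceExtension, assuming it swaps the file extension and inserts an optional suffix before it (an assumption, not the project's actual implementation):

import os

def replaceExtension(path, newExtension, suffix=""):
    # Drop the current extension, append the suffix, then the new extension.
    root, _ = os.path.splitext(path)
    return root + suffix + newExtension

print(replaceExtension("sample.bam", ".csv", "_collapsed"))  # sample_collapsed.csv
print(replaceExtension("summary.txt", ".log"))               # summary.log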
Example #31
def run():

    ########################################################################
    # Argument parsing
    ########################################################################

    # Info
    usage = "AlleyOop utility tools and diagnostics for SLAMSeq data"

    # Main Parsers
    parser = ArgumentParser(description=usage, formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', action='version', version='%(prog)s ' + __version__)

    # Initialize Subparsers
    subparsers = parser.add_subparsers(help="", dest="command")

    # dedup command
    dedupparser = subparsers.add_parser('dedup', help='Deduplicate SLAM-seq aligned data', formatter_class=ArgumentDefaultsHelpFormatter)
    dedupparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for mapped BAM files.")
    dedupparser.add_argument("-tc", "--tcMutations", type=int, required=False, default = 0, dest="tcMutations", help="Only select reads with x number of T>C mutations.")
    dedupparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number")
    dedupparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")

    # collapse command
    collapseparser = subparsers.add_parser('collapse', help='Collapse UTRs', formatter_class=ArgumentDefaultsHelpFormatter)
    collapseparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for mapped BAM files.")
    collapseparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number")
    collapseparser.add_argument('tcount', action='store', help='Tcount file(s)' , nargs="+")

    # positional-rates command
    posratesparser = subparsers.add_parser('positional-tracks', help='Genome-wide positional tracks as bedgraph', formatter_class=ArgumentDefaultsHelpFormatter)
    posratesparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    posratesparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for bedGraph files.")
    posratesparser.add_argument("-s", "--snp-directory", type=str, required=False, dest="snpDir", default=SUPPRESS, help="Directory containing SNP files.")
    posratesparser.add_argument("-r", "--reference", type=str, required=True, dest="ref", default=SUPPRESS, help="Reference fasta file")
    posratesparser.add_argument("-c", "--conversion-threshold", type=int, dest="conversionThreshold", required=False, default=1,help="Number of T>C conversions required to count read as T>C read (default: %(default)d)")
    posratesparser.add_argument("-a", "--coverage-cutoff", type=int, dest="coverageCutoff", required=False, default=1,help="Minimum coverage required to report nucleotide-conversion rate (default: %(default)d). Anything less than 1 will be set to 1 to avoid division by zero.")
    posratesparser.add_argument("-q", "--min-base-qual", type=int, default=27, required=False, dest="minQual", help="Min base quality for T -> C conversions (default: %(default)d)")
    posratesparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number (default: %(default)d)")

    # TC read separator
    readseparatorparser = subparsers.add_parser('read-separator', help='Separate TC-reads from background reads genome-wide', formatter_class=ArgumentDefaultsHelpFormatter)
    readseparatorparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    readseparatorparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for bam files.")
    readseparatorparser.add_argument("-s", "--snp-directory", type=str, required=False, dest="snpDir", default=SUPPRESS, help="Directory containing SNP files.")
    readseparatorparser.add_argument("-r", "--reference", type=str, required=True, dest="ref", default=SUPPRESS, help="Reference fasta file")
    readseparatorparser.add_argument("-c", "--conversion-threshold", type=int, dest="conversionThreshold", required=False, default=1,help="Number of T>C conversions required to count read as T>C read (default: %(default)d)")
    readseparatorparser.add_argument("-q", "--min-base-qual", type=int, default=27, required=False, dest="minQual", help="Min base quality for T -> C conversions (default: %(default)d)")
    readseparatorparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number (default: %(default)d)")


    # stats command
    statsparser = subparsers.add_parser('rates', help='Calculate overall conversion rates on SLAM-seq datasets', formatter_class=ArgumentDefaultsHelpFormatter)
    statsparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    statsparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for mapped BAM files.")
    statsparser.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", default=SUPPRESS, help="Reference fasta file")
    statsparser.add_argument("-mq", "--min-basequality", type=int, required=False, default=27, dest="mq", help="Minimal base quality for SNPs")
    #statsparser.add_argument('-R', "--compute-rates", dest="overallRates", action='store_true', help="Compute overall conversion rates.")
    statsparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number")

    # context command
    tccontextparser = subparsers.add_parser('tccontext', help='Calculate T->C conversion context on SLAM-seq datasets', formatter_class=ArgumentDefaultsHelpFormatter)
    tccontextparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    #tccontextparser.add_argument("-b", "--bed", type=str, required=True, dest="bed", help="BED file")
    tccontextparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for mapped BAM files.")
    tccontextparser.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", default=SUPPRESS, help="Reference fasta file")
    tccontextparser.add_argument("-mq", "--min-basequality", type=int, required=False, default=0, dest="mq", help="Minimal base quality for SNPs")
    tccontextparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number")

    # stats rates utr command
    statsutrrateparser = subparsers.add_parser('utrrates', help='Calculate conversion rates per UTR on SLAM-seq datasets')
    statsutrrateparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    statsutrrateparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", help="Output directory for mapped BAM files.")
    statsutrrateparser.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", help="Reference fasta file")
    statsutrrateparser.add_argument("-mq", "--min-basequality", type=int, required=False, default=27, dest="mq", help="Minimal base quality for SNPs (default: %(default)s)")
    statsutrrateparser.add_argument("-m", "--multiTCStringency", dest="strictTCs", action='store_true', required=False, help="")
    statsutrrateparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number (default: %(default)s)")
    statsutrrateparser.add_argument("-b", "--bed", type=str, required=True, dest="bed", help="BED file")
    statsutrrateparser.add_argument("-l", "--max-read-length", type=int, required=False, dest="maxLength", help="Max read length in BAM file (default: %(default)s)")

    # SNPeval command
    snpevalparser = subparsers.add_parser('snpeval', help='Evaluate SNP calling')
    snpevalparser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    snpevalparser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", help="Output directory for mapped BAM files.")
    snpevalparser.add_argument("-s", "--snp-directory", type=str, required=True, dest="snpDir", help="Directory containing SNP files.")
    snpevalparser.add_argument("-r", "--reference", type=str, required=True, dest="ref", help="Reference fasta file")
    snpevalparser.add_argument("-b", "--bed", type=str, required=True, dest="bed", help="BED file")
    snpevalparser.add_argument("-c", "--min-coverage", required=False, dest="cov", type=int, help="Minimum coverage to call variant (default: %(default)s)", default=10)
    snpevalparser.add_argument("-f", "--var-fraction", required=False, dest="var", type=float, help="Minimum variant fraction to call variant (default: %(default)s)", default=0.8)
    snpevalparser.add_argument("-m", "--multiTCStringency", dest="strictTCs", action='store_true', required=False, help="")
    snpevalparser.add_argument("-l", "--max-read-length", type=int, required=False, dest="maxLength", help="Max read length in BAM file (default: %(default)s)")
    snpevalparser.add_argument("-q", "--min-base-qual", type=int, default=27, required=False, dest="minQual", help="Min base quality for T -> C conversions (default: %(default)s)")
    snpevalparser.add_argument("-t", "--threads", type=int, required=False, default=1, dest="threads", help="Thread number (default: %(default)s)")

    # stats summary command
    statsSumParser = subparsers.add_parser('summary', help='Display summary information and statistics on read numbers')
    statsSumParser.add_argument('bam', action='store', help='Filtered BAM files (produced by slamdunk filter or all)' , nargs="+")
    statsSumParser.add_argument("-o", "--output", type=str, required=True, dest="outputFile", help="Output file")
    statsSumParser.add_argument("-t", "--tcountDir", type=str, required=False, dest="countDirectory", help="Folder containing tcount files")

    # merge command
    statsMergeParser = subparsers.add_parser('merge', help='Merge T->C rates from multiple samples into one TSV file', formatter_class=ArgumentDefaultsHelpFormatter)
    statsMergeParser.add_argument('countFiles', action='store', help='tCount files' , nargs="+")
    statsMergeParser.add_argument("-o", "--output", type=str, required=True, dest="outputFile", default=SUPPRESS, help="Output file")
    statsMergeParser.add_argument('-c', "--column", dest="column", type=str, required=False, default="TcReadCount / ReadCount", help="Column or expression used to summarize files.")
    statsMergeParser.add_argument('-n', "--columnname", dest="columnName", type=int, required=False, default=2, help="Index of the metadata field to use as column name.")

    # stats read info command
    conversionRateParser = subparsers.add_parser('tcperreadpos', help='Calculate conversion rates per read position on SLAM-seq datasets')
    conversionRateParser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    conversionRateParser.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", help="Reference fasta file")
    conversionRateParser.add_argument("-s", "--snp-directory", type=str, required=False, dest="snpDir", help="Directory containing SNP files.")
    conversionRateParser.add_argument("-l", "--max-read-length", type=int, required=False, dest="maxLength", help="Max read length in BAM file")
    conversionRateParser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", help="Output directory for mapped BAM files.")#conversionRateParser.add_argument("-5", "--trim-5p", type=int, required=False, dest="trim5", help="Number of bp removed from 5' end of all reads.")
    conversionRateParser.add_argument("-mq", "--min-basequality", type=int, required=False, default=27, dest="mq", help="Minimal base quality for SNPs (default: %(default)s)")
    conversionRateParser.add_argument("-t", "--threads", type=int, required=False, dest="threads", default=1, help="Thread number (default: %(default)s)")

    # stats utr info command
    utrRateParser = subparsers.add_parser('tcperutrpos', help='Calculate conversion rates per UTR position on SLAM-seq datasets')
    utrRateParser.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    utrRateParser.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", help="Reference fasta file")
    utrRateParser.add_argument("-b", "--bed", type=str, required=True, dest="bed", help="BED file")
    utrRateParser.add_argument("-s", "--snp-directory", type=str, required=False, dest="snpDir", help="Directory containing SNP files.")
    utrRateParser.add_argument("-l", "--max-read-length", type=int, required=False, dest="maxLength", help="Max read length in BAM file")
    utrRateParser.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", help="Output directory for mapped BAM files.")#conversionRateParser.add_argument("-5", "--trim-5p", type=int, required=False, dest="trim5", help="Number of bp removed from 5' end of all reads.")
    utrRateParser.add_argument("-mq", "--min-basequality", type=int, required=False, default=27, dest="mq", help="Minimal base quality for SNPs (default: %(default)s)")
    utrRateParser.add_argument("-t", "--threads", type=int, required=False, dest="threads", default=1, help="Thread number (default: %(default)s)")

    # dump read info command
    dumpReadInfo = subparsers.add_parser('dump', help='Print all info available for slamdunk reads', formatter_class=ArgumentDefaultsHelpFormatter)
    dumpReadInfo.add_argument('bam', action='store', help='Bam file(s)' , nargs="+")
    dumpReadInfo.add_argument("-r", "--reference", type=str, required=True, dest="referenceFile", default=SUPPRESS, help="Reference fasta file")
    dumpReadInfo.add_argument("-s", "--snp-directory", type=str, required=True, dest="snpDir", default=SUPPRESS, help="Directory containing SNP files.")
    dumpReadInfo.add_argument("-o", "--outputDir", type=str, required=True, dest="outputDir", default=SUPPRESS, help="Output directory for mapped BAM files.")#conversionRateParser.add_argument("-5", "--trim-5p", type=int, required=False, dest="trim5", help="Number of bp removed from 5' end of all reads.")
    dumpReadInfo.add_argument("-mq", "--min-basequality", type=int, required=False, default=0, dest="mq", help="Minimal base quality for SNPs")
    dumpReadInfo.add_argument("-t", "--threads", type=int, required=False, dest="threads", default=1, help="Thread number")

    args = parser.parse_args()

    ########################################################################
    # Routine selection
    ########################################################################

    command = args.command

    if (command == "dedup") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        tcMutations = args.tcMutations
        message("Running alleyoop dedup for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runDedup)(tid, args.bam[tid], outputDirectory, tcMutations) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "collapse") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        message("Running alleyoop collapse for " + str(len(args.tcount)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runCollapse)(tid, args.tcount[tid], outputDirectory) for tid in range(0, len(args.tcount)))
        dunkFinished()

    elif (command == "positional-tracks") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        snpDirectory = args.snpDir
        n = args.threads
        message("Running alleyoop positional-tracks for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runPositionalRates)(tid, args.bam[tid], args.ref, args.minQual, args.conversionThreshold, args.coverageCutoff, outputDirectory, snpDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "read-separator") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        snpDirectory = args.snpDir
        n = args.threads
        message("Running alleyoop read-separator for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runReadSeparator)(tid, args.bam[tid], args.ref, args.minQual, args.conversionThreshold, outputDirectory, snpDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "half-lifes") :

        outputDirectory = args.outputDir
        createDir(outputDirectory)

        timepoints = args.timepoints

        message("Running alleyoop half-lifes for " + str(len(args.bam)) + " files")
        runHalfLifes(args.bam, timepoints, outputDirectory)
        dunkFinished()

    elif (command == "rates") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        referenceFile = args.referenceFile
        minMQ = args.mq
        message("Running alleyoop rates for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runStatsRates)(tid, args.bam[tid], referenceFile, minMQ, outputDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "snpeval") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        snpDirectory = args.snpDir
        n = args.threads
        message("Running alleyoop SNPeval for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runSNPeval)(tid, args.bam[tid], args.ref, args.bed, args.maxLength, args.minQual, args.cov, args.var, args.strictTCs, outputDirectory, snpDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "tccontext") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        referenceFile = args.referenceFile
        minMQ = args.mq
        message("Running alleyoop TC context for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runStatsTCContext)(tid, args.bam[tid], referenceFile, minMQ, outputDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "utrrates") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        referenceFile = args.referenceFile
        minMQ = args.mq

        message("Running alleyoop utrrates for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runStatsRatesUTR)(tid, args.bam[tid], referenceFile, minMQ, args.strictTCs, outputDirectory, args.bed, args.maxLength) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "summary") :
        message("Running alleyoop summary for " + str(len(args.bam)) + " files")
        runSummary(args.bam, args.outputFile, args.countDirectory)
        dunkFinished()

    elif (command == "merge") :
        message("Running alleyoop merge for " + str(len(args.countFiles)) + " files")
        outputLog = replaceExtension(args.outputFile, ".log")
        stats.mergeRates(",".join(args.countFiles), args.outputFile, args.column, args.columnName, getLogFile(outputLog))
        dunkFinished()

    elif (command == "tcperreadpos") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        snpDirectory = args.snpDir
        referenceFile = args.referenceFile
        minMQ = args.mq
        message("Running alleyoop tcperreadpos for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runTcPerReadPos)(tid, args.bam[tid], referenceFile, minMQ, args.maxLength, outputDirectory, snpDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    elif (command == "tcperutrpos") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        snpDirectory = args.snpDir
        referenceFile = args.referenceFile
        minMQ = args.mq
        message("Running alleyoop tcperutrpos for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runTcPerUtr)(tid, args.bam[tid], referenceFile, args.bed, minMQ, args.maxLength, outputDirectory, snpDirectory, None) for tid in range(0, len(args.bam)))  # None for the trailing vcfFile parameter (assumed; this subcommand has no --vcf option)
        dunkFinished()

    elif (command == "dump") :
        outputDirectory = args.outputDir
        createDir(outputDirectory)
        n = args.threads
        snpDirectory = args.snpDir
        referenceFile = args.referenceFile
        minMQ = args.mq
        message("Running alleyoop dump for " + str(len(args.bam)) + " files (" + str(n) + " threads)")
        results = Parallel(n_jobs=n, verbose=verbose)(delayed(runDumpReadInfo)(tid, args.bam[tid], referenceFile, minMQ, outputDirectory, snpDirectory) for tid in range(0, len(args.bam)))
        dunkFinished()

    else:
        parser.error("Too few arguments.")
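
Every routine branch above fans work out with joblib, one job per input file, and results come back in input order. A self-contained sketch of the Parallel/delayed pattern used here (process is a stand-in for the run* workers):

from joblib import Parallel, delayed

def process(tid, name):
    # Stand-in worker; the real run* functions write output files instead.
    return (tid, name.upper())

files = ['a.bam', 'b.bam', 'c.bam']
# n_jobs worker processes; delayed(...) captures each call for lazy dispatch.
results = Parallel(n_jobs=2, verbose=0)(
    delayed(process)(tid, files[tid]) for tid in range(0, len(files)))
print(results)  # [(0, 'A.BAM'), (1, 'B.BAM'), (2, 'C.BAM')]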