Example #1
0
def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False):
    """Convert a SAM file to a BAM file with samtools, optionally sorting and indexing.

    Args:
        inFile: input SAM file path.
        outFile: output BAM file path.
        log: open file handle receiving log messages.
        index: if True, index the resulting BAM via pysamIndex.
        sort: if True, coordinate-sort the BAM after conversion.
        delinFile: if True, delete the input file after conversion.
        onlyUnique: if True, keep only uniquely mapping reads (forces MQ >= 1).
        onlyProperPaired: if True, keep only properly paired reads (samtools -f 2).
        filterMQ: minimum mapping quality to keep (0 disables the filter).
        L: optional BED file; restrict output to reads overlapping it (samtools -L).
        threads: number of samtools threads (-@).
        verbose: if True, echo the commands to the log.
        dry: if True, print commands without executing or touching files.
    """
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        # Output exists and input is already gone: the step ran before, skip it.
        print("Skipping sam2bam for " + outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            # MQ >= 1 discards reads reported with mapping quality 0 (multimappers).
            filterMQ = 1

        # Assemble all options first and put the positional input file last,
        # matching the documented `samtools view [options] <in>` usage.
        cmd = [getBinary("samtools"), "view", "-@", str(threads), "-Sb"]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        cmd += ["-o", outFile, inFile]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort:
            # Sort "in place": move the unsorted BAM aside, sort back into outFile.
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            run(" ".join([getBinary("samtools"), "sort", "-@", str(threads), "-o", outFile, tmp]), log, verbose=verbose, dry=dry)
            removeFile(tmp)
        if delinFile:
            if not dry:
                removeFile(inFile)

    if index:
        pysamIndex(outFile)
Example #2
0
def runSam2bam(inFile,
               outFile,
               log,
               index=True,
               sort=None,
               delinFile=False,
               onlyUnique=False,
               onlyProperPaired=False,
               filterMQ=0,
               L=None,
               threads=1,
               verbose=False,
               dry=False):
    """Convert a SAM file to a BAM file with samtools, optionally sorting and indexing.

    Args:
        inFile: input SAM file path.
        outFile: output BAM file path.
        log: open file handle receiving log messages.
        index: if True, index the resulting BAM via pysamIndex.
        sort: None (no sorting), "index" (coordinate sort) or "name"
            (sort by read name, samtools -n). Any other value raises ValueError.
        delinFile: if True, delete the input file after conversion.
        onlyUnique: if True, keep only uniquely mapping reads (forces MQ >= 1).
        onlyProperPaired: if True, keep only properly paired reads (samtools -f 2).
        filterMQ: minimum mapping quality to keep (0 disables the filter).
        L: optional BED file; restrict output to reads overlapping it (samtools -L).
        threads: number of samtools threads (-@).
        verbose: if True, echo the commands to the log.
        dry: if True, print commands without executing or touching files.

    Raises:
        ValueError: if `sort` is a string other than "index" or "name".
    """
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        # Output exists and input is already gone: the step ran before, skip it.
        print("Skipping sam2bam for %s" % outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            # MQ >= 1 discards reads reported with mapping quality 0 (multimappers).
            filterMQ = 1

        # Assemble all options first and put the positional input file last,
        # matching the documented `samtools view [options] <in>` usage.
        cmd = ["samtools view", "-@", str(threads), "-Sb"]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        cmd += ["-o", outFile, inFile]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort is not None:
            sortMode = sort.lower()
            # Validate the mode BEFORE moving the output aside. Previously an
            # unrecognized value renamed the BAM to *_tmp, ran no sort command,
            # and then removed the tmp file - silently destroying the output.
            if sortMode not in ("index", "name"):
                raise ValueError(
                    "Unknown sort mode: %r (expected 'index' or 'name')" % sort)
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            sortCmd = ["samtools sort"]
            if sortMode == "name":
                sortCmd.append("-n")
            sortCmd += ["-@", str(threads), "-o", outFile, tmp]
            run(" ".join(sortCmd), log, verbose=verbose, dry=dry)
            removeFile(tmp)
        if delinFile:
            if not dry:
                removeFile(inFile)

    if index:
        pysamIndex(outFile)
Example #3
0
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual, outputBAMPrefix, conversionThreshold, log):
    """Split a BAM file into T>C-conversion reads and background reads.

    Writes two indexed BAM files next to `outputBAMPrefix`:
    `<prefix>_TCReads.bam` containing reads flagged as T>C reads and
    `<prefix>_backgroundReads.bam` containing all remaining reads.

    Args:
        referenceFile: FASTA reference file.
        snpsFile: SNP file; listed positions are excluded as conversions.
        bam: input BAM file.
        minBaseQual: minimum base quality for a mismatch to count.
        outputBAMPrefix: prefix for the two output BAM files.
        conversionThreshold: conversion threshold passed to readsInChromosome.
        log: log file handle (currently unused in this function).
    """
    ref = pysam.FastaFile(referenceFile)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # First pass: walk chromosome by chromosome and record the names of all
    # reads classified as T>C reads.
    testFile = SlamSeqBamFile(bam, referenceFile, snps)
    samFile = pysam.AlignmentFile(bam, "rb")

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    backgroundReadFile = pysam.AlignmentFile(backgroundReadFileName, "wb", template=samFile)
    tcReadFile = pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)

    tcReadNames = dict()

    for chromosome in testFile.getChromosomes():
        readIterator = testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold)
        for read in readIterator:
            if read.isTcRead:
                tcReadNames[read.name] = 0

    # Second pass: route every alignment by read name.
    for read in samFile.fetch():
        if read.query_name in tcReadNames:
            tcReadFile.write(read)
        else:
            backgroundReadFile.write(read)

    backgroundReadFile.close()
    tcReadFile.close()
    # Close the input handles as well; the original leaked both of them.
    samFile.close()
    ref.close()

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)
Example #4
0
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose=True, force=False):
    """Flag duplicate reads in a (position-sorted) BAM file.

    Reads are grouped by reference id, start position, query sequence and
    CIGAR string; every read after the first in a group is written with its
    duplicate flag set. Reads whose T>C conversion count (``TC`` tag,
    treated as 0 when absent) is below `tcMutations` are dropped entirely.
    All remaining reads are written to `outputBAM`, and the ratio of
    non-duplicate to processed reads is logged.

    Args:
        inputBAM: path to the input BAM file.
        outputBAM: path to the output BAM file (indexed afterwards).
        tcMutations: minimum ``TC`` tag value for a read to be kept.
        log: open file handle receiving statistics messages.
        printOnly: if True, run unconditionally (skip the up-to-date check).
        verbose: unused here; kept for a uniform pipeline-step interface.
        force: passed to checkStep to force re-running the step.
    """
    if printOnly or checkStep([inputBAM], [outputBAM], force):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""

        # duplicateBuffer[sequence][cigar] -> reads at the current position.
        duplicateBuffer = {}

        for read in samfile:

            cigar = read.cigarstring
            chromosome = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            tcflag = read.get_tag("TC") if read.has_tag("TC") else 0

            if tcflag >= tcMutations:

                # New position reached: flush everything buffered at the
                # previous position before collecting the next group.
                if chromosome != prevChr or start != prevStart:
                    if prevChr != "":
                        for curSeq in duplicateBuffer:
                            for curCigar in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curCigar]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                    outfile.write(readEntry)
                        duplicateBuffer.clear()

                if seq not in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if cigar not in duplicateBuffer[seq]:
                    duplicateBuffer[seq][cigar] = list()
                # Any read after the first with identical seq+cigar at this
                # position is a duplicate.
                if len(duplicateBuffer[seq][cigar]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][cigar].append(read)

                prevChr = chromosome
                prevStart = start

                processedReads += 1

        # Flush the buffer for the final position.
        for seq in duplicateBuffer:
            for cigar in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][cigar]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                    outfile.write(readEntry)
        duplicateBuffer.clear()

        samfile.close()
        outfile.close()

        # Guard the ratio: the original raised ZeroDivisionError when no read
        # passed the TC threshold.
        rate = float(retainedReads) / float(processedReads) if processedReads > 0 else 0.0
        print("Retained " + str(retainedReads) + " of " + str(processedReads) + " reads (", file=log, end="")
        print("{0:.2f}".format(rate), file=log, end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
Example #5
0
def Dedup(inputBAM,
          outputBAM,
          tcMutations,
          log,
          printOnly=False,
          verbose=True,
          force=False):
    """Flag duplicate reads in a (position-sorted) BAM file.

    Reads are grouped by reference id, start position, query sequence and
    CIGAR string; every read after the first in a group is written with its
    duplicate flag set.  Reads whose T>C conversion count (``TC`` tag,
    treated as 0 when absent) is below `tcMutations` are dropped entirely.
    All remaining reads are written to `outputBAM`, and the ratio of
    non-duplicate to processed reads is logged.

    Args:
        inputBAM: path to the input BAM file.
        outputBAM: path to the output BAM file (indexed afterwards).
        tcMutations: minimum ``TC`` tag value for a read to be kept.
        log: open file handle receiving statistics messages.
        printOnly: if True, run unconditionally (skip the up-to-date check).
        verbose: unused here; kept for a uniform pipeline-step interface.
        force: passed to checkStep to force re-running the step.
    """

    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""

        # duplicateBuffer[sequence][cigar] -> reads at the current position.
        duplicateBuffer = {}

        for read in samfile:

            # NOTE: despite the name, `flag` holds the CIGAR string.
            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if (read.has_tag("TC")):
                tcflag = read.get_tag("TC")
            else:
                tcflag = 0

            if (tcflag >= tcMutations):

                # New position reached: flush everything buffered at the
                # previous position before collecting the next group.
                if (chr != prevChr or start != prevStart):

                    if (prevChr != ""):
                        for curSeq in duplicateBuffer:
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][
                                        curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                    outfile.write(readEntry)
                        duplicateBuffer.clear()

                if not seq in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if not flag in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                # Any read after the first with identical seq+cigar at this
                # position is a duplicate.
                if len(duplicateBuffer[seq][flag]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        # Flush the buffer for the final position.
        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                    outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        # NOTE(review): raises ZeroDivisionError when processedReads == 0,
        # i.e. when no read passes the TC threshold - confirm inputs are
        # guaranteed non-empty or add a guard.
        print("Retained " + str(retainedReads) + " of " + str(processedReads) +
              " reads (",
              file=log,
              end="")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),
              file=log,
              end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)

    else:
        print("Skipped deduplication for " + inputBAM, file=log)
Example #6
0
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1, printOnly=False, verbose=True, force=False):
    """Filter a BAM file by mapping quality, alignment identity and edit distance.

    Without a BED file, each read is dropped when it is unmapped, has
    mapping quality below `MQ`, alignment identity (``XI`` tag) below
    `minIdentity`, or edit distance (``NM`` tag) above `NM` (if `NM` > -1).
    With a BED file, filtering is delegated to multimapUTRRetainment.
    Read statistics are stored in the first read group's DS field, a PG
    entry is appended, and the output is sorted and indexed.

    Args:
        inputBAM: path to the input BAM file.
        outputBAM: path to the filtered output BAM file.
        log: open file handle receiving statistics messages.
        bed: optional BED file enabling the multimap-retention strategy.
        MQ: minimum mapping quality (default-filtering mode only).
        minIdentity: minimum alignment identity (``XI`` tag).
        NM: maximum edit distance (``NM`` tag); -1 disables the check.
        printOnly: if True, run unconditionally (skip the up-to-date check).
        verbose: forwarded to bamSort.
        force: passed to checkStep to force re-running the step.
    """
    if printOnly or checkStep([inputBAM], [outputBAM], force):

        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0

        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:

            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)

            for read in infile:

                # Count each template once: ignore secondary/supplementary records.
                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if read.is_unmapped:
                    continue
                if read.mapping_quality < MQ:
                    mqFiltered += 1
                    continue
                if float(read.get_tag("XI")) < minIdentity:
                    idFiltered += 1
                    continue
                if NM > -1 and int(read.get_tag("NM")) > NM:
                    nmFiltered += 1
                    continue

                if not read.is_secondary and not read.is_supplementary:
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)

            print("#Bed-file supplied. Running multimap retention filtering strategy on " + inputBAM + ".", file=log)

            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log)

        # Convert the header to a plain dict up front so the PG entry below can
        # always be added. Previously the conversion only happened inside the
        # 'RG' branch, so a header without read groups was mutated as a pysam
        # header object instead of a dict.
        inFileBamHeader = outfile.header
        if not isinstance(inFileBamHeader, dict):
            inFileBamHeader = inFileBamHeader.to_dict()

        # Store sequencing/mapping statistics in the read group description;
        # used for creating the summary file.
        if 'RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed is not None:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

        slamDunkPG = {'ID': 'slamdunk', 'PN': 'slamdunk filter v' + __version__, 'VN': __bam_version__}
        if 'PG' in inFileBamHeader:
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards with the augmented header, then index.
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)

    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #7
0
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False):
    """Filter a BAM file by mapping quality, alignment identity and edit distance.

    Without a BED file, each read is dropped when it is unmapped, has
    mapping quality below `MQ`, alignment identity (``XI`` tag) below
    `minIdentity`, or edit distance (``NM`` tag) above `NM` (if `NM` > -1).
    With a BED file, filtering is delegated to multimapUTRRetainment.
    Read statistics are stored in the first read group's DS field, a PG
    entry is appended, and the output is sorted and indexed.

    Args:
        inputBAM: path to the input BAM file.
        outputBAM: path to the filtered output BAM file.
        log: open file handle receiving statistics messages.
        bed: optional BED file enabling the multimap-retention strategy.
        MQ: minimum mapping quality (default-filtering mode only).
        minIdentity: minimum alignment identity (``XI`` tag).
        NM: maximum edit distance (``NM`` tag); -1 disables the check.
        printOnly: if True, run unconditionally (skip the up-to-date check).
        verbose: forwarded to bamSort.
        force: passed to checkStep to force re-running the step.
    """
    if (printOnly or checkStep([inputBAM], [outputBAM], force)):

        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0

        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if (bed == None):

            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)

            for read in infile:

                # Count each template once: ignore secondary/supplementary records.
                if (not read.is_secondary and not read.is_supplementary):
                    if (read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if (read.is_unmapped):
                    continue
                if (read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if (float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if (NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue

                if (not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),
                  file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied

            # Seeded for reproducible tie-breaking downstream.
            random.seed(1)

            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)

            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(
                infile, outfile, bed, minIdentity, NM, log)
            #mappedReads, unmappedReads, filteredReads = multimapUTRRetainment (infile, outfile, bed, minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if ('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed != None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            # NOTE(review): the to_dict() conversion happens only inside this
            # 'RG' branch; when the header has no read groups, the 'PG'
            # mutation below operates on the original pysam header object -
            # verify that works with the pysam version in use.
            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)
            #inFileBamHeader['RG'][0]['DS'] = "{'sequenced':" + str(mappedReads + unmappedReads) + "," + "'mapped':" + str(mappedReads) + "," + "'filtered':" + str(filteredReads) + "}"

        # Record this filtering step in the BAM header's program chain.
        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if ('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
        #pysamFlagstat(outputBAM)
        #runFlagstat(outputBAM, log, verbose=verbose, dry=printOnly)

    else:
        print("Skipped filtering for " + inputBAM, file=log)
Example #8
0
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False,
           paired=False):
    """Filter a BAM file by mapping quality, alignment identity and edit distance.

    Without a BED file, single-end reads are dropped when unmapped, below
    `MQ` mapping quality, below `minIdentity` alignment identity (``XI``
    tag), or above `NM` edit distance (``NM`` tag, if `NM` > -1). In paired
    mode a read1/read2 pair is kept unless BOTH mates fail a criterion.
    With a BED file, filtering is delegated to multimapUTRRetainment.
    Read statistics are stored in the first read group's DS field, a PG
    entry is appended, and the output is sorted (and, for single-end data,
    indexed).

    Args:
        inputBAM: path to the input BAM file ("~" is expanded).
        outputBAM: path to the filtered output BAM file ("~" is expanded).
        log: open file handle receiving statistics messages.
        bed: optional BED file enabling the multimap-retention strategy.
        MQ: minimum mapping quality.
        minIdentity: minimum alignment identity (``XI`` tag).
        NM: maximum edit distance (``NM`` tag); -1 disables the check.
        printOnly: if True, run unconditionally (skip the up-to-date check).
        verbose: forwarded to bamSort.
        force: passed to checkStep to force re-running the step.
        paired: if True, apply pairwise filtering to read1/read2 mates.
    """
    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
         nmFiltered, multimapper) = 0, 0, 0, 0, 0, 0, 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)
        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)
            if paired:
                read1 = None
                read2 = None
            for read in infile:
                if paired:
                    if not read.is_paired or read.mate_is_unmapped or read.is_duplicate:
                        unmappedReads += 1
                        continue
                    # Pair up mates: remember read1, act once read2 arrives.
                    if read.is_read2:
                        read2 = read
                    else:
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    else:
                        mappedReads += 1

                if not paired:
                    if read.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read.get_tag("NM")):
                        nmFiltered += 1
                        continue

                    filteredReads += 1
                    outfile.write(read)
                else:
                    if read1 is None or read2 is None:
                        continue
                    if read1.query_name != read2.query_name:
                        continue

                    # A pair is rejected only when BOTH mates fail a criterion.
                    if read1.mapping_quality < MQ and read2.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read1.get_tag("XI")) < minIdentity and float(
                            read2.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read1.get_tag("NM")) and -1 < NM < int(
                            read2.get_tag("NM")):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read1)
                    outfile.write(read2)
                    # Reset the pair state so a second read2 record for the
                    # same template (e.g. a secondary alignment) cannot pair
                    # with the stale read1 and write the pair twice.
                    read1 = None
                    read2 = None

            print("Criterion\tFiltered reads", file=log)
            # Report the actual MQ threshold and count (previously this line
            # was hard-coded to "MQ < 0\t0", hiding the MQ statistics).
            print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered,
             multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                  minIdentity, NM, MQ, log)

        # Add number of sequenced and number of mapped reads to the read group description
        # Used for creating summary file
        inFileBamHeader = outfile.header
        if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

        # Record this filtering step in the BAM header's program chain.
        slamDunkPG = {
            "ID": "slamdunk",
            "PN": "slamdunk filter v" + __version__,
            "VN": __bam_version__
        }
        if "PG" in inFileBamHeader:
            inFileBamHeader["PG"].append(slamDunkPG)
        else:
            inFileBamHeader["PG"] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        # NOTE(review): paired=False is passed to bamSort even in paired mode
        # - confirm this is intentional.
        bamSort(outputBAM, log, inFileBamHeader, paired=False, verbose=verbose)
        if not paired:
            pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)