import os
import random

import pysam

# Note: getBinary(), run(), files_exist(), removeFile(), checkStep(), md5(),
# bamSort(), multimapUTRRetainment(), pysamIndex(), SNPtools, SlamSeqBamFile,
# SlamSeqInfo, __version__ and __bam_version__ are assumed to be provided by
# the surrounding slamdunk package; they are not defined in this listing.


def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False,
               onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None,
               threads=1, verbose=False, dry=False):
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        print("Skipping sam2bam for " + outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            filterMQ = 1

        success = True
        cmd = [getBinary("samtools"), "view", "-@", str(threads), "-Sb",
               "-o", outFile, inFile]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort:
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            run(" ".join([getBinary("samtools"), "sort", "-@", str(threads),
                          "-o", outFile, tmp]),
                log, verbose=verbose, dry=dry)
            if success:
                removeFile(tmp)

        if success and delinFile:
            if not dry:
                removeFile(inFile)

    if index:
        pysamIndex(outFile)
def runSam2bam(inFile, outFile, log, index=True, sort=None, delinFile=False,
               onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None,
               threads=1, verbose=False, dry=False):
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        print("Skipping sam2bam for %s" % outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            filterMQ = 1

        success = True
        cmd = ["samtools view", "-@", str(threads), "-Sb", "-o", outFile,
               inFile]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort is not None:
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            # "index" requests a coordinate sort, "name" a sort by read name.
            if sort.lower() == "index":
                run(" ".join(["samtools sort", "-@", str(threads), "-o",
                              outFile, tmp]),
                    log, verbose=verbose, dry=dry)
            elif sort.lower() == "name":
                run(" ".join(["samtools sort -n", "-@", str(threads), "-o",
                              outFile, tmp]),
                    log, verbose=verbose, dry=dry)
            if success:
                removeFile(tmp)

        if success and delinFile:
            if not dry:
                removeFile(inFile)

    if index:
        pysamIndex(outFile)
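# A minimal usage sketch for the variant above, assuming a writable log file
# and an existing "sample.sam" (both names are hypothetical; run(),
# removeFile() and pysamIndex() must be available from the surrounding module):
def _exampleRunSam2bam():
    with open("sample.sam2bam.log", "w") as log:
        # Convert, coordinate-sort ("index") and index using 4 threads.
        runSam2bam("sample.sam", "sample.bam", log,
                   index=True, sort="index", threads=4, verbose=True)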
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual,
                             outputBAMPrefix, conversionThreshold, log):
    ref = pysam.FastaFile(referenceFile)

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)
    samFile = pysam.AlignmentFile(bam, "rb")

    chromosomes = testFile.getChromosomes()

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    backgroundReadFile = pysam.AlignmentFile(backgroundReadFileName, "wb",
                                             template=samFile)
    tcReadFile = pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)

    tcReadDict = dict()

    for chromosome in chromosomes:
        readIterator = testFile.readsInChromosome(chromosome, minBaseQual,
                                                  conversionThreshold)
        for read in readIterator:
            if read.isTcRead:
                tcReadDict[read.name] = 0

    for read in samFile.fetch():
        if read.query_name in tcReadDict:
            tcReadFile.write(read)
        else:
            backgroundReadFile.write(read)

    backgroundReadFile.close()
    tcReadFile.close()

    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)
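# A minimal usage sketch (hypothetical paths; the base-quality cutoff of 27
# and conversion threshold of 1 are example values). The call writes
# "<prefix>_TCReads.bam" and "<prefix>_backgroundReads.bam" and indexes both:
def _exampleReadSeparation():
    with open("separation.log", "w") as log:
        genomewideReadSeparation("genome.fa", "snps.vcf",
                                 "sample_filtered.bam", 27,
                                 "sample_filtered", 1, log)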
def Dedup(inputBAM, outputBAM, tcMutations, log, printOnly=False, verbose=True,
          force=False):
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        samfile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=samfile)

        processedReads = 0
        retainedReads = 0

        prevChr = ""
        prevStart = ""

        duplicateBuffer = {}

        for read in samfile:
            flag = read.cigarstring
            chr = read.reference_id
            start = read.reference_start
            seq = read.query_sequence
            if read.has_tag("TC"):
                tcflag = read.get_tag("TC")
            else:
                tcflag = 0

            if tcflag >= tcMutations:
                # Flush the buffer whenever the position changes: reads with
                # identical sequence and CIGAR at the same position are marked
                # as duplicates and only the first one is retained.
                if chr != prevChr or start != prevStart:
                    if prevChr != "":
                        for curSeq in duplicateBuffer:
                            for curFlag in duplicateBuffer[curSeq]:
                                for readEntry in duplicateBuffer[curSeq][curFlag]:
                                    if not readEntry.is_duplicate:
                                        retainedReads += 1
                                        outfile.write(readEntry)
                        duplicateBuffer.clear()

                if seq not in duplicateBuffer:
                    duplicateBuffer[seq] = {}
                if flag not in duplicateBuffer[seq]:
                    duplicateBuffer[seq][flag] = list()
                if len(duplicateBuffer[seq][flag]) > 0:
                    read.is_duplicate = True
                duplicateBuffer[seq][flag].append(read)

                prevChr = chr
                prevStart = start

                processedReads += 1

        # Flush the reads buffered at the last position.
        for seq in duplicateBuffer:
            for flag in duplicateBuffer[seq]:
                for readEntry in duplicateBuffer[seq][flag]:
                    if not readEntry.is_duplicate:
                        retainedReads += 1
                        outfile.write(readEntry)
        duplicateBuffer.clear()

        outfile.close()

        print("Retained " + str(retainedReads) + " of " + str(processedReads) +
              " reads (", file=log, end="")
        print("{0:.2f}".format(float(retainedReads) / float(processedReads)),
              file=log, end="")
        print(" compression rate)", file=log)

        pysamIndex(outputBAM)
    else:
        print("Skipped deduplication for " + inputBAM, file=log)
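# A minimal usage sketch for Dedup() above (hypothetical file names;
# checkStep() and pysamIndex() are assumed module helpers). With
# tcMutations=0 every read passes the TC-tag gate, so duplicates are
# collapsed across the whole file:
def _exampleDedup():
    with open("dedup.log", "w") as log:
        Dedup("sample_filtered.bam", "sample_dedup.bam", 0, log)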
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False):
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0
        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".", file=log)
            for read in infile:
                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if read.is_unmapped:
                    continue
                if read.mapping_quality < MQ:
                    mqFiltered += 1
                    continue
                if float(read.get_tag("XI")) < minIdentity:
                    idFiltered += 1
                    continue
                if NM > -1 and int(read.get_tag("NM")) > NM:
                    nmFiltered += 1
                    continue

                if not read.is_secondary and not read.is_supplementary:
                    filteredReads += 1
                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)
            print("#Bed-file supplied. Running multimap retention filtering "
                  "strategy on " + inputBAM + ".", file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered, multimapper) = multimapUTRRetainment(
                 infile, outfile, bed, minIdentity, NM, log)

        # Add number of sequenced and number of mapped reads to the read group
        # description. Used for creating the summary file.
        inFileBamHeader = outfile.header
        if 'RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed is not None:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if 'PG' in inFileBamHeader:
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)

        pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
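# A minimal usage sketch for the single-end variant above, exercising the
# multimap retention path (hypothetical paths; "utrs.bed" stands in for a
# 3'UTR annotation, and multimapUTRRetainment() is an assumed module helper):
def _exampleFilterMultimap():
    with open("filter.log", "w") as log:
        Filter("sample_mapped.bam", "sample_filtered.bam", log, "utrs.bed")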
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False, paired=False):
    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)
    if printOnly or checkStep([inputBAM], [outputBAM], force):
        (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
         nmFiltered, multimapper) = 0, 0, 0, 0, 0, 0, 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if bed is None:
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".", file=log)
            if paired:
                read1 = None
                read2 = None
            for read in infile:
                if paired:
                    if (not read.is_paired or read.mate_is_unmapped
                            or read.is_duplicate):
                        unmappedReads += 1
                        continue
                    # Buffer mates; a pair is evaluated once read2 arrives.
                    if read.is_read2:
                        read2 = read
                    else:
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    else:
                        mappedReads += 1

                if not paired:
                    if read.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read.get_tag("NM")):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read)
                else:
                    if read1 is None or read2 is None:
                        continue
                    if read1.query_name != read2.query_name:
                        continue
                    # A pair is dropped only if both mates fail a criterion.
                    if read1.mapping_quality < MQ and read2.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if (float(read1.get_tag("XI")) < minIdentity
                            and float(read2.get_tag("XI")) < minIdentity):
                        idFiltered += 1
                        continue
                    if (-1 < NM < int(read1.get_tag("NM"))
                            and -1 < NM < int(read2.get_tag("NM"))):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read1)
                    outfile.write(read2)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print("#Bed-file supplied. Running multimap retention filtering "
                  "strategy on " + inputBAM + ".", file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
             nmFiltered, multimapper) = multimapUTRRetainment(
                 infile, outfile, bed, minIdentity, NM, MQ, log)

        # Add number of sequenced and number of mapped reads to the read group
        # description. Used for creating the summary file.
        inFileBamHeader = outfile.header
        if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if bed:
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

        slamDunkPG = {
            "ID": "slamdunk",
            "PN": "slamdunk filter v" + __version__,
            "VN": __bam_version__
        }
        if "PG" in inFileBamHeader:
            inFileBamHeader["PG"].append(slamDunkPG)
        else:
            inFileBamHeader["PG"] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, paired=False, verbose=verbose)

        if not paired:
            pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
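# A minimal usage sketch for the paired-end variant above (hypothetical
# paths). Pairs are kept unless both mates fail a criterion; note that this
# variant skips indexing when paired=True:
def _exampleFilterPaired():
    with open("filter.log", "w") as log:
        Filter("sample_mapped.bam", "sample_filtered.bam", log, bed=None,
               MQ=2, minIdentity=0.8, NM=-1, paired=True)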