def test_fastq_bam(self):
    """Run fastq_to_bam with header fields given on the command line and
    with the header taken from a text file.

    Only the command-line variant has its output inspected: the resulting
    BAM is dumped to SAM and must match one of the known-good SAM fixtures.
    """
    input_dir = util.file.get_test_input_path(self)

    # Fixture files that ship with the test
    fastq_r1 = os.path.join(input_dir, 'in1.fastq')
    fastq_r2 = os.path.join(input_dir, 'in2.fastq')
    header_txt = os.path.join(input_dir, 'inHeader.txt')
    acceptable_sams = [
        os.path.join(input_dir, sam_name)
        for sam_name in (
            'expected.java1_7.sam',
            'expected.java1_8.sam',
            'expected.java1_8_v1.5.sam',
        )
    ]
    expectedFastq1 = os.path.join(input_dir, 'expected.fastq1')

    # Scratch outputs (allocated in the same order as before)
    bam_from_cli = util.file.mkstempfname('.bam')
    bam_from_header = util.file.mkstempfname('.bam')
    sam_dump = util.file.mkstempfname('.sam')
    outFastq1 = util.file.mkstempfname('.fastq')
    outFastq2 = util.file.mkstempfname('.fastq')
    outHeader = util.file.mkstempfname('.txt')
    outHeaderFix = util.file.mkstempfname('.fix.txt')

    # in1.fastq, in2.fastq -> out.bam; header params from command-line
    cli_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    cli_args = cli_parser.parse_args([
        fastq_r1, fastq_r2, bam_from_cli,
        '--sampleName', 'FreeSample',
        '--JVMmemory', '1g',
        '--picardOptions',
        'LIBRARY_NAME=Alexandria',
        'PLATFORM=9.75',
        'SEQUENCING_CENTER=KareemAbdul-Jabbar',
    ])
    cli_args.func_main(cli_args)

    # Note for developers: if you're fixing the tests to handle non-bugs
    # (ie our testing here is too brittle), let's just replace a lot of this
    # in the future with code that just reads the header, sorts it, and
    # tests for equality of sorted values in the RG line (and stricter
    # equality in the non-header lines). This is kind of hacky.

    # samtools view for out.sam and compare to expected
    tools.samtools.SamtoolsTool().view(['-h'], bam_from_cli, sam_dump)

    # picard.sam.FastqToSam outputs header fields in different order for
    # java version 1.8 vs 1.7/1.6, so accept any of the known variants
    self.assertTrue(
        any(filecmp.cmp(sam_dump, candidate, shallow=False) for candidate in acceptable_sams)
    )

    # in1.fastq, in2.fastq, inHeader.txt -> out.bam; header from txt
    header_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    header_args = header_parser.parse_args(
        [fastq_r1, fastq_r2, bam_from_header, '--header', header_txt])
    header_args.func_main(header_args)
def main_deplete(args):
    ''' Run the entire depletion pipeline: bwa, bmtagger, blastn.

        The stages are chained through the file names carried on ``args``:
        ``args.inBam`` (reverted first if needed) is depleted against
        ``args.bwaDbs`` into ``args.bwaBam``; that file is depleted against
        ``args.bmtaggerDbs`` into ``args.bmtaggerBam``; and that file is
        depleted against ``args.blastDbs`` into ``args.blastnBam``.

        At least one database must be given across the three database lists
        (enforced with an assert).

        Returns 0.
    '''

    # At least one database is required across the three depletion tools.
    assert len(args.bmtaggerDbs) + len(args.blastDbs) + len(args.bwaDbs) > 0

    # only RevertSam if inBam is already aligned
    # Most of the time the input will be unaligned
    # so we can save time if we can skip RevertSam in the unaligned case
    #
    # via the SAM/BAM spec, if the file is aligned, an SQ line should be present
    # in the header. Using pysam, we can check this with len(header['SQ']) > 0
    # https://samtools.github.io/hts-specs/SAMv1.pdf

    # if the user has requested a revertBam
    with read_utils.revert_bam_if_aligned(
            args.inBam,
            revert_bam=args.revertBam,
            clear_tags=args.clear_tags,
            tags_to_clear=args.tags_to_clear,
            picardOptions=['MAX_DISCARD_FRACTION=0.5'],
            JVMmemory=args.JVMmemory,
            sanitize=not args.do_not_sanitize) as bamToDeplete:
        # Stage 1: bwa depletion; the context-managed BAM is only used here.
        multi_db_deplete_bam(bamToDeplete, args.bwaDbs, deplete_bwa_bam, args.bwaBam, threads=args.threads)

    # Adapter so multi_db_deplete_bam can call bmtagger with the extra
    # srprism_memory setting taken from args.
    def bmtagger_wrapper(inBam, db, outBam, JVMmemory=None):
        return deplete_bmtagger_bam(inBam, db, outBam, srprism_memory=args.srprism_memory, JVMmemory=JVMmemory)

    # Stage 2: bmtagger depletion of the bwa output.
    multi_db_deplete_bam(args.bwaBam, args.bmtaggerDbs, bmtagger_wrapper, args.bmtaggerBam, JVMmemory=args.JVMmemory)

    # If the revertBam file is zero bytes long, overwrite it with a BAM built
    # from just the header of inBam.
    # NOTE(review): this assumes args.revertBam is always a path to an
    # existing file at this point -- confirm against the argument parser.
    if os.path.getsize(args.revertBam) == 0:
        with util.file.tempfname('.empty.sam') as empty_sam:
            samtools = tools.samtools.SamtoolsTool()
            samtools.dumpHeader(args.inBam, empty_sam)
            samtools.view(['-b'], empty_sam, args.revertBam)

    # Stage 3: blastn depletion of the bmtagger output.
    multi_db_deplete_bam(
        args.bmtaggerBam,
        args.blastDbs,
        deplete_blastn_bam,
        args.blastnBam,
        chunkSize=args.chunkSize,
        threads=args.threads,
        JVMmemory=args.JVMmemory)
    return 0
def test_fastq_bam(self):
    """Round-trip test for fastq_to_bam and bam_to_fastq.

    First converts paired FASTQ files to BAM with header fields given on the
    command line and compares the SAM dump with the known-good fixtures.
    Then converts again with the header taken from a text file, converts
    that BAM back to FASTQ (trimming one base from read 1) and compares the
    results with the inputs / expected files.
    """
    input_dir = util.file.get_test_input_path(self)

    # Fixture files that ship with the test
    fastq_r1 = os.path.join(input_dir, 'in1.fastq')
    fastq_r2 = os.path.join(input_dir, 'in2.fastq')
    header_txt = os.path.join(input_dir, 'inHeader.txt')
    acceptable_sams = [
        os.path.join(input_dir, sam_name)
        for sam_name in ('expected.java1_7.sam', 'expected.java1_8.sam')
    ]
    trimmed_r1_expected = os.path.join(input_dir, 'expected.fastq1')

    # Scratch outputs (allocated in the same order as before)
    bam_from_cli = util.file.mkstempfname('.bam')
    bam_from_header = util.file.mkstempfname('.bam')
    sam_dump = util.file.mkstempfname('.sam')
    fastq_r1_out = util.file.mkstempfname('.fastq')
    fastq_r2_out = util.file.mkstempfname('.fastq')
    header_out = util.file.mkstempfname('.txt')

    # in1.fastq, in2.fastq -> out.bam; header params from command-line
    to_bam_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    to_bam_args = to_bam_parser.parse_args([
        fastq_r1, fastq_r2, bam_from_cli,
        '--sampleName', 'FreeSample',
        '--JVMmemory', '1g',
        '--picardOptions',
        'LIBRARY_NAME=Alexandria',
        'PLATFORM=9.75',
        'SEQUENCING_CENTER=KareemAbdul-Jabbar',
    ])
    to_bam_args.func_main(to_bam_args)

    # samtools view for out.sam and compare to expected
    tools.samtools.SamtoolsTool().view(['-h'], bam_from_cli, sam_dump)

    # picard.sam.FastqToSam outputs header fields in different order for
    # java version 1.8 vs 1.7/1.6, so accept either variant
    self.assertTrue(
        any(filecmp.cmp(sam_dump, candidate, shallow=False) for candidate in acceptable_sams)
    )

    # in1.fastq, in2.fastq, inHeader.txt -> out.bam; header from txt
    to_bam_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    to_bam_args = to_bam_parser.parse_args(
        [fastq_r1, fastq_r2, bam_from_header, '--header', header_txt])
    to_bam_args.func_main(to_bam_args)

    # out.bam -> out1.fastq, out2.fastq, outHeader.txt; trim 1 base from 1
    to_fastq_parser = read_utils.parser_bam_to_fastq(argparse.ArgumentParser())
    to_fastq_args = to_fastq_parser.parse_args([
        bam_from_header, fastq_r1_out, fastq_r2_out,
        '--outHeader', header_out,
        '--JVMmemory', '1g',
        '--picardOptions', 'READ1_TRIM=1',
    ])
    to_fastq_args.func_main(to_fastq_args)

    # compare out1.fastq, out2.fastq, outHeader.txt to in and expected
    self.assertEqualContents(fastq_r1_out, trimmed_r1_expected)  # 1 base trimmed
    self.assertEqualContents(fastq_r2_out, fastq_r2)
    self.assertEqualContents(header_out, header_txt)
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    The reads of inBam are dumped to a temporary SAM file and dealt out, in
    order, to each path in outBams. Every output gets the full input header.
    The last output additionally receives any reads left over after the
    even split.

    inBam:   path to the input BAM; its first header line must contain
             'SO:queryname'.
    outBams: list of output BAM paths, one per chunk.

    Raises Exception if the header is empty or not marked as queryname
    sorted. Temporary SAM files are removed even if a step fails.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    #   maxReads = totalReadCount / num files, but round up to the nearest
    #   even number in order to keep read pairs together (assuming the input
    #   is sorted in query order and has no unmated reads, which can be
    #   accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if not header or 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        last_index = len(outBams) - 1
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for index, outBam in enumerate(outBams):
                log.info("preparing file " + outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # Identify the final chunk by position, not by name,
                        # so a repeated output path cannot swallow the tail early.
                        if index == last_index:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter",
                        ['INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'],
                        JVMmemory='512m')
                finally:
                    if os.path.isfile(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        if os.path.isfile(bigsam):
            os.unlink(bigsam)
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files.

    Reads from inBam are written, in their existing order, to each path in
    outBams; each output also carries the complete input header. Any reads
    remaining after the even split go to the last output.

    inBam:   path to the input BAM; its first header line must contain
             'SO:queryname'.
    outBams: list of output BAM paths, one per chunk.

    Raises Exception if the header is empty or does not declare queryname
    sort order. The intermediate SAM files are deleted whether or not the
    split succeeds.
    '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    #   maxReads = totalReadCount / num files, but round up to the nearest
    #   even number in order to keep read pairs together (assuming the input
    #   is sorted in query order and has no unmated reads, which can be
    #   accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if not header or 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    try:
        samtools.view([], inBam, bigsam)

        # split bigsam into little ones
        final_position = len(outBams) - 1
        with util.file.open_or_gzopen(bigsam, 'rt') as inf:
            for position, outBam in enumerate(outBams):
                log.info("preparing file " + outBam)
                tmp_sam_reads = mkstempfname('.sam')
                try:
                    with open(tmp_sam_reads, 'wt') as outf:
                        for row in header:
                            outf.write('\t'.join(row) + '\n')
                        for _ in range(maxReads):
                            line = inf.readline()
                            if not line:
                                break
                            outf.write(line)
                        # The last chunk (by position, so duplicate output
                        # names cannot trigger this early) takes the remainder.
                        if position == final_position:
                            for line in inf:
                                outf.write(line)
                    picard.execute(
                        "SamFormatConverter", [
                            'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
                        ],
                        JVMmemory='512m'
                    )
                finally:
                    if os.path.isfile(tmp_sam_reads):
                        os.unlink(tmp_sam_reads)
    finally:
        if os.path.isfile(bigsam):
            os.unlink(bigsam)
def test_deplete_blastn_bam_chunked(self):
    """Run deplete_blastn_bam with --chunkSize 1 against self.blastdbs_multi
    and compare the reads of the resulting BAM with expected.sam."""
    scratch_dir = tempfile.mkdtemp()
    input_dir = util.file.get_test_input_path(self)

    # Run deplete_blastn_bam
    source_bam = os.path.join(input_dir, 'in.bam')
    depleted_bam = os.path.join(scratch_dir, 'out.bam')
    parser = taxon_filter.parser_deplete_blastn_bam(argparse.ArgumentParser())
    cli = [source_bam] + self.blastdbs_multi + [depleted_bam, "--chunkSize", "1"]
    args = parser.parse_args(cli)
    args.func_main(args)

    # samtools view for out.sam and compare to expected
    depleted_sam = os.path.join(scratch_dir, 'out.sam')
    tools.samtools.SamtoolsTool().view(['-h'], depleted_bam, depleted_sam)

    expected_sam = os.path.join(input_dir, 'expected.sam')
    assert_equal_bam_reads(self, depleted_sam, expected_sam)
def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None,
                     min_score_to_filter=None, threads=None, JVMmemory=None,
                     invert_filter=False, should_index=True):
    """ Performs an alignment of one read group in a bam file
        to a reference fasta file

        TODO: With the addition of a third aligner to viral-ngs, the functionality
        common to this method and to the comparable method in the Novoalign wrapper should
        be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
        aligner, while preserving read groups.

        inBam:  input BAM. If it holds several read groups, rgid selects
                which one is aligned.
        refDb, outBam, min_score_to_filter, threads, invert_filter,
        should_index: passed straight through to self.mem.
        rgid:   read group ID; optional when inBam has exactly one read group.
        options: extra aligner options; '-R <read group line>' is appended.
        JVMmemory: accepted for interface compatibility; not used here.

        Returns (rgid, outBam), or None when the selected read group of a
        multi-read-group input has no reads (outBam is not written then).

        Raises InvalidBamHeaderError when inBam has no read groups, has
        several but no rgid was given, or does not contain rgid.
    """
    options = options or []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if len(rgs) == 0:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = list(rgs.keys())[0]
    elif not rgid:
        raise InvalidBamHeaderError(
            "{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError(
            "{} has read groups, but not {}".format(inBam, rgid))

    headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))

    # Strip inBam to just one RG (if necessary)
    removeInput = False
    one_rg_inBam = inBam
    try:
        if len(rgs) == 1:
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        assert len(readgroup_line) > 0

        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        # (tabs are passed in escaped form, as "\t")
        self.mem(
            one_rg_inBam,
            refDb,
            outBam,
            options=options + ['-R', readgroup_line.rstrip("\r\n").replace('\t', '\\t')],
            min_score_to_filter=min_score_to_filter,
            threads=threads,
            invert_filter=invert_filter,
            should_index=should_index)

        return (rgid, outBam)
    finally:
        # This cleanup used to sit after the return statement and never ran.
        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput and os.path.isfile(one_rg_inBam):
            os.unlink(one_rg_inBam)
        # the header dump is only needed to recover the @RG line
        if os.path.isfile(headerFile):
            os.unlink(headerFile)
def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None, min_score_to_filter=None, threads=None, JVMmemory=None):
    """ Performs an alignment of one read group in a bam file
        to a reference fasta file

        TODO: With the addition of a third aligner to viral-ngs, the functionality
        common to this method and to the comparable method in the Novoalign wrapper should
        be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
        aligner, while preserving read groups.

        inBam:  input BAM. If it holds several read groups, rgid selects
                which one is aligned.
        refDb, min_score_to_filter, threads: passed through to self.mem.
        outBam: coordinate-sorted output written by Picard SortSam (with
                CREATE_INDEX=true), or an empty placeholder file when the
                alignment contains no reads.
        rgid:   read group ID; optional when inBam has exactly one read group.
        options: extra aligner options; '-R <read group line>' is appended.
        JVMmemory: passed to the Picard sort step.

        Returns None. When the selected read group of a multi-read-group
        input has no reads, returns early without writing outBam.

        Raises InvalidBamHeaderError when inBam has no read groups, has
        several but no rgid was given, or does not contain rgid.
    """
    options = options or []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if len(rgs) == 0:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = list(rgs.keys())[0]
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))

    # Strip inBam to just one RG (if necessary)
    removeInput = False
    one_rg_inBam = inBam
    tmp_bam_aligned = None
    try:
        if len(rgs) == 1:
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            try:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            finally:
                # also covers the early return above, which used to leak tmp_bam
                if os.path.isfile(tmp_bam):
                    os.unlink(tmp_bam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        assert len(readgroup_line) > 0

        tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out.
        # Tabs are passed in escaped form ("\t"), which bwa converts back to
        # TABs in its output; this matches the other aligner wrappers.
        self.mem(
            one_rg_inBam,
            refDb,
            tmp_bam_aligned,
            options=options + ['-R', readgroup_line.rstrip("\r\n").replace('\t', '\\t')],
            min_score_to_filter=min_score_to_filter,
            threads=threads)

        # if the aligned bam file contains no reads after filtering
        # just create an empty file
        if samtools.count(tmp_bam_aligned) == 0:
            util.file.touch(outBam)
        else:
            # samtools reheader seems to segfault on some alignments created by bwa
            # so rather than reheader, BWA will write out the RG given to it via '-R'

            # sort
            sorter = tools.picard.SortSamTool()
            sorter.execute(
                tmp_bam_aligned,
                outBam,
                sort_order='coordinate',
                picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
                JVMmemory=JVMmemory
            )
    finally:
        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput and os.path.isfile(one_rg_inBam):
            os.unlink(one_rg_inBam)
        # intermediates that are no longer needed once outBam exists
        for scratch_path in (tmp_bam_aligned, headerFile):
            if scratch_path and os.path.isfile(scratch_path):
                os.unlink(scratch_path)
def test_fastq_bam(self):
    """Round-trip test for fastq_to_bam and bam_to_fastq.

    Converts paired FASTQ files to BAM with header fields from the command
    line and checks the SAM dump against the known-good fixtures; converts
    again with the header read from a text file; then converts that BAM
    back to FASTQ (one base trimmed from read 1) and compares the outputs
    with the inputs / expected files.
    """
    input_dir = util.file.get_test_input_path(self)

    # Fixture files that ship with the test
    fastq_r1 = os.path.join(input_dir, 'in1.fastq')
    fastq_r2 = os.path.join(input_dir, 'in2.fastq')
    header_txt = os.path.join(input_dir, 'inHeader.txt')
    acceptable_sams = [
        os.path.join(input_dir, sam_name)
        for sam_name in (
            'expected.java1_7.sam',
            'expected.java1_8.sam',
            'expected.java1_8_v1.5.sam',
        )
    ]
    trimmed_r1_expected = os.path.join(input_dir, 'expected.fastq1')

    # Scratch outputs (allocated in the same order as before)
    bam_from_cli = util.file.mkstempfname('.bam')
    bam_from_header = util.file.mkstempfname('.bam')
    sam_dump = util.file.mkstempfname('.sam')
    fastq_r1_out = util.file.mkstempfname('.fastq')
    fastq_r2_out = util.file.mkstempfname('.fastq')
    header_out = util.file.mkstempfname('.txt')

    # in1.fastq, in2.fastq -> out.bam; header params from command-line
    to_bam_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    to_bam_args = to_bam_parser.parse_args([
        fastq_r1, fastq_r2, bam_from_cli,
        '--sampleName', 'FreeSample',
        '--JVMmemory', '1g',
        '--picardOptions',
        'LIBRARY_NAME=Alexandria',
        'PLATFORM=9.75',
        'SEQUENCING_CENTER=KareemAbdul-Jabbar',
    ])
    to_bam_args.func_main(to_bam_args)

    # samtools view for out.sam and compare to expected
    tools.samtools.SamtoolsTool().view(['-h'], bam_from_cli, sam_dump)

    # picard.sam.FastqToSam outputs header fields in different order for
    # java version 1.8 vs 1.7/1.6, so accept any of the known variants
    self.assertTrue(
        any(filecmp.cmp(sam_dump, candidate, shallow=False) for candidate in acceptable_sams)
    )

    # in1.fastq, in2.fastq, inHeader.txt -> out.bam; header from txt
    to_bam_parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser())
    to_bam_args = to_bam_parser.parse_args(
        [fastq_r1, fastq_r2, bam_from_header, '--header', header_txt])
    to_bam_args.func_main(to_bam_args)

    # out.bam -> out1.fastq, out2.fastq, outHeader.txt; trim 1 base from 1
    to_fastq_parser = read_utils.parser_bam_to_fastq(argparse.ArgumentParser())
    to_fastq_args = to_fastq_parser.parse_args([
        bam_from_header, fastq_r1_out, fastq_r2_out,
        '--outHeader', header_out,
        '--JVMmemory', '1g',
        '--picardOptions', 'READ1_TRIM=1',
    ])
    to_fastq_args.func_main(to_fastq_args)

    # compare out1.fastq, out2.fastq, outHeader.txt to in and expected
    self.assertEqualContents(fastq_r1_out, trimmed_r1_expected)  # 1 base trimmed
    self.assertEqualContents(fastq_r2_out, fastq_r2)
    self.assertEqualContents(header_out, header_txt)
def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None, min_score_to_filter=None, threads=None, JVMmemory=None, invert_filter=False, should_index=True):
    """ Performs an alignment of one read group in a bam file
        to a reference fasta file

        TODO: With the addition of a third aligner to viral-ngs, the functionality
        common to this method and to the comparable method in the Novoalign wrapper should
        be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
        aligner, while preserving read groups.

        inBam:  input BAM; when it contains more than one read group, rgid
                must name the one to align.
        refDb, outBam, min_score_to_filter, threads, invert_filter,
        should_index: handed to self.mem unchanged.
        rgid:   read group ID; may be omitted when inBam has a single read group.
        options: additional aligner options; '-R <read group line>' is appended.
        JVMmemory: kept for interface compatibility; unused in this method.

        Returns (rgid, outBam). Returns None (and writes no outBam) when the
        requested read group of a multi-read-group input contains no reads.

        Raises InvalidBamHeaderError if inBam has no read groups, has several
        without an rgid being given, or lacks the requested rgid.
    """
    options = options or []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if len(rgs) == 0:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = list(rgs.keys())[0]
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))

    # Strip inBam to just one RG (if necessary)
    removeInput = False
    one_rg_inBam = inBam
    try:
        if len(rgs) == 1:
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line
        assert len(readgroup_line) > 0

        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        # (with tabs escaped as "\t")
        rg_option = readgroup_line.rstrip("\r\n").replace('\t', '\\t')
        self.mem(one_rg_inBam, refDb, outBam, options=options + ['-R', rg_option],
                 min_score_to_filter=min_score_to_filter, threads=threads,
                 invert_filter=invert_filter, should_index=should_index)

        return (rgid, outBam)
    finally:
        # Previously this cleanup followed the return statement and was
        # therefore unreachable, leaking the single-RG temp BAM.
        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput and os.path.isfile(one_rg_inBam):
            os.unlink(one_rg_inBam)
        # the header dump only served to recover the @RG line
        if os.path.isfile(headerFile):
            os.unlink(headerFile)
def align_one_rg(self, inBam, refDb, outBam, rgid=None, preset=None, options=None, threads=None, JVMmemory=None):
    """ Performs an alignment of one read group in a bam file
        to a reference fasta file using minimap2. Emits alignments in
        sorted, index bam files. inBam may contain more read groups, but we
        will subset input to the specified rgid.

        preset may be specified as a valid value for "minimap2 -x" which depends
        on the type of data (short accurate reads vs long noisy reads).
        If preset is set to None, we will autodetect based on the PL (platform) tag
        in the read group header (e.g. illumina, ont, pacbio)

        options: optional list of extra aligner options. It is copied, never
                 mutated; '-R <read group line>' and, unless the caller
                 already supplied '-x', '-x <preset>' are appended.
        refDb, threads: passed through to self.align_cmd.
        JVMmemory: accepted for interface compatibility; not used here.

        Returns None. When the selected read group of a multi-read-group
        input has no reads, the (empty) subset BAM is copied to outBam.

        Raises InvalidBamHeaderError for missing / ambiguous / unknown read
        groups, and Exception when no @RG header line can be found or the
        preset cannot be derived from the PL tag.
    """
    # options defaults to None, so guard before copying
    options = list(options) if options else []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if len(rgs) == 0:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = list(rgs.keys())[0]
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))

    # Strip inBam to just one RG (if necessary)
    removeInput = False
    one_rg_inBam = inBam
    try:
        if len(rgs) == 1:
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-1', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.isEmpty(tmp_bam):
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    shutil.copyfile(tmp_bam, outBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # get the read group line to give to mm2
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line.rstrip("\r\n")
        if not readgroup_line:
            raise Exception("no @RG line found in the header of {}".format(inBam))
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to minimap2 to write out
        options.extend(('-R', readgroup_line.replace('\t', '\\t')))

        # dynamically determine the mode of operation
        if '-x' not in options:
            if preset is None:
                platform = list(x for x in readgroup_line.split('\t') if x.startswith('PL:'))
                if len(platform) != 1:
                    raise Exception("cannot autodetect minimap2 aligner mode when PL: tag is not set in the read group header for {}: {}".format(inBam, readgroup_line))
                platform = platform[0][3:].lower()
                preset_by_platform = {
                    'illumina': 'sr',
                    'ont': 'map-ont',
                    'pacbio': 'map-pb',
                }
                if platform not in preset_by_platform:
                    raise Exception("PL: tag {} for read group {} in bam {} refers to a data type we do not know how to map with minimap2".format(platform, rgid, inBam))
                preset = preset_by_platform[platform]
            options.extend(('-x', preset))

        # perform actual alignment
        if samtools.isEmpty(one_rg_inBam):
            # minimap doesn't like empty inputs, so copy empty bam through
            samtools.sort(one_rg_inBam, outBam)
        else:
            self.align_cmd(one_rg_inBam, refDb, outBam, options=options, threads=threads)
    finally:
        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput and os.path.isfile(one_rg_inBam):
            os.unlink(one_rg_inBam)
        # the header dump only served to recover the @RG line
        if os.path.isfile(headerFile):
            os.unlink(headerFile)
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    ''' Take reads, align to reference with BWA-MEM, and generate a coverage plot

        The plot_* / *_threshold / max_coverage_depth / out_summary
        arguments are forwarded unchanged to plot_coverage.

        in_bam:     reads to align.
        ref_fasta:  reference; it is copied to a temp file before indexing,
                    and the copy and its index files are removed at the end.
        out_bam:    if given, the sorted, indexed alignment is kept there;
                    otherwise a temp file is used and removed at the end.
        sensitive:  add "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa mem options.
        excludeDuplicates: run Picard MarkDuplicates on the filtered
                    alignment before sorting, and plot non-duplicates only.
        JVMmemory, picardOptions: passed to MarkDuplicates.
        min_score_to_output: threshold given to bwa as '-T' and to
                    'samtools view -q'; defaults to 30.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    # NOTE: these used to be written as "bwa_opts + [...]", which builds a new
    # list and throws it away, so bwa was always run without any of them.
    bwa_opts = []
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30
    bwa_opts += ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')
    bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    dupe_removal_out_metrics = None
    if excludeDuplicates:
        # only allocate the extra temp files when they are actually used
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        # picardOptions defaults to None
        opts = list(picardOptions or [])
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered],
            aln_bam_dupe_processed,
            dupe_removal_out_metrics,
            picardOptions=opts,
            JVMmemory=JVMmemory
        )
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi,
        plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold,
        excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension causes the temporary copy of the fasta file to be removed
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' Generate a coverage plot from an aligned bam file

        in_bam:        aligned BAM; must contain at least one mapped read.
        out_plot_file: where the figure is saved, in plot_format.
        plot_data_style: "filled", "line" or "dots"; any other value draws
                       no data series.
        plot_style:    matplotlib style name used while drawing.
        plot_width, plot_height: figure size in pixels (divided by the DPI
                       to get inches).
        plot_dpi:      DPI; falls back to the current figure's DPI if falsy.
        plot_title:    figure title.
        base_q_threshold, mapping_q_threshold, max_coverage_depth,
        read_length_threshold: when truthy, passed to "samtools depth" as
                       -q, -Q, -m and -l respectively.
        plot_only_non_duplicates: drop reads with flag 1024 before counting.
        out_summary:   if given, the per-position depth table is written
                       there and kept; otherwise a temp file is used and
                       removed afterwards.

        Raises Exception if in_bam has no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam
        )

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        # (the temp file is only allocated on this path, where it is used)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = ['-aa']    # report coverage at "absolutely all" positions
    if base_q_threshold:
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        opts += ["-l", str(read_length_threshold)]

    samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name (column 1) -> list of depths (column 3), in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    # running x offset: segments are laid end to end along the x axis
    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()    # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # the colors for this style; looked up once, inside the style context
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            positions = range(prior_domain_max, domain_max)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(
                    positions,
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(positions, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the color comes from the keyword, so
                # the format string must not name a second, conflicting one
                plt.plot(positions, position_depths, 'o', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)    #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    # only remove the depth table when it is our own temp file
    if out_summary is None:
        os.unlink(coverage_tsv_file)
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height,
                  plot_dpi, plot_title, base_q_threshold, mapping_q_threshold, max_coverage_depth,
                  read_length_threshold, plot_only_non_duplicates=False, out_summary=None):
    '''
        Generate a coverage plot from an aligned bam file.

        The input is sorted and indexed, then per-position depth is written
        to a tab-separated table (segment, position, depth) and plotted with
        one colored series per segment, laid end to end along the x axis.

        Two depth back ends are used:
          - plot_only_non_duplicates true: reads with the 1024 flag set are
            removed and "samtools depth -aa" is run with the thresholds.
          - otherwise: "bedtools genomecov -d" (via BedTool) is used. The
            thresholds do not apply on this path, and a warning is logged
            for each one that was supplied.

        Args:
            in_bam: bam file to plot; must contain at least one mapped read.
            out_plot_file: destination passed to savefig.
            plot_format: image format passed to savefig.
            plot_data_style: "filled", "line" or "dots"; any other value
                draws no data series.
            plot_style: matplotlib style name used as a style context.
            plot_width, plot_height: figure size; divided by the DPI to
                obtain the size in inches.
            plot_dpi: DPI of the saved figure; a falsy value falls back to
                the current figure's DPI.
            plot_title: title drawn above the plot.
            base_q_threshold, mapping_q_threshold, max_coverage_depth,
            read_length_threshold: when truthy, forwarded to
                "samtools depth" as -q, -Q, -m and -l respectively.
            plot_only_non_duplicates: selects the depth back end (see above).
            out_summary: path that receives the depth table. When None, a
                temp file is used and removed once the plot is written.

        Raises:
            Exception: if in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            "The bam file specified appears to have zero mapped reads. "
            "'plot_coverage' requires an aligned bam file. "
            "You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. "
            "\n File: %s" % in_bam)

    summary_is_temp = out_summary is None
    if summary_is_temp:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        # nothing to filter: use the input directly rather than allocating
        # a temp file that would never be written or removed
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering.
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in depth_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order
    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            if not row:
                # tolerate blank lines instead of failing on row[0]
                continue
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # the colors for this style; the same for every segment, so look them up once
        colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])

        domain_max = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            # each segment occupies the x range directly after the previous one
            prior_domain_max = domain_max
            domain_max += len(position_depths)
            x_values = range(prior_domain_max, domain_max)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            if plot_data_style == "filled":
                plt.fill_between(x_values,
                                 position_depths, [0] * len(position_depths),
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the color comes from the keyword, so the
                # format string must not name one as well
                plt.plot(x_values, position_depths, 'o', antialiased=True, color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    # only remove the depth table if it is the temp file allocated above
    if summary_is_temp:
        os.unlink(coverage_tsv_file)
def plot_coverage(in_bam, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height,
                  plot_dpi, plot_title, plot_x_limits, plot_y_limits, base_q_threshold, mapping_q_threshold,
                  max_coverage_depth, read_length_threshold, plot_only_non_duplicates=False,
                  bin_large_plots=False, binning_summary_statistic="max", out_summary=None):
    '''
        Generate a coverage plot from an aligned bam file.

        The input is sorted (only if it is not already sorted) and indexed,
        then per-position depth is written to a tab-separated table
        (segment, position, depth) and plotted with one colored series per
        segment, laid end to end along the x axis.

        Two depth back ends are used:
          - plot_only_non_duplicates true: reads with the 1024 flag set are
            removed and "samtools depth -aa" is run with the thresholds.
          - otherwise: "bedtools genomecov -d" (via BedTool) is used. The
            thresholds do not apply on this path, and a warning is logged
            for each one that was supplied.

        Args:
            in_bam: bam file to plot; must contain at least one mapped read.
            out_plot_file: destination passed to savefig.
            plot_format: image format passed to savefig.
            plot_data_style: "filled", "line" or "dots"; any other value
                draws no data series.
            plot_style: matplotlib style name used as a style context.
            plot_width, plot_height: figure size; divided by the DPI to
                obtain the size in inches.
            plot_dpi: DPI of the saved figure; a falsy value falls back to
                the current figure's DPI.
            plot_title: title drawn above the plot.
            plot_x_limits, plot_y_limits: (min, max) pairs applied to the
                axes, or None to leave the axis limits automatic.
            base_q_threshold, mapping_q_threshold, max_coverage_depth,
            read_length_threshold: when truthy, forwarded to
                "samtools depth" as -q, -Q, -m and -l respectively.
            plot_only_non_duplicates: selects the depth back end (see above).
            bin_large_plots: when true, positions are grouped into bins
                sized so that roughly one bin maps to one horizontal pixel.
            binning_summary_statistic: "min", "max", "mean" or "median";
                the value plotted for each bin. Unrecognized names fall
                back to "max".
            out_summary: path that receives the depth table. When None, a
                temp file is used and removed once the plot is written.

        Raises:
            Exception: if in_bam contains no mapped reads.
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            "The bam file specified appears to have zero mapped reads. "
            "'plot_coverage' requires an aligned bam file. "
            "You can try 'align_and_plot_coverage' if the plot input bam file contains reads "
            "and you don't mind a simple bwa alignment. "
            "\n File: %s" % in_bam)

    summary_is_temp = out_summary is None
    if summary_is_temp:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        # nothing to filter: use the input directly rather than allocating
        # a temp file that would never be written or removed
        bam_dupe_processed = in_bam

    # only sort if not sorted
    if util.file.bam_is_sorted(bam_dupe_processed):
        bam_sorted = bam_dupe_processed
        # when no duplicate filtering happened this is the caller's original
        # in_bam, which must not be deleted
        should_remove_sorted = bool(plot_only_non_duplicates)
    else:
        # allocate the sorted temp file only when a sort is actually needed
        bam_sorted = util.file.mkstempfname('.sorted.bam')
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        should_remove_sorted = True
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # Note: "bedtools genomecov" will count depth including duplicates, but does
    # not expose options for filtering by quality. When duplicates
    # are excluded, "samtools depth" is used which does support quality filtering.
    # We use either samtools or bedtools, because the former ignores marked duplicates
    # from its depth count while bedtools includes them.
    opts = ['-aa']  # report coverage at "absolutely all" positions
    depth_filters = (
        ("-q", base_q_threshold),
        ("-Q", mapping_q_threshold),
        ("-m", max_coverage_depth),
        ("-l", read_length_threshold),
    )
    for flag, threshold in depth_filters:
        if not threshold:
            continue
        if not plot_only_non_duplicates:
            log.warning("'%s' ignored since --plotOnlyNonDuplicates is absent", flag)
        opts += [flag, str(threshold)]

    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    # segment name -> list of depths, in file order; domain_max counts every
    # position across all segments and drives the bin size below
    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            if not row:
                # tolerate blank lines instead of failing on row[0]
                continue
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        summary_label = binning_summary_statistic
        if bin_large_plots:
            # Bin locations and take summary value (maximum or minimum) in each bin
            binning_fn = {"min": min, "max": max, "mean": mean, "median": median}
            if binning_summary_statistic in binning_fn:
                binning_action = binning_fn[binning_summary_statistic]
            else:
                # fall back to the max *function*; a string default here is
                # not callable and would fail on the first bin
                binning_action = max
                summary_label = "max"

            inner_plot_width_inches = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()).width
            # width of actual plot (sans whitespace and y axis text)
            inner_plot_width_px = inner_plot_width_inches * fig.dpi
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max / (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_name, position_depths in segment_depths.items():
                binned_segment_depths[segment_name] = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
            segment_depths = binned_segment_depths

        # Plotting
        # the colors for this style; the same for every segment, so look them up once
        colors = list(matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()['color'])

        domain_max = 0
        for segment_num, position_depths in enumerate(segment_depths.values()):
            # each segment occupies the x range directly after the previous one
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            # pick a color, offset by the segment index
            segment_color = colors[segment_num % len(colors)]

            # scale bin indices back to base-pair coordinates
            x_values = [x * bin_size for x in range(prior_domain_max, domain_max)]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths, [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values, position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                # marker-only format: the color comes from the keyword, so the
                # format string must not name one as well
                matplotlib.pyplot.plot(x_values, position_depths, 'o', antialiased=True, color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(size=bin_size, summary=summary_label)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    # only remove the depth table if it is the temp file allocated above
    if summary_is_temp:
        os.unlink(coverage_tsv_file)
def align_one_rg_bam(self, inBam, refFasta, outBam, rgid=None, rgs=None, options=None, min_qual=0, JVMmemory=None):
    ''' Execute Novoalign on BAM inputs and outputs.
        Aligns exactly one read group: either the only one present in
        inBam, or the one named by rgid when inBam has several.
        Use Picard to sort and index the output BAM.
        If min_qual>0, use Samtools to filter on mapping quality.

        Args:
            inBam: input bam file.
            refFasta: reference fasta; its Novoalign index name is derived
                with self._fasta_to_idx_name.
            outBam: destination of the sorted, indexed alignment.
            rgid: read group ID to align; optional when inBam has exactly
                one read group.
            rgs: read groups of inBam; read from the bam when None.
            options: extra Novoalign arguments; defaults to ["-r", "Random"].
            min_qual: when truthy, alignments are filtered with
                "samtools view -q <min_qual>" before sorting.
            JVMmemory: forwarded to the Picard SortSam tool.

        Returns:
            None. outBam is not written if the selected read group of a
            multi-read-group input contains no reads.

        Raises:
            InvalidBamHeaderError: inBam has no read groups, has several
                and no rgid was given, or does not contain rgid.
    '''
    options = options or ["-r", "Random"]

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = rgs if rgs is not None else samtools.getReadGroups(inBam)
    if not rgs:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = next(iter(rgs))
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    # Strip inBam to just one RG (if necessary)
    input_is_temp = False
    if len(rgs) == 1:
        one_rg_inBam = inBam
    else:
        # strip inBam to one read group
        tmp_bam = util.file.mkstempfname('.onebam.bam')
        samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
        # special exit if this file is empty
        if samtools.count(tmp_bam) == 0:
            os.unlink(tmp_bam)  # do not leave the extracted bam behind
            return
        # simplify BAM header otherwise Novoalign gets confused
        one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
        input_is_temp = True
        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        with open(headerFile, 'wt') as outf:
            for row in samtools.getHeader(inBam):
                if len(row) > 0 and row[0] == '@RG':
                    row_ids = [x[3:] for x in row if x.startswith('ID:')]
                    if row_ids[0] != rgid:
                        # skip all read groups that are not rgid
                        continue
                outf.write('\t'.join(row) + '\n')
        samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
        os.unlink(tmp_bam)
        os.unlink(headerFile)

    # Novoalign
    aligned_reads = util.file.mkstempfname('.novoalign.sam')
    cmd = [self.install_and_get_path(), '-f', one_rg_inBam] + list(map(str, options))
    cmd = cmd + ['-F', 'BAM', '-d', self._fasta_to_idx_name(refFasta), '-o', 'SAM']
    _log.debug(' '.join(cmd))
    with open(aligned_reads, 'wt') as outf:
        util.misc.run_and_save(cmd, outf=outf)

    # the reheadered single-RG bam was created here, so it is ours to remove;
    # the caller's inBam is never deleted
    if input_is_temp:
        os.unlink(one_rg_inBam)

    # Samtools filter (optional)
    if min_qual:
        filtered_bam = util.file.mkstempfname('.filtered.bam')
        cmd = [samtools.install_and_get_path(), 'view', '-b', '-S', '-1', '-q', str(min_qual), aligned_reads]
        _log.debug('%s > %s', ' '.join(cmd), filtered_bam)
        with open(filtered_bam, 'wb') as outf:
            util.misc.run_and_save(cmd, outf=outf)
        os.unlink(aligned_reads)
        aligned_reads = filtered_bam

    # Picard SortSam
    sorter = tools.picard.SortSamTool()
    sorter.execute(
        aligned_reads,
        outBam,
        sort_order='coordinate',
        picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
        JVMmemory=JVMmemory
    )
    # the unsorted intermediate is no longer needed once outBam is written
    os.unlink(aligned_reads)
def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None, min_qual=30, threads=None, JVMmemory=None):
    """
        Performs an alignment of one read group in a bam file
        to a reference fasta file

        The read group is either the only one present in inBam, or the one
        named by rgid when inBam has several. Its @RG header line is handed
        to bwa via '-R' so the output carries the original read group.

        Args:
            inBam: input bam file.
            refDb: reference passed to self.mem.
            outBam: destination of the coordinate-sorted, indexed alignment;
                created as an empty file if no reads survive filtering.
            rgid: read group ID to align; optional when inBam has exactly
                one read group.
            options: extra arguments for self.mem; '-R <RG line>' is appended.
            min_qual: passed to self.mem and, when > 0, also applied
                afterwards with "samtools view -q".
            threads: passed to self.mem.
            JVMmemory: forwarded to the Picard SortSam tool.

        Returns:
            None. outBam is not written if the selected read group of a
            multi-read-group input contains no reads.

        Raises:
            InvalidBamHeaderError: inBam has no read groups, has several
                and no rgid was given, or does not contain rgid.

        TODO: With the addition of a third aligner to viral-ngs, the functionality
        common to this method and to the comparable method in the Novoalign wrapper should
        be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
        aligner, while preserving read groups.
    """
    options = options or []

    samtools = tools.samtools.SamtoolsTool()

    # Require exactly one RG
    rgs = samtools.getReadGroups(inBam)
    if not rgs:
        raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
    elif len(rgs) == 1:
        if not rgid:
            rgid = next(iter(rgs))
    elif not rgid:
        raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
    if rgid not in rgs:
        raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

    # Strip inBam to just one RG (if necessary)
    removeInput = False
    if len(rgs) == 1:
        one_rg_inBam = inBam
        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        samtools.dumpHeader(one_rg_inBam, headerFile)
    else:
        # strip inBam to one read group
        tmp_bam = util.file.mkstempfname('.onebam.bam')
        samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
        # special exit if this file is empty
        if samtools.count(tmp_bam) == 0:
            os.unlink(tmp_bam)  # do not leave the extracted bam behind
            return
        # simplify BAM header otherwise Novoalign gets confused
        one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
        removeInput = True
        # allocated after the early exit above so it cannot be orphaned
        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        with open(headerFile, 'wt') as outf:
            for row in samtools.getHeader(inBam):
                if len(row) > 0 and row[0] == '@RG':
                    row_ids = [x[3:] for x in row if x.startswith('ID:')]
                    if row_ids[0] != rgid:
                        # skip all read groups that are not rgid
                        continue
                outf.write('\t'.join(row) + '\n')
        samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
        os.unlink(tmp_bam)

    # get the read group line to give to BWA (the last @RG line wins)
    readgroup_line = ""
    with open(headerFile) as inf:
        for line in inf:
            if line.startswith("@RG"):
                readgroup_line = line
    # the header file is only needed to recover the RG line
    os.unlink(headerFile)
    assert len(readgroup_line) > 0

    # perform actual alignment
    aln_bam_prefilter = util.file.mkstempfname('.prefiltered.bam')
    # rather than reheader the alignment bam file later so it has the readgroup information
    # from the original bam file, we'll pass the RG line to bwa to write out
    self.mem(
        one_rg_inBam,
        refDb,
        aln_bam_prefilter,
        options=options + ['-R', readgroup_line.rstrip("\n").rstrip("\r")],
        min_qual=min_qual,
        threads=threads
    )

    # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
    # and we can safely delete this file
    # if there was only one RG in the input, we used it directly and should not delete it
    if removeInput:
        os.unlink(one_rg_inBam)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    if min_qual > 0:
        tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
        samtools.view(["-b", "-h", "-q", str(min_qual)], aln_bam_prefilter, tmp_bam_aligned)
        os.unlink(aln_bam_prefilter)
    else:
        # no filtering requested: the unfiltered alignment is the result
        tmp_bam_aligned = aln_bam_prefilter

    # if the aligned bam file contains no reads after filtering
    # just create an empty file
    if samtools.count(tmp_bam_aligned) == 0:
        util.file.touch(outBam)
    else:
        # samtools reheader seems to segfault on some alignments created by bwa
        # so rather than reheader, BWA will write out the RG given to it via '-R'

        # sort
        sorter = tools.picard.SortSamTool()
        sorter.execute(
            tmp_bam_aligned,
            outBam,
            sort_order='coordinate',
            picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
            JVMmemory=JVMmemory
        )

    # the unsorted intermediate is no longer needed once outBam exists
    os.unlink(tmp_bam_aligned)