def rmdup(self):
    """Remove duplicates from ``self.bamfn``, replacing the file in place.

    ``pysam.rmdup`` is called with ``-s`` and writes to a temporary path
    built from ``self.basedir`` and ``temp_filename()``.  Exactly one file
    whose name starts with that temporary path must exist afterwards; it is
    then renamed over ``self.bamfn``.

    Raises:
        AssertionError: if the number of files matching the temporary path
            prefix is not exactly one.
    """
    print("Removing duplicates in %s" % self.bamfn)

    scratch_prefix = os.path.join(self.basedir, temp_filename())
    pysam.rmdup("-s", self.bamfn, scratch_prefix)

    # The output is located by globbing on the prefix rather than assuming
    # the exact name that was written.
    produced = glob.glob(scratch_prefix + '*')
    assert len(produced) == 1, "Unexpected number of temporary output files: %r" % produced

    # Replace the original BAM with the de-duplicated one.
    os.rename(produced[0], self.bamfn)
def proc_sam(arg):
    """Filter, optionally de-duplicate, sort, index and flagstat one SAM file.

    Intermediate and output files are written next to the input, named from
    the input basename up to its first ``.sam``:

    * ``<prefix>_mapped.sam`` -- reads with ``is_unmapped`` false (only
      created if it does not already exist; removed after sorting/rmdup).
    * ``<prefix>_rmdup.sam``  -- output of ``pysam.rmdup("-S", ...)``
      (removed after sorting).
    * ``<prefix>_sort``       -- prefix handed to ``pysam.sort``; the code
      then indexes ``<prefix>_sort.sam``.
    * ``<samfile>_stat`` and ``<prefix>_sort.sam_stat`` -- flagstat output.

    Args:
        arg: Sequence whose first item is the SAM file path and whose second
            item is the duplicate-removal switch.  The switch is compared
            against the *string* ``"True"``; any other value skips rmdup.
            A single packed argument is presumably used so the function can
            be mapped over a worker pool -- confirm against the caller.

    Returns:
        None.
    """
    samfile = arg[0]
    rmdup = arg[1]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(samfile)
    print(rmdup)

    # os.path.dirname returns "" for a bare filename, so outputs land next to
    # the input instead of in the filesystem root.
    sam_dir = os.path.dirname(samfile)
    sam_prefix = os.path.basename(samfile).split(".sam")[0]
    mapped_sam = os.path.join(sam_dir, sam_prefix + "_mapped.sam")
    rmdup_sam = os.path.join(sam_dir, sam_prefix + "_rmdup.sam")
    sort_sam = os.path.join(sam_dir, sam_prefix + "_sort")

    if not os.path.exists(mapped_sam):
        print("Removing unmapped...")
        sam = pysam.Samfile(samfile, 'r')
        try:
            mb = pysam.Samfile(mapped_sam, 'w', template=sam)
            try:
                for read in sam:
                    if not read.is_unmapped:
                        mb.write(read)
            finally:
                mb.close()
        finally:
            # Release the input handle as well as the output handle.
            sam.close()
        print("Finished removing unmapped.")

    if not os.path.exists(rmdup_sam) and rmdup == "True":
        print("Removing duplicates...")
        pysam.rmdup("-S", mapped_sam, rmdup_sam)
        os.remove(mapped_sam)
        print("Sorting...")
        pysam.sort(rmdup_sam, sort_sam)
        os.remove(rmdup_sam)
    else:
        print("Sorting...")
        pysam.sort(mapped_sam, sort_sam)
        os.remove(mapped_sam)

    print("Indexing...")
    sort_sam = sort_sam + ".sam"
    pysam.index(sort_sam)

    # Write flagstat output for the raw input and for the sorted result.  The
    # context manager guarantees each stat file is flushed and closed.
    for source, stat_path in ((samfile, samfile + "_stat"),
                              (sort_sam, sort_sam + "_stat")):
        with open(stat_path, 'w') as stat_file:
            for line in pysam.flagstat(source):
                stat_file.write(line)
def trim_reads(bamfile):
    """Remove PCR duplicate reads from a BAM file.

    Calls ``rmdup("-S", bamfile, outfile)``.  ``rmdup`` is not defined in
    this function; presumably it is pysam's samtools ``rmdup`` wrapper --
    confirm against the file's imports.

    Args:
        bamfile: Location of the BAM file on disk.

    Returns:
        The output path: ``bamfile`` with its final extension replaced by
        ``.rmdup.bam`` (``sample.bam`` -> ``sample.rmdup.bam``).  A path
        without an extension simply gets ``.rmdup.bam`` appended.

    Raises:
        NameError: if ``bamfile`` does not exist.  The exception type is
            kept as-is so existing callers that catch it keep working.
    """
    if not os.path.exists(bamfile):
        raise NameError("file %s does not exist" % (bamfile))

    # splitext only strips an extension from the final path component, so a
    # dot in a directory name or an extension-less file cannot truncate the
    # output path.
    stem, _extension = os.path.splitext(bamfile)
    outfile = stem + ".rmdup.bam"

    rmdup("-S", bamfile, outfile)
    return outfile
def rm_dup(self, inbam, outbam):
    """Remove PCR duplicates by handing both paths to ``pysam.rmdup``.

    Args:
        inbam: First argument passed to ``pysam.rmdup`` (the input BAM).
        outbam: Second argument passed to ``pysam.rmdup`` (the output BAM).

    Returns:
        None.
    """
    pysam.rmdup(inbam, outbam)
def rm_dup(self, inbam, outbam):
    """Run ``pysam.rmdup`` on ``inbam``, writing the result to ``outbam``.

    No extra options are passed to ``pysam.rmdup``; the method has no
    return value.
    """
    pysam.rmdup(inbam, outbam)
    return None
def _count_read_names(alignment_filename):
    """Return the raw ``wc -l`` output counting read-name runs in a SAM/BAM file.

    The first column of every line whose first field does not contain ``@``
    is passed through ``uniq -c | wc -l``.  BAM input (``.bam`` suffix) is
    first converted with ``samtools view``; anything else is read by awk
    directly.  The decoded output is returned unchanged, including the
    trailing newline emitted by ``wc``.
    """
    import shlex  # local import: this file's top-level imports are not visible here

    # The filename is quoted so spaces or shell metacharacters in the path
    # cannot alter the pipeline.
    quoted = shlex.quote(alignment_filename)
    name_filter = "awk ' $1 !~ /@/ {print $1}' "
    if alignment_filename.endswith(".bam"):
        command = "samtools view " + quoted + " | " + name_filter + "| uniq -c|wc -l"
    else:
        command = name_filter + quoted + "| uniq -c|wc -l"
    # check_output waits for the pipeline to finish before returning.
    return subprocess.check_output(command, shell=True).decode("utf-8")


def _flagstat_lines(bam_filename, flagstat_filename):
    """Write ``samtools flagstat`` output to *flagstat_filename* and return its lines."""
    # An argument list with redirected stdout avoids building a shell string.
    with open(flagstat_filename, "w") as flagstat_out:
        subprocess.check_call(["samtools", "flagstat", bam_filename],
                              stdout=flagstat_out)
    with open(flagstat_filename) as flagstat_in:
        return flagstat_in.readlines()


def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    total_pairs = int(float(total))
    if total_pairs == 0:
        return 0.0
    return 100 * (int(float(count)) / total_pairs)


def filter_main(fastq1, fastq2, bwa_index, mapq, outdir, prefix, threads, to_file=False):
    """Map both read files, pair/filter the alignments, de-duplicate and write QC.

    Steps, in order:

    1. Replace ``sys.stdout`` with ``logger.Logger`` writing to
       ``<outdir>/<prefix>.feather.log`` and validate the arguments.
    2. Run ``bwa_mem`` for each input ending in ``.fastq`` / ``fastq.gz``;
       inputs ending in ``.sam`` / ``.bam`` are accepted without mapping.
    3. Count read names in the first alignment file.
    4. Ensure both alignment files are query-name sorted (sort, or copy when
       the header already says so), merge them and call ``filter_pair_reads``.
    5. Coordinate-sort the paired BAM, run ``pysam.rmdup``, run flagstat on
       the sorted and de-duplicated files, then name-sort the result.
    6. Write the summary to ``<outdir>/<prefix>.feather.qc``.

    Args:
        fastq1, fastq2: Input files (fastq, fastq.gz, sam or bam).
        bwa_index: Passed through to ``bwa_mem``.
        mapq: Passed to ``filter_pair_reads`` and shown in the QC report.
        outdir: Directory for the log and QC files.
        prefix: Filename prefix for the log and QC files.
        threads: Thread count handed to ``bwa_mem`` and ``samtools sort``.
        to_file: Accepted for interface compatibility; not used here.

    Returns:
        Path of the name-sorted, de-duplicated BAM
        (``<paired_filename>.srtn.rmdup.bam``).

    Raises:
        SystemExit: if an input has an unsupported extension.
        subprocess.CalledProcessError: if a samtools/shell command exits
            with a non-zero status.
    """
    sys.stdout = logger.Logger(outdir + "/" + prefix + ".feather.log")
    print(time.ctime() + " starting mapping and filtering operation")
    check_arguments(fastq1, fastq2, bwa_index, mapq, threads)
    (paired_filename, bwa1_filename, bwa2_filename, bwa1_sorted_filename,
     bwa2_sorted_filename, combined_bwa_filename, qc_filename) = set_filenames(
         fastq1, fastq2, outdir, prefix)

    # running bwa mem
    for fastq, bwa_filename in [(fastq1, bwa1_filename), (fastq2, bwa2_filename)]:
        if fastq.endswith((".fastq", "fastq.gz")):
            bwa_mem(fastq, bwa_index, threads, bwa_filename)
        elif not fastq.endswith((".sam", ".bam")):
            sys.exit(
                "Error: Input file for filtering should be of type fastq, fastq.gz, sam, or bam. Exiting!"
            )

    # Raw wc output; keeps its trailing newline, which terminates the first
    # line of the QC report below.
    read_count = _count_read_names(bwa1_filename)

    # pairing and filtering alignments for chimeric reads
    for bwa_filename, bwa_sorted_filename in (
            [bwa1_filename, bwa1_sorted_filename],
            [bwa2_filename, bwa2_sorted_filename]):
        bwa = pysam.AlignmentFile(bwa_filename)
        try:
            already_name_sorted = is_sorted_queryname(bwa.header)
        finally:
            # Only the header is needed; release the handle before samtools
            # or copyfile touches the same file.
            bwa.close()
        if not already_name_sorted:
            print(time.ctime() + " calling samtools sort for " + bwa_filename +
                  " storing in " + bwa_sorted_filename)
            pysam.sort("-o", bwa_sorted_filename, "-n", "-@", str(threads),
                       bwa_filename)
        else:
            copyfile(bwa_filename, bwa_sorted_filename)

    print(time.ctime() + " merging " + bwa1_sorted_filename + " and " +
          bwa2_sorted_filename)
    pysam.merge("-n", "-f", combined_bwa_filename, bwa1_sorted_filename,
                bwa2_sorted_filename)

    print(time.ctime() + " filtering and pairing reads")
    filter_pair_reads(combined_bwa_filename, mapq, paired_filename, qc_filename)

    print(time.ctime() + " paired bam file generated. Sorting by coordinates.")
    pysam.sort("-o", paired_filename + ".srt.bam", "-@", str(threads),
               paired_filename + ".bam")

    print(time.ctime() + " calling samtools rmdup")
    pysam.rmdup(paired_filename + ".srt.bam", paired_filename + ".rmdup.bam")

    # NOTE(review): index 7 is presumably the "read2" line of the flagstat
    # report, i.e. the number of pairs. The line layout depends on the
    # installed samtools version -- confirm before upgrading samtools.
    print(time.ctime() + " calling samtools flagstat on mapped file")
    lines = _flagstat_lines(paired_filename + ".srt.bam",
                            paired_filename + ".srt.bam.flagstat")
    uniquely_mapped_count = lines[7].split()[0]

    print(time.ctime() + " calling samtools flagstat on mapped and duplicate-removed file")
    lines = _flagstat_lines(paired_filename + ".rmdup.bam",
                            paired_filename + ".rmdup.flagstat")
    duprmd_count = lines[7].split()[0]

    print(time.ctime() + " calling samtools sort for sorting by query names")
    pysam.sort("-o", paired_filename + ".srtn.rmdup.bam", "-@", str(threads),
               "-n", paired_filename + ".rmdup.bam")

    print(time.ctime() + " finishing filtering")
    qc_filename = outdir + "/" + prefix + ".feather.qc"
    with open(qc_filename, 'w') as outfile:
        outfile.write("{0:70} {1}".format("number of sequencing pairs",
                                          str(read_count)))
        outfile.write("{0:70} {1} ".format(
            "number of unqiuely mapped pairs (MAPQ >= " + str(mapq) + ")",
            str(uniquely_mapped_count)))
        outfile.write("\t({0:.2f}%)\n".format(
            _percent_of(uniquely_mapped_count, read_count)))
        outfile.write("{0:70} {1} ".format(
            "number of pairs after duplicate removal", str(duprmd_count)))
        outfile.write("\t({0:.2f}%)\n".format(
            _percent_of(duprmd_count, read_count)))
    return (paired_filename + ".srtn.rmdup.bam")