def multi_to_fastq(sample_name, multi_bam, paired_end, alignment_file, threads):
    """Export the reads of a multi-mapper BAM as FASTQ.

    All output files are written to the directory that contains
    ``alignment_file``.

    Args:
        sample_name: Prefix used for the FASTQ file names.
        multi_bam: Path of the BAM file holding the multi-mapped reads.
        paired_end: When false, a single ``<sample_name>_multimap.fastq`` is
            written. When true, the BAM is first sorted with ``-n`` into
            ``_sorted.multi.bam`` and then split into
            ``<sample_name>_multimap_R1.fastq`` / ``..._R2.fastq``.
        alignment_file: Only its parent directory is used.
        threads: Thread count handed to the sort step (paired-end only).

    Returns:
        None. The function is used for its side effects (files + stdout).
    """
    target_dir = Path(alignment_file).parent
    # NOTE(review): the sorted intermediate carries no sample prefix, so two
    # samples sharing a directory would reuse the same file -- confirm intent.
    srt_multi_bam = Path(target_dir, "_sorted.multi.bam")
    out_SE, out_R1, out_R2 = (
        Path(target_dir, sample_name + suffix)
        for suffix in ("_multimap.fastq", "_multimap_R1.fastq", "_multimap_R2.fastq")
    )

    if paired_end:
        print("Sorting multi-mapped reads...")
        # "-n" is passed so the sort is by read name; presumably this keeps
        # mates adjacent for the two-file FASTQ export below.
        pysam.sort(
            "--threads", str(threads),
            "-n",
            "-o", str(srt_multi_bam),
            str(multi_bam),
        )
        name_sorted = BedTool(srt_multi_bam)
        print(f"Writing multi-mapped reads to {out_R1} {out_R2}...")
        name_sorted.bam_to_fastq(fq=out_R1, fq2=out_R2)
        return

    print(f"Writing multi-mapped reads to {out_SE}...")
    BedTool(multi_bam).bam_to_fastq(fq=out_SE)
def _samtools_count(path, threads=None, exclude_flags=None):
    """Return the number of records ``samtools view -c`` reports for *path*."""
    cmd = ["samtools", "view"]
    if threads is not None:
        cmd += ["-@", str(threads)]
    if exclude_flags is not None:
        cmd += ["-F", exclude_flags]
    cmd += ["-c", path]
    return int(subprocess.check_output(cmd))


def _compress_or_discard_sam(sam_path, threads, keep_temp):
    """Delete *sam_path*, first saving ``<sam_path>.bam`` when *keep_temp* is true."""
    if keep_temp:
        subprocess.call(
            ["samtools", "view", "-@", str(threads), "-bh",
             "-o", sam_path + ".bam", sam_path]
        )
    os.remove(sam_path)


def _sort_sam_to_bam(sam_path, bam_path, threads):
    """Run ``samtools view -bh | samtools sort`` on *sam_path*, writing *bam_path*."""
    view = subprocess.Popen(
        ["samtools", "view", "-@", str(threads), "-bh", sam_path],
        stdout=subprocess.PIPE,
    )
    try:
        subprocess.call(
            ["samtools", "sort", "-@", str(threads), "-o", bam_path, "-"],
            stdin=view.stdout,
        )
    finally:
        view.stdout.close()
        view.wait()


def remap(fq, genome_index_path, genome_index_name, snp_index_path,
          snp_index_name, threads, out_dir, snp_tolerance, keep_temp, mismatches):
    """Re-align first-round unmapped and multi-mapped reads and merge the result.

    The library name is the part of the ``fq`` base name before its first
    ``"."``. The function expects these first-round files to exist already:
    ``<out_dir><lib>.nomapping.bam``, ``<out_dir><lib>.unpaired_mult.bam`` and
    ``<out_dir><lib>.unpaired_uniq.bam``.

    Steps:
      1. Convert the unmapped BAM and the primary alignments of the
         multi-mapper BAM to FASTQ.
      2. Run ``gsnap`` in forced single-end mode on both FASTQ files, with
         output split under the prefix ``<out_dir><lib>_remap``; gsnap's
         stdout/stderr are appended to ``<out_dir>remap_align.log``.
      3. Count reads in each split output, sort the unique hits to BAM and merge
         them with the first-round unique BAM into
         ``<out_dir><lib>.unpaired_uniq_remapMerge.bam``.
      4. Append a summary to ``<out_dir>mapping_stats.txt``.

    Commands are executed as argument lists (no shell), so paths containing
    spaces are safe and log redirection does not depend on which shell
    ``/bin/sh`` is. Shell expansions such as ``~`` inside arguments are
    therefore not performed.

    Args:
        fq: Path of the original FASTQ; only its base name is used.
        genome_index_path: Value passed to ``gsnap -D``.
        genome_index_name: Value passed to ``gsnap -d``.
        snp_index_path: Value passed to ``gsnap -V`` (used if ``snp_tolerance``).
        snp_index_name: Value passed to ``gsnap -v`` (used if ``snp_tolerance``).
        threads: Thread count for samtools (``-@``) and gsnap (``-t``).
        out_dir: Output directory. It is concatenated directly with file
            names, so it must end with a path separator.
        snp_tolerance: When true, the SNP index options are added to gsnap.
        keep_temp: When true, the first-round unmapped/multi BAMs are kept and
            the remapped multi/unmapped SAMs are saved as BAM before deletion.
        mismatches: Value for ``gsnap --max-mismatches``; ``None`` omits it.

    Returns:
        Tuple ``(unique_bam, unique_count, alignstats_dict)``: the merged BAM
        path, its read count, and a ``defaultdict(list)`` with parallel
        ``"Lib"``, ``"Type"`` and ``"Count"`` columns.

    Raises:
        RuntimeError: If a unique, multi or unmapped gsnap output is not found
            (previously surfaced as an ``UnboundLocalError``).
        subprocess.CalledProcessError: If a ``samtools view -c`` count fails.
    """
    bam_ext = ".bam"
    lib_name = fq.split("/")[-1].split(".")[0]

    # --- 1. FASTQ from first-round unmapped reads -------------------------
    nomap = out_dir + lib_name + ".nomapping.bam"
    # Strip only the trailing extension; the old split/join also removed any
    # ".bam" that happened to appear earlier in the path.
    nomap_fastq = nomap[:-len(bam_ext)] + ".fastq"
    BedTool(nomap).bam_to_fastq(fq=nomap_fastq)

    # --- FASTQ from first-round multi-mappers -----------------------------
    # -F 0x904 drops unmapped, secondary and supplementary records so each
    # read is exported once (primary alignment only).
    multi = out_dir + lib_name + ".unpaired_mult.bam"
    temp_multi = out_dir + "temp_multi.bam"
    subprocess.call(
        ["samtools", "view", "-@", str(threads), "-F", "0x904",
         "-o", temp_multi, multi]
    )
    multi_fastq = multi[:-len(bam_ext)] + ".fastq"
    BedTool(temp_multi).bam_to_fastq(fq=multi_fastq)
    os.remove(temp_multi)

    if not keep_temp:
        os.remove(nomap)
        os.remove(multi)

    # --- 2. gsnap realignment ---------------------------------------------
    output_prefix = lib_name + "_remap"
    map_cmd = ["gsnap", "-D", genome_index_path, "-d", genome_index_name]
    if snp_tolerance:
        map_cmd += ["-V", snp_index_path, "-v", snp_index_name]
    map_cmd += [
        "-t", str(threads),
        "--split-output", out_dir + output_prefix,
        "--format=sam",
        "--genome-unk-mismatch=0",
        "--md-lowercase-snp",
        "--ignore-trim-in-filtering", "1",
        "--force-single-end",
    ]
    if mismatches is not None:
        map_cmd += ["--max-mismatches", str(mismatches)]
    map_cmd += [nomap_fastq, multi_fastq]

    log.info(
        "Realigning multi-mapped and unmapped reads to {} with updated SNP index..."
        .format(genome_index_name))
    # Redirect in Python rather than with the bash-only "&>>" operator, which
    # other /bin/sh implementations parse as "run in background".
    with open(out_dir + "remap_align.log", "a") as align_log:
        subprocess.call(map_cmd, stdout=align_log, stderr=subprocess.STDOUT)
    os.remove(multi_fastq)
    os.remove(nomap_fastq)

    # Remove transloc SAM output if no reads are present (often the case).
    transloc_sam = out_dir + output_prefix + ".unpaired_transloc"
    if _samtools_count(transloc_sam) == 0:
        os.remove(transloc_sam)

    # --- 3. count, compress, merge ----------------------------------------
    log.info("Compressing SAM files, sorting, and computing mapping stats...")
    unique_bam = None
    unique_count = multi_count = unmapped_count = None
    # Snapshot the listing first: files are created and deleted in this
    # directory while we iterate.
    for sam in sorted(Path(out_dir).glob(output_prefix + "*")):
        # Classify on the part after the prefix so a library name containing
        # "mult", "uniq", "bam" or "nomapping" cannot confuse the match.
        category = sam.name[len(output_prefix):]
        if "bam" in category:
            continue
        sam_path = out_dir + sam.name

        if "mult" in category:
            multi_count = _samtools_count(sam_path, threads, "0x904")
            _compress_or_discard_sam(sam_path, threads, keep_temp)
        elif "uniq" in category:
            remap_bam = sam_path + ".bam"
            _sort_sam_to_bam(sam_path, remap_bam, threads)
            os.remove(sam_path)
            # Merge the first-round unique BAM with the remapped unique BAM.
            merged_bam = out_dir + lib_name + ".unpaired_uniq_remapMerge.bam"
            subprocess.call(
                ["samtools", "merge", merged_bam, remap_bam,
                 out_dir + lib_name + ".unpaired_uniq.bam"]
            )
            unique_count = _samtools_count(merged_bam, threads)
            unique_bam = merged_bam
        elif "nomapping" in category:
            unmapped_count = _samtools_count(sam_path, threads)
            _compress_or_discard_sam(sam_path, threads, keep_temp)

    missing = [
        label
        for label, value in (("unique", unique_count),
                             ("multi-mapping", multi_count),
                             ("unmapped", unmapped_count))
        if value is None
    ]
    if missing:
        raise RuntimeError(
            "gsnap output not found for: {} (prefix {})".format(
                ", ".join(missing), out_dir + output_prefix))

    # --- 4. stats ---------------------------------------------------------
    total_count = unique_count + multi_count + unmapped_count

    def share(count):
        # An empty library would otherwise raise ZeroDivisionError.
        return count / total_count if total_count else 0.0

    with open(out_dir + "mapping_stats.txt", "a") as stats_out:
        stats_out.write(
            "{}\nUniquely mapped reads: {:d} ({:.0%}) \nMulti-mapping reads: {:d} ({:.0%}) \nUnmapped reads: {:d} ({:.0%}) \nTotal: {:d}\n\n"
            .format(fq.split("/")[-1],
                    unique_count, share(unique_count),
                    multi_count, share(multi_count),
                    unmapped_count, share(unmapped_count),
                    total_count))

    alignstats_dict = defaultdict(list)
    labelled_counts = zip(
        ["Uniquely mapped", "Multi-mapped", "Unmapped"],
        [unique_count, multi_count, unmapped_count],
    )
    for read_type, count in labelled_counts:
        alignstats_dict["Lib"].append(lib_name)
        alignstats_dict["Type"].append(read_type)
        alignstats_dict["Count"].append(count)
    return (unique_bam, unique_count, alignstats_dict)