def write_sub_bam(chrom_list, used_bam_file_tmp, exclude_bam_file_tmp, out_dir,
                  total_modify_reads_file, total_delete_reads_file,
                  total_add_reads_file, process):
    """Split the temporary 'used' BAM per chromosome in parallel, then merge,
    sort and index the results.

    Each worker runs write_bam_byChr for one chromosome, producing a
    per-chromosome used/exclude BAM pair under out_dir.

    Returns (used_sort_bam_file, exclude_bam_file).
    """
    pool = Pool(int(process))
    exclude_parts = [exclude_bam_file_tmp]
    used_parts = []
    for chrom in chrom_list:
        chrom_exclude_bam = "%s/exclude_%s.bam" % (out_dir, chrom)
        chrom_used_bam = "%s/used_%s.bam" % (out_dir, chrom)
        exclude_parts.append(chrom_exclude_bam)
        used_parts.append(chrom_used_bam)
        pool.apply_async(
            write_bam_byChr,
            args=(used_bam_file_tmp, chrom, chrom_exclude_bam, chrom_used_bam,
                  total_modify_reads_file, total_delete_reads_file,
                  total_add_reads_file))
    pool.close()
    pool.join()

    # Everything that must stay untouched ends up in a single exclude.bam.
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")
    bamMerge(exclude_parts, exclude_bam_file)

    # Merging is pointless when only one chromosome was processed.
    if len(used_parts) == 1:
        used_bam_file = used_parts[0]
    else:
        used_bam_file = os.path.join(out_dir, "used.bam")
        bamMerge(used_parts, used_bam_file)

    bamSort(used_bam_file, os.path.join(out_dir, "used.sort"))
    used_sort_bam_file = os.path.join(out_dir, "used.sort.bam")
    bamIndex(used_sort_bam_file)
    return used_sort_bam_file, exclude_bam_file
def get_reads_by_region(bam_file, sv_list, out_dir):
    """Partition bam_file by the regions covered by sv_list.

    Writes a BED of the considered regions, then splits the input into a
    temporary 'used' BAM (reads overlapping those regions) and a temporary
    'exclude' BAM (all remaining reads), and indexes the used BAM.

    Returns (chrom_list, used_bam_file_tmp, exclude_bam_file_tmp).
    """
    bed_path = os.path.join(out_dir, "consider_region.bed")
    chroms = write_region_bed(bed_path, sv_list)
    excluded_tmp = os.path.join(out_dir, "exclude_tmp.bam")
    used_tmp = os.path.join(out_dir, "used_tmp.bam")
    getRegionReads(bam_file, bed_path, used_tmp, excluded_tmp)
    # Downstream per-chromosome fetches require an index on the used BAM.
    bamIndex(used_tmp)
    return chroms, used_tmp, excluded_tmp
def main(run_args): start_time = time.asctime(time.localtime(time.time())) # print start_time temp_out_dir = os.path.join(run_args.outdir, "tempDir") os.system("mkdir -p %s" % temp_out_dir) invalid_log_file = os.path.join(run_args.outdir, 'invalid_mutation.txt') invalid_log = InvalidLog(invalid_log_file) # step1: deal with mutfile and get haplotypes print "step1: deal with mutfile and get haplotypes" haplotype_list = get_haplotypes(run_args.bamfile, run_args.reffasta, run_args.mutfile, int(run_args.haplosize), float(run_args.snpfrac), invalid_log) # step2: deal haplotypes and get total_chosen_reads, total_chosen_reads_muts print "step2: deal haplotypes and get total_chosen_reads, total_chosen_reads_muts" success_list_file = os.path.join(run_args.outdir, 'success_list.txt') total_chosen_reads, total_chosen_reads_muts = deal_haplotype_multi( run_args.bamfile, haplotype_list, temp_out_dir, run_args.reffasta, int(run_args.process), int(run_args.mindepth), int(run_args.minmutreads), int(run_args.minmapq), float(run_args.diffcover), run_args.single, run_args.multmapfilter, run_args.aligner, run_args.alignerIndex, invalid_log, success_list_file) invalid_log.close() if len(total_chosen_reads) == 0: print "Warning: No reads to deal with of all these sv, checkout your sv file" return # step3: modify the reads in total_chosen_reads itself print "step3: modify the reads in total_chosen_reads itself" reads_modify(total_chosen_reads, total_chosen_reads_muts, run_args.reffasta, int(run_args.process)) # step4: write edited reads to edited file and exclude reads to exclude file ,than remap edited file to reference print "step4: write edited reads to edited file and exclude reads to exclude file ,than remap edited file to reference" edit_remap_bam_file, exclude_bam_file = reads_replace( run_args.bamfile, total_chosen_reads, run_args.seqer, run_args.floworder, run_args.libkey, run_args.barcode, run_args.tag, temp_out_dir, run_args.aligner, run_args.alignerIndex, run_args.single) # 
step5: merge remap.edit.bam and exclude exclude.bam and sort print "step5: merge remap.edit.bam and exclude exclude.bam and sort" # edit_remap_bam_file, exclude_bam_file = os.path.join(temp_out_dir, "edit.remap.sort.bam"), os.path.join( # temp_out_dir, "exclude.bam") out_bam_file = os.path.join(run_args.outdir, "edit.sorted.bam") bamMerge([edit_remap_bam_file, exclude_bam_file], out_bam_file) bamIndex(out_bam_file) end_time = time.asctime(time.localtime(time.time())) # speed_time = end_time - start_time print "Edit Bam is completed! Result see %s and valid mutation see %s. Invalid mutation can't be spike in see %s." % ( out_bam_file, success_list_file, invalid_log_file)
def main(run_args): invalid_log_file = os.path.join(run_args.outdir, 'invalid_mutation.txt') invalid_log = log(invalid_log_file) temp_out_dir = os.path.join(run_args.out_dir, "tempDir") os.system("mkdir -p %s" % temp_out_dir) # step1: deal with mutfile and get haplotypes haplotype_list = get_haplotypes(run_args.bamfile, run_args.reffasta, run_args.mutfile, int(run_args.haplosize), float(run_args.snpfrac), invalid_log) # step2: deal haplotypes and get total_chosen_reads, total_chosen_reads_muts total_chosen_reads, total_chosen_reads_muts = deal_haplotype_multi(run_args.bam_file, haplotype_list, temp_out_dir, run_args.reffasta, int(run_args.process), int(run_args.mindepth), int(run_args.minmutreads), int(run_args.minmapq), float(run_args.diffcover), run_args.single, run_args.is_multmapfilter, run_args.aligner, run_args.alignerIndex, invalid_log) # step3: modify the reads in total_chosen_reads itself reads_modify(total_chosen_reads, total_chosen_reads_muts, run_args.reffasta, int(run_args.prceoss)) # step4: write edited reads to edited file and exclude reads to exclude file ,than remap edited file to reference edit_remap_bam_file, exclude_bam_file = reads_replace(run_args.bam_file, total_chosen_reads, run_args.seqer, run_args.floworder, run_args.lib_key, run_args.barcode, run_args.tag, temp_out_dir, run_args.aligner, run_args.aligner_index, run_args.single) # step5: merge remap.edit.bam and exclude exclude.bam and sort bamIndex(exclude_bam_file) out_bam_file = os.path.join(temp_out_dir, "edit_exclude.bam") bamMerge([edit_remap_bam_file, exclude_bam_file], out_bam_file) out_sort_bam_file = os.path.join(run_args.outdir, "edit.sort.bam") out_sort_bam_file_prefix = os.path.join(run_args.outdir, "edit.sort") bamSort(out_bam_file, out_sort_bam_file_prefix) bamIndex(out_sort_bam_file) print "Edit Bam is completed! Result see %s and invalid mutation can't be spike in see %s." % ( out_sort_bam_file, invalid_log_file)
def write_sub_bam(chrom_list, used_bam_file_tmp, exclude_bam_file_tmp, out_dir,
                  total_modify_readname_list, total_delete_readname_list,
                  total_add_readname_list, process):
    """Split the temporary 'used' BAM per chromosome in parallel, merge/sort/
    index the result, and load every used read into memory.

    Each worker runs write_bam_byChr for one chromosome, producing a
    per-chromosome used/exclude BAM pair under out_dir.

    Returns (used_reads, used_bam_file, exclude_bam_file) where used_reads
    maps getKeyName(read) -> pysam read for every read in the sorted used BAM.
    """
    pool = Pool(int(process))
    exclude_parts = [exclude_bam_file_tmp]
    used_parts = []
    for chrom in chrom_list:
        chrom_exclude_bam = "%s/exclude_%s.bam" % (out_dir, chrom)
        chrom_used_bam = "%s/used_%s.bam" % (out_dir, chrom)
        exclude_parts.append(chrom_exclude_bam)
        used_parts.append(chrom_used_bam)
        pool.apply_async(
            write_bam_byChr,
            args=(used_bam_file_tmp, chrom, chrom_exclude_bam, chrom_used_bam,
                  total_modify_readname_list, total_delete_readname_list,
                  total_add_readname_list))
    pool.close()
    pool.join()

    # Everything that must stay untouched ends up in a single exclude.bam.
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")
    bamMerge(exclude_parts, exclude_bam_file)

    # Merging is pointless when only one chromosome was processed.
    if len(used_parts) == 1:
        used_bam_file = used_parts[0]
    else:
        used_bam_file = os.path.join(out_dir, "used.bam")
        bamMerge(used_parts, used_bam_file)

    bamSort(used_bam_file, os.path.join(out_dir, "used.sort"))
    used_sort_bam_file = os.path.join(out_dir, "used.sort.bam")
    bamIndex(used_sort_bam_file)

    # Index every used read by its key name for fast lookup downstream.
    used_reads = {}
    used_bam = pysam.AlignmentFile(used_sort_bam_file, 'rb')
    for read in used_bam.fetch():
        used_reads[getKeyName(read)] = read
    used_bam.close()
    return used_reads, used_bam_file, exclude_bam_file
def main(run_args): start_time = time.asctime(time.localtime(time.time())) print start_time if not os.path.exists(run_args.outdir): os.mkdir(run_args.outdir) invalid_log_file = os.path.join(run_args.outdir, 'invalid_mutation.txt') invalid_log = InvalidLog(invalid_log_file) run_log_file = os.path.join(run_args.outdir, 'run.log') run_log = RunLog(run_log_file) temp_out_dir = os.path.join(run_args.outdir, "tempDir") if not os.path.exists(temp_out_dir): os.mkdir(temp_out_dir) # step0: prepare sv list sv_list = check_sv_file(run_args.svfile, run_args.reffasta, invalid_log) if run_args.debug: print len(sv_list) if not sv_list: exit("no sv list to deal with") # step1: get insert size of paired reads insert_size = get_insertSize_range(run_args.bamfile, run_args.readlength, run_args.single, run_args.debug) if run_args.debug: print insert_size # step2: deal with sv total_modify_reads, total_delete_reads, total_add_reads = deal_sv( run_args.bamfile, run_args.reffasta, sv_list, run_args.single, int(run_args.minmapq), run_args.multmapfilter, int(run_args.mindepth), int(run_args.minmutreads), int(run_args.readlength), temp_out_dir, insert_size, invalid_log, run_log) # exit total_deal_reads_num = len(total_modify_reads) + len( total_delete_reads) + len(total_add_reads) if total_deal_reads_num == 0: # run_log.info("No reads to deal with of all these sv, check out your sv file") print "Warning: No reads to deal with of all these sv, checkout your sv file" return if run_args.debug: print len(total_modify_reads), len(total_delete_reads), len( total_add_reads) # step3: merge edit reads total_modify_readname_list, total_delete_readname_list, total_add_readname_list = merge_edit_reads( total_modify_reads, total_add_reads, total_delete_reads) if run_args.debug: print "list num: ", len(total_modify_readname_list), len( total_delete_readname_list), len(total_add_readname_list) # step4: get reads by region bed and write bam file chrom_list, used_bam_file_tmp, exclude_bam_file_tmp = 
get_reads_by_region( run_args.bamfile, sv_list, temp_out_dir) # write reads which may probably used to used.bam and reads should not be used to exclude.bam used_reads, used_bam_file, exclude_bam_file = write_sub_bam( chrom_list, used_bam_file_tmp, exclude_bam_file_tmp, temp_out_dir, total_modify_readname_list, total_delete_readname_list, total_add_readname_list, int(run_args.process)) if run_args.debug: print "used & exclude bam:", used_bam_file, exclude_bam_file # step5: merge edited reads and remap to new bam, consider about the tag, RG, life reads edit_remap_bam_file = merge_edit_bam( run_args.bamfile, temp_out_dir, run_args.single, total_modify_reads, total_add_reads, used_reads, run_args.seqer, run_args.aligner, run_args.aligner_index, run_args.flow_order, run_args.lib_key, run_args.barcode, run_args.tag) if run_args.debug: print "edit remap bam:", edit_remap_bam_file # step6: read remapped edit reads to dict out_bam_file = os.path.join(temp_out_dir, "edit_exclude.bam") bamMerge([edit_remap_bam_file, exclude_bam_file], out_bam_file) bamIndex(out_bam_file) out_sort_bam_file = os.path.join(run_args.outdir, "edit.sort.bam") out_sort_bam_file_prefix = os.path.join(run_args.outdir, "edit.sort") bamSort(out_bam_file, out_sort_bam_file_prefix) bamIndex(out_sort_bam_file) end_time = time.asctime(time.localtime(time.time())) print end_time
def main(run_args): start_time = time.asctime(time.localtime(time.time())) # print start_time if not os.path.exists(run_args.outdir): os.mkdir(run_args.outdir) invalid_log_file = os.path.join(run_args.outdir, 'invalid_mutation.txt') invalid_log = InvalidLog(invalid_log_file) run_log_file = os.path.join(run_args.outdir, 'run.log') run_log = RunLog(run_log_file) temp_out_dir = os.path.join(run_args.outdir, "tempDir") if not os.path.exists(temp_out_dir): os.mkdir(temp_out_dir) # step0: prepare sv list sv_list = check_sv_file(run_args.svfile, run_args.reffasta, invalid_log) if not sv_list: exit("no sv list to deal with") # step1: get insert size of paired reads print "step1: get insert size of paired reads" insert_size = get_insertSize_range(run_args.bamfile, run_args.readlength, run_args.single) # step2: deal with sv and get total edited reads print "step2: deal with sv and get total edited reads" success_file = os.path.join(run_args.outdir, 'success_list.txt') total_modify_reads_file, total_delete_reads_file, total_add_reads_file, total_modify_list, total_delete_list, total_add_list = deal_sv( run_args.bamfile, run_args.reffasta, sv_list, run_args.single, int(run_args.minmapq), run_args.multmapfilter, int(run_args.mindepth), int(run_args.minmutreads), int(run_args.readlength), temp_out_dir, insert_size, invalid_log, run_log, success_file) invalid_log.close() # step3: get reads by region bed and write bam file print "step3: get reads by region bed and write bam file" chrom_list, used_bam_file_tmp, exclude_bam_file_tmp = get_reads_by_region( run_args.bamfile, sv_list, temp_out_dir) # write reads which may probably used to used.bam and reads should not be used to exclude.bam used_bam_file, exclude_bam_file = write_sub_bam( chrom_list, used_bam_file_tmp, exclude_bam_file_tmp, temp_out_dir, total_modify_reads_file, total_delete_reads_file, total_add_reads_file, int(run_args.process)) # step4: merge edited reads and remap to new bam, consider about the tag, RG, life reads 
print "step4: merge edited reads and remap to new bam, consider about the tag, RG, life reads" edit_remap_bam_file = merge_edit_bam( run_args.bamfile, temp_out_dir, run_args.single, total_modify_reads_file, total_add_reads_file, used_bam_file, total_modify_list, total_add_list, run_args.seqer, run_args.aligner, run_args.alignerIndex, run_args.floworder, run_args.libkey, run_args.barcode, run_args.tag) # step5: remapped edit reads and merge print "step5: remapped edit reads and merge" out_bam_file = os.path.join(run_args.outdir, "edit.sorted.bam") bamMerge([edit_remap_bam_file, exclude_bam_file], out_bam_file) bamIndex(out_bam_file) end_time = time.asctime(time.localtime(time.time())) # print end_time # speed_time = end_time - start_time print "Edit Bam is completed! Result see %s and valid mutation see %s. Invalid mutation can't be spike in see %s." % ( out_bam_file, success_file, invalid_log_file)
def merge_edit_bam(bam_file, out_dir, is_single, total_modify_reads,
                   total_add_reads, used_reads, seqer, aligner, aligner_index,
                   flow_order, lib_key, barcode, tag):
    """Write the edited/added reads to edit.bam under fresh read names,
    remap them with the given aligner, and (for paired data) restore read
    groups on the remapped BAM.

    Each entry of total_modify_reads / total_add_reads is indexable:
    element [0] is the first read, and for paired data element [1] is the
    mate. used_reads maps getKeyName(read) -> the original pysam read the
    new record is cloned from. A name-conversion table is written to
    readname_convert.txt in out_dir.

    Returns the path of the remapped (and, for paired data, RG-fixed,
    indexed) BAM.

    NOTE(review): `bam` is opened but never closed on any path -- handle
    leak; it is still needed by bamAddRG near the end, so it cannot simply
    be closed early.
    """
    bam = pysam.AlignmentFile(bam_file, 'rb')
    edit_bam_file = os.path.join(out_dir, "edit.bam")
    # New records inherit the original header via template=bam.
    edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
    readname_convert_file = os.path.join(out_dir, "readname_convert.txt")
    fout_convert = open(readname_convert_file, 'w')
    # new_name -> {strand: new read}; consumed by bamAddRG for paired data.
    edit_bam_reads = {}
    if is_single:
        # Single-end: modified reads keep the prefix of their old name
        # (before the first ':') so platform information survives renaming.
        for read_pair in total_modify_reads:
            read1 = read_pair[0]
            keyname_read1 = getKeyName(read1)
            orig_read1 = used_reads[keyname_read1]
            # Clone the original alignment, then splice in the edited
            # sequence/qualities from read1.
            new_read1 = copy.deepcopy(orig_read1)
            new_read1.query_sequence = read1.query_sequence
            new_read1.query_qualities = read1.query_qualities
            new_name = read1.query_name.split(
                ":")[0] + ":" + get_new_readname()
            new_read1.query_name = new_name
            if seqer == "life":
                # Life/IonTorrent reads need flow-order specific fixes.
                new_read1 = deal_life_reads(new_read1, flow_order, lib_key,
                                            barcode)
            if tag:
                new_read1 = add_tag(new_read1)
            edit_bam.write(new_read1)
            fout_convert.write(
                "%s: %s, %s, %s-%s\n" %
                (new_name, orig_read1.query_name, new_read1.is_read1,
                 new_read1.reference_start, new_read1.reference_end))
            strand = getReadStrand(new_read1)
            if new_name not in edit_bam_reads:
                edit_bam_reads[new_name] = dict()
            edit_bam_reads[new_name][strand] = new_read1
        # Added reads are plain copies of an existing read under a new name
        # (no sequence edit, no life/tag processing on this path).
        for read_pair in total_add_reads:
            read1 = read_pair[0]
            keyname_read1 = getKeyName(read1)
            orig_read1 = used_reads[keyname_read1]
            new_read1 = copy.deepcopy(orig_read1)
            new_name = get_new_readname()
            new_read1.query_name = new_name
            edit_bam.write(new_read1)
            fout_convert.write(
                "%s: %s, %s, %s-%s\n" %
                (new_name, orig_read1.query_name, new_read1.is_read1,
                 new_read1.reference_start, new_read1.reference_end))
            strand = getReadStrand(new_read1)
            if new_name not in edit_bam_reads:
                edit_bam_reads[new_name] = dict()
            edit_bam_reads[new_name][strand] = new_read1
    else:
        # Paired-end: modified and added pairs are handled identically --
        # both mates are cloned, re-sequenced and renamed to one shared
        # fresh name so the aligner re-pairs them.
        for read_pair in total_modify_reads + total_add_reads:
            read1 = read_pair[0]
            read2 = read_pair[1]
            keyname_read1 = getKeyName(read1)
            keyname_read2 = getKeyName(read2)
            orig_read1 = used_reads[keyname_read1]
            orig_read2 = used_reads[keyname_read2]
            orig_read1_name = orig_read1.query_name
            orig_read2_name = orig_read2.query_name
            new_read1 = copy.deepcopy(orig_read1)
            new_read2 = copy.deepcopy(orig_read2)
            new_read1.query_sequence = read1.query_sequence
            new_read1.query_qualities = read1.query_qualities
            new_read2.query_sequence = read2.query_sequence
            new_read2.query_qualities = read2.query_qualities
            new_name = get_new_readname()
            new_read1.query_name = new_name
            new_read2.query_name = new_name
            strand1 = getReadStrand(new_read1)
            strand2 = getReadStrand(new_read2)
            if new_name not in edit_bam_reads:
                edit_bam_reads[new_name] = dict()
            # NOTE: stored BEFORE add_tag here (the single-end branch stores
            # after) -- preserved as-is; bamAddRG sees the un-tagged objects
            # only if add_tag returns a different object.
            edit_bam_reads[new_name][strand1] = new_read1
            edit_bam_reads[new_name][strand2] = new_read2
            if tag:
                new_read1 = add_tag(new_read1)
                new_read2 = add_tag(new_read2)
            fout_convert.write("%s: %s, %s, %s, %s, %s-%s, %s-%s\n" % (
                new_name,
                orig_read1_name,
                orig_read2_name,
                new_read1.is_read1,
                new_read2.is_read2,
                new_read1.reference_start,
                new_read1.reference_end,
                new_read2.reference_start,
                new_read2.reference_end,
            ))
            edit_bam.write(new_read1)
            edit_bam.write(new_read2)
    fout_convert.close()
    edit_bam.close()
    # Remap the edited reads so their alignments reflect the new sequences.
    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner,
          is_single)
    if not is_single:
        # Paired data: restore read-group information lost during remapping.
        editRemap = pysam.AlignmentFile(edit_remap_bam_file, 'rb')
        editRemapBam_addRG_File = os.path.join(out_dir, "edit.remap.addRG.bam")
        bamAddRG(editRemap, edit_bam_reads, bam, editRemapBam_addRG_File)
        editRemap.close()
    else:
        editRemapBam_addRG_File = edit_remap_bam_file
    bamIndex(editRemapBam_addRG_File)
    return editRemapBam_addRG_File