def reads_replace(bam_file, total_chosen_reads, seqer, flow_order, lib_key,
                  barcode, tag, out_dir, aligner, aligner_index, is_single):
    """Swap edited sequences into the chosen reads of a BAM and remap them.

    Reads whose name appears in ``total_chosen_reads`` are collected per
    (read name, strand).  When the strand is also present for that name, the
    read's sequence and qualities are overwritten with the edited ones.  The
    collected reads are written to ``<out_dir>/edit.bam``; every read whose
    name was not collected goes to ``<out_dir>/exclude.bam``.  ``edit.bam`` is
    then remapped, sorted and indexed.

    Args:
        bam_file: Path of the input BAM.
        total_chosen_reads: Mapping ``read name -> {strand: edited read}``;
            each edited read must expose ``query_sequence`` and
            ``query_qualities``.
        seqer: When equal to ``"life"``, every edited read is passed through
            ``deal_life_reads`` together with ``flow_order``, ``lib_key`` and
            ``barcode``.
        flow_order, lib_key, barcode: Forwarded to ``deal_life_reads``.
        tag: When truthy, ``add_tag`` is applied to every edited read and
            ``bam_add_tag`` to the sorted remapped BAM.
        out_dir: Directory receiving all intermediate and output files.
        aligner, aligner_index, is_single: Forwarded to ``remap``.

    Returns:
        Tuple ``(remapped_sorted_bam_path, exclude_bam_path)``.
    """
    import subprocess

    edit_bam_file = os.path.join(out_dir, "edit.bam")
    exclude_bam_file = os.path.join(out_dir, "exclude.bam")

    bam = pysam.AlignmentFile(bam_file)
    try:
        # Collect one read per (name, strand); a later read with the same key
        # replaces the earlier one.
        # NOTE(review): a replaced read ends up in neither output BAM -
        # confirm getReadStrand() makes such collisions impossible.
        edit_bam_reads = {}
        for read in bam.fetch():
            read_name = read.query_name
            if read_name not in total_chosen_reads:
                continue
            strand = getReadStrand(read)
            chosen_by_strand = total_chosen_reads[read_name]
            if strand in chosen_by_strand:
                my_read = chosen_by_strand[strand]
                read.query_sequence = my_read.query_sequence
                read.query_qualities = my_read.query_qualities
                if seqer == "life":
                    read = deal_life_reads(read, flow_order, lib_key, barcode)
                if tag:
                    read = add_tag(read)
            # Reads of a chosen name on a strand that was not edited are kept
            # unchanged so they travel with the edited ones.
            edit_bam_reads.setdefault(read_name, {})[strand] = read

        # write edited reads into edit.bam
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        try:
            for read_info in edit_bam_reads.values():
                for read in read_info.values():
                    edit_bam.write(read)
        finally:
            edit_bam.close()

        # write not edited reads into exclude.bam
        exclude_bam = pysam.AlignmentFile(exclude_bam_file, 'wb', template=bam)
        try:
            for read in bam.fetch():
                if read.query_name not in edit_bam_reads:
                    exclude_bam.write(read)
        finally:
            exclude_bam.close()
    finally:
        bam.close()

    # Extract the @RG header lines.  samtools is invoked with an argument
    # list (no shell), so paths containing spaces or shell metacharacters
    # are passed through verbatim.
    header = os.path.join(out_dir, 'bam.header')
    try:
        proc = subprocess.Popen(["samtools", "view", "-H", bam_file],
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        header_text = proc.communicate()[0]
    except OSError:
        # Best effort, as before: without samtools there is simply no
        # read-group line to forward.
        header_text = ""
    rg_lines = [line for line in header_text.split("\n")
                if line.startswith("@RG")]
    with open(header, 'w') as header_out:
        header_out.writelines(line + "\n" for line in rg_lines)
    # NOTE(review): merge_edit_bam escapes tabs in this line before calling
    # remap, this function does not - confirm which form remap expects.
    head = rg_lines[0].rstrip() if rg_lines else None

    # remap the edited reads
    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner,
          is_single, header=head)

    edit_remap_bam_sorted_prefix = os.path.join(out_dir, "edit.remap.sort")
    edit_remap_bam_sorted_file = os.path.join(out_dir, "edit.remap.sort.bam")
    bamSort(edit_remap_bam_file, edit_remap_bam_sorted_prefix)
    bamIndex(edit_remap_bam_sorted_file)

    if tag:
        # NOTE(review): this output path is identical to the sorted input
        # path, so bam_add_tag reads and writes the same file - confirm that
        # this is intended and safe.
        edit_remap_addtag_file = os.path.join(out_dir, "edit.remap.sort.bam")
        bam_add_tag(edit_remap_bam_sorted_file, edit_remap_addtag_file)
    else:
        edit_remap_addtag_file = edit_remap_bam_sorted_file
    return edit_remap_addtag_file, exclude_bam_file
def deal_haplotype(bam_file, haplotype, reffasta, haplotype_prefix, mindepth,
                   minmutreads, minmapq, diffcover, is_single,
                   is_multmapfilter, aligner, aligner_index, **kwargs):
    """Pick, edit and validate the reads that will carry one haplotype.

    Steps, in order:
      1. Fetch the reads overlapping the haplotype and count them (``depth``).
      2. Abort when ``depth`` or ``int(depth * haplotype.freq)`` falls below
         ``mindepth`` / ``minmutreads``.
      3. Let ``pick_reads`` choose the reads to mutate, edit each of them with
         ``editRead`` and write them to
         ``<haplotype_prefix>.chosen.edited.bam``.
      4. Remap that BAM to ``<haplotype_prefix>.chosen.remap.bam`` and let
         ``judge_coverdiff`` accept or reject the result.

    Args:
        bam_file: Path of the input BAM.
        haplotype: Object exposing ``chrom``, ``start``, ``end``, ``freq`` and
            ``mutList``.
        reffasta: Forwarded to ``editRead``.
        haplotype_prefix: Path prefix of the two temporary BAM files.
        mindepth: Minimum number of fetched reads (converted with ``int``).
        minmutreads: Minimum number of reads to mutate (converted with
            ``int``).
        minmapq, is_multmapfilter: Forwarded to ``pick_reads``.
        diffcover: Converted with ``float`` and forwarded to
            ``judge_coverdiff``.
        is_single: Truthy for single-end data; also forwarded to
            ``pick_reads`` and ``remap``.
        aligner, aligner_index: Forwarded to ``remap``.
        **kwargs: Accepted and ignored.

    Returns:
        ``(my_chosen_reads, my_mate_reads, real_mut_reads_num, depth)`` on
        success, otherwise ``(False, message)`` describing why the haplotype
        was rejected.
    """
    bam = pysam.AlignmentFile(bam_file, 'rb')
    # NOTE(review): `bam` is intentionally not closed here.  The returned
    # Read wrappers are built from its segments and it is not visible from
    # this function whether they stay usable once the file is closed.

    reads_dict = OrderedDict()
    depth = 0
    for read in bam.fetch(reference=haplotype.chrom, start=haplotype.start,
                          end=haplotype.end + 1):
        # Every fetched read counts towards depth, including the ones that
        # are filtered out as candidates just below.
        depth += 1
        # 2048 (0x800) is the SAM "supplementary alignment" flag bit.
        if (read.reference_start is None or read.is_secondary
                or read.flag & 2048):
            continue
        reads_dict.setdefault(read.query_name, {})[getReadStrand(read)] = read

    where = "haplotype in position %s:%s-%s" % (
        haplotype.chrom, haplotype.start, haplotype.end)

    # judge depth and mut reads whether qualified
    if depth < int(mindepth):
        print("depth less than min depth!")
        return False, "%s: depth less than min depth(%s)" % (where, mindepth)

    mut_reads_num = int(depth * haplotype.freq)
    if mut_reads_num < int(minmutreads):
        print("mutation reads num less than minmutreads!")
        return False, "%s: mut reads less than min mut reads(%s)" % (
            where, minmutreads)

    print("start pick reads")
    res = pick_reads(bam, reads_dict, mut_reads_num, is_single, minmapq,
                     is_multmapfilter)
    if res[0] is False:
        return False, "%s: %s" % (where, res[1])
    chosen_reads, mate_reads = res
    print("end pick reads")

    # edit
    my_chosen_reads = {}
    my_mate_reads = {}
    chosen_reads_num = 0
    real_mut_reads_num = 0
    tmp_bam_file = haplotype_prefix + ".chosen.edited.bam"
    tmp_bam = pysam.AlignmentFile(tmp_bam_file, 'wb', template=bam)
    try:
        for read_name, read_info in chosen_reads.items():
            # An entry is created for every chosen name, even when none of
            # its reads ends up being edited.
            my_chosen_reads[read_name] = {}

            edited = {}    # strand -> Read wrapper handed to editRead
            segments = {}  # strand -> segment carrying the new sequence
            for strand, read in read_info.items():
                my_read = Read(read)
                res = editRead(my_read, reffasta, haplotype.mutList)
                if res is False:
                    continue
                real_mut_reads_num += 1
                sequence, quality, _shift = res
                read.query_sequence = sequence
                read.query_qualities = quality
                edited[strand] = my_read
                segments[strand] = read

            if not is_single:
                # A pair is kept only when both ends were edited, or one end
                # was edited and pick_reads supplied its mate.
                usable = len(edited) == 2 or (
                    len(edited) == 1 and read_name in mate_reads)
                if not usable:
                    continue

            for strand in edited:
                my_chosen_reads[read_name][strand] = edited[strand]
                tmp_bam.write(segments[strand])
                chosen_reads_num += 1

            if not is_single and len(edited) == 1:
                # The unedited mate is written too, so the pair is remapped
                # together.
                mate_read = mate_reads[read_name]
                my_mate_reads[read_name] = Read(mate_read)
                tmp_bam.write(mate_read)
    finally:
        tmp_bam.close()

    # alignment and judge coverdiff whether qualified
    chosen_bam_file = haplotype_prefix + ".chosen.remap.bam"
    remap(aligner_index, tmp_bam_file, chosen_bam_file, aligner, is_single)
    chosen_bam = pysam.AlignmentFile(chosen_bam_file)
    try:
        cover_ok = judge_coverdiff(bam, depth, chosen_bam, chosen_reads_num,
                                   haplotype, float(diffcover))
    finally:
        chosen_bam.close()

    if cover_ok:
        return my_chosen_reads, my_mate_reads, real_mut_reads_num, depth
    return False, "%s: coverdiff is less than minDiffCover" % where
def _merge_write_renamed_reads(list_file, names_per_line, used_reads,
                               seq_by_name, qual_by_name, edit_bam,
                               fout_convert, post_process=None,
                               echo_first_name=False):
    """Copy, re-sequence and rename the reads listed in ``list_file``.

    Each non-blank line is comma separated; its first ``names_per_line``
    fields are keys into ``used_reads``.  Lines naming a key that is missing
    from ``used_reads`` are skipped.  For every remaining line the reads are
    deep-copied, given the sequence/qualities stored under their key, renamed
    to one fresh name from ``get_new_readname``, optionally passed through
    ``post_process``, logged to ``fout_convert`` and written to ``edit_bam``.
    """
    with open(list_file) as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no record; indexing into them would
                # raise IndexError when two names are expected.
                continue
            fields = stripped.split(",")
            names = [fields[idx] for idx in range(names_per_line)]
            if any(name not in used_reads for name in names):
                continue

            originals = [used_reads[name] for name in names]
            renamed = [copy.deepcopy(orig_read) for orig_read in originals]
            if echo_first_name:
                print(names[0])
            for name, new_read in zip(names, renamed):
                new_read.query_sequence = seq_by_name[name]
                new_read.query_qualities = qual_by_name[name]

            # All reads listed on one line share a single new name.
            new_name = get_new_readname()
            for new_read in renamed:
                new_read.query_name = new_name
            if post_process is not None:
                renamed = [post_process(new_read) for new_read in renamed]

            for orig_read, new_read in zip(originals, renamed):
                fout_convert.write("%s: %s, %s, %s-%s\n" % (
                    new_name, orig_read.query_name, new_read.is_read1,
                    new_read.reference_start, new_read.reference_end))
            for new_read in renamed:
                edit_bam.write(new_read)


def merge_edit_bam(bam_file, out_dir, is_single, total_modify_reads_file,
                   total_add_reads_file, used_bam_file, total_modify_list,
                   total_add_list, seqer, aligner, aligner_index, flow_order,
                   lib_key, barcode, tag):
    """Build ``edit.bam`` from the modified/added read lists and remap it.

    Reads from ``used_bam_file`` are indexed by ``getKeyName``.  Every record
    of ``total_modify_list`` and then ``total_add_list`` is turned into
    renamed copies of those reads carrying the sequences returned by
    ``get_sequence_dict``.  The copies go to ``<out_dir>/edit.bam`` and the
    old/new name mapping to ``<out_dir>/readname_convert.txt``.  ``edit.bam``
    is finally remapped and sorted.

    Args:
        bam_file: Path of the original BAM; supplies the header template and
            the ``@RG`` line forwarded to ``remap``.
        out_dir: Directory receiving all output files.
        is_single: Truthy when each list line names one read, otherwise each
            line names two.
        total_modify_reads_file, total_add_reads_file: Inputs of
            ``get_sequence_dict`` for modified / added reads.
        used_bam_file: BAM holding the reads referenced by the two lists.
        total_modify_list, total_add_list: Comma-separated text files of read
            keys.
        seqer: When ``"life"`` and ``is_single`` is truthy, modified reads are
            passed through ``deal_life_reads``.
        aligner, aligner_index: Forwarded to ``remap``.
        flow_order, lib_key, barcode: Forwarded to ``deal_life_reads``.
        tag: Not used by this function.

    Returns:
        Path of the sorted remapped BAM (``<out_dir>/edit.remap.sort.bam``).
    """
    import subprocess

    edit_bam_file = os.path.join(out_dir, "edit.bam")
    readname_convert_file = os.path.join(out_dir, "readname_convert.txt")

    bam = pysam.AlignmentFile(bam_file, 'rb')
    edit_bam = None
    fout_convert = None
    try:
        edit_bam = pysam.AlignmentFile(edit_bam_file, 'wb', template=bam)
        fout_convert = open(readname_convert_file, 'w')

        used_reads = {}
        used_bam = pysam.AlignmentFile(used_bam_file, 'rb')
        try:
            for read in used_bam.fetch():
                used_reads[getKeyName(read)] = read
        finally:
            used_bam.close()

        modify_reads_seq, modify_reads_quan = get_sequence_dict(
            total_modify_reads_file)
        add_reads_seq, add_reads_quan = get_sequence_dict(
            total_add_reads_file)

        names_per_line = 1 if is_single else 2

        # Only single-end *modified* reads get the "life" post-processing;
        # added reads and paired-end reads never did.
        # NOTE(review): confirm this asymmetry is intended.
        life_fix = None
        if is_single and seqer == "life":
            def life_fix(new_read):
                return deal_life_reads(new_read, flow_order, lib_key, barcode)

        _merge_write_renamed_reads(
            total_modify_list, names_per_line, used_reads, modify_reads_seq,
            modify_reads_quan, edit_bam, fout_convert, post_process=life_fix,
            # The paired-end modify pass prints the first key of each record.
            echo_first_name=not is_single)
        _merge_write_renamed_reads(
            total_add_list, names_per_line, used_reads, add_reads_seq,
            add_reads_quan, edit_bam, fout_convert)
    finally:
        # Close explicitly so the conversion table and edit.bam are flushed
        # to disk before remap reads edit.bam.
        if fout_convert is not None:
            fout_convert.close()
        if edit_bam is not None:
            edit_bam.close()
        bam.close()

    # Extract the @RG header lines.  samtools is invoked with an argument
    # list (no shell), so paths containing spaces or shell metacharacters
    # are passed through verbatim.
    header = os.path.join(out_dir, 'bam.header')
    try:
        proc = subprocess.Popen(["samtools", "view", "-H", bam_file],
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        header_text = proc.communicate()[0]
    except OSError:
        # Best effort, as before: without samtools there is simply no
        # read-group line to forward.
        header_text = ""
    rg_lines = [line for line in header_text.split("\n")
                if line.startswith("@RG")]
    with open(header, 'w') as header_out:
        header_out.writelines(line + "\n" for line in rg_lines)
    # Tabs are handed to remap as the two-character sequence backslash-t.
    head = rg_lines[0].rstrip().replace('\t', '\\t') if rg_lines else None

    edit_remap_bam_file = os.path.join(out_dir, "edit.remap.bam")
    remap(aligner_index, edit_bam_file, edit_remap_bam_file, aligner,
          is_single, head)

    edit_remap_bam_sorted_prefix = os.path.join(out_dir, "edit.remap.sort")
    edit_remap_bam_sorted_file = os.path.join(out_dir, "edit.remap.sort.bam")
    bamSort(edit_remap_bam_file, edit_remap_bam_sorted_prefix)
    return edit_remap_bam_sorted_file