def Loc_paired(read, loc_dict):
    """
    Record the fragment location of one paired-end read for duplicate counting.

    The read is parsed with RI(read).extract_information(). Per the original
    notes, a paired unique mapping read yields
    [flag, strand, chrom, pos1, CIGAR, pos2, insert, seq, score]; an empty
    list means the read is skipped.

    Only the mate whose strand is neither '+-' nor '-+' is counted, so each
    pair contributes a single record. The site key is
    "<pos>_<pos + insert - 1>".

    :param read: one alignment record, passed unchanged to RI
    :param loc_dict: {chrom: {site: coverage}}; updated in place
    :return: the same loc_dict object
    """
    read_info = RI(read).extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, insert = (read_info[1], read_info[2], read_info[3],
                                  read_info[6])

    # only use one mate with plus insert to record
    if strand == '+-' or strand == '-+':
        return loc_dict

    # pos is kept as the string delivered by RI; the end coordinate is derived
    # from its integer value and the insert size.
    site = pos + '_' + str(int(pos) + int(insert) - 1)

    # dict.has_key() no longer exists in Python 3; setdefault/get behave the
    # same on Python 2 and 3.
    chrom_sites = loc_dict.setdefault(chrom, {})
    chrom_sites[site] = chrom_sites.get(site, 0) + 1
    return loc_dict
def Loc_paired(read, loc_dict):
    """
    Record the fragment location of one paired-end read for duplicate counting.

    The read is parsed with RI(read).extract_information(). Per the original
    notes, a paired unique mapping read yields
    [flag, strand, chrom, pos1, CIGAR, pos2, insert, seq, score]; an empty
    list means the read is skipped.

    Only the mate whose strand is neither '+-' nor '-+' is counted, so each
    pair contributes a single record. The site key is
    "<pos>_<pos + insert - 1>".

    :param read: one alignment record, passed unchanged to RI
    :param loc_dict: {chrom: {site: coverage}}; updated in place
    :return: the same loc_dict object
    """
    read_info = RI(read).extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, insert = (read_info[1], read_info[2], read_info[3],
                                  read_info[6])

    # only use one mate with plus insert to record
    if strand == '+-' or strand == '-+':
        return loc_dict

    # pos is kept as the string delivered by RI; the end coordinate is derived
    # from its integer value and the insert size.
    site = pos + '_' + str(int(pos) + int(insert) - 1)

    # dict.has_key() no longer exists in Python 3; setdefault/get behave the
    # same on Python 2 and 3.
    chrom_sites = loc_dict.setdefault(chrom, {})
    chrom_sites[site] = chrom_sites.get(site, 0) + 1
    return loc_dict
def Loc_single(read, loc_dict):
    """
    Record the location of one single-end read for duplicate counting.

    The read is parsed with RI(read).extract_information(). Per the original
    notes, a single unique mapping read yields
    [flag, strand, chrom, pos, CIGAR, seq, score]; an empty list means the
    read is skipped.

    The site key is "<strand>_<pos>". For '-+' reads the position is shifted
    to pos + len(seq) - 1 before the key is built.

    :param read: one alignment record, passed unchanged to RI
    :param loc_dict: {chrom: {site: coverage}}; updated in place
    :return: the same loc_dict object
    """
    read_info = RI(read).extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, seq = (read_info[1], read_info[2], read_info[3],
                               read_info[5])

    # The minus strand should be shifted len(seq) bp
    if strand == '-+':
        pos = int(pos) + len(seq) - 1
    site = strand + '_' + str(pos)

    # dict.has_key() no longer exists in Python 3; setdefault/get behave the
    # same on Python 2 and 3.
    chrom_sites = loc_dict.setdefault(chrom, {})
    chrom_sites[site] = chrom_sites.get(site, 0) + 1
    return loc_dict
def Loc_single(read, loc_dict):
    """
    Record the location of one single-end read for duplicate counting.

    The read is parsed with RI(read).extract_information(). Per the original
    notes, a single unique mapping read yields
    [flag, strand, chrom, pos, CIGAR, seq, score]; an empty list means the
    read is skipped.

    The site key is "<strand>_<pos>". For '-+' reads the position is shifted
    to pos + len(seq) - 1 before the key is built.

    :param read: one alignment record, passed unchanged to RI
    :param loc_dict: {chrom: {site: coverage}}; updated in place
    :return: the same loc_dict object
    """
    read_info = RI(read).extract_information()
    if len(read_info) == 0:
        return loc_dict

    strand, chrom, pos, seq = (read_info[1], read_info[2], read_info[3],
                               read_info[5])

    # The minus strand should be shifted len(seq) bp
    if strand == '-+':
        pos = int(pos) + len(seq) - 1
    site = strand + '_' + str(pos)

    # dict.has_key() no longer exists in Python 3; setdefault/get behave the
    # same on Python 2 and 3.
    chrom_sites = loc_dict.setdefault(chrom, {})
    chrom_sites[site] = chrom_sites.get(site, 0) + 1
    return loc_dict
def get_read_info(self):
    """
    Parse self.read with the read class and return its information list.

    Per the original notes, the list is empty ([]) unless the read is unique
    mapping (or unique and paired mapping):
    single unique mapping read -> [flag, strand, chr, pos, CIGAR, seq, score]
    paired unique mapping read -> [flag, strand, chr, pos1, CIGAR, pos2, insert, seq, score]
    """
    return RI(self.read).extract_information()
def get_read_info(self):
    """
    Parse self.read with the read class and return its information list.

    The read class is built from self.read and self.bsmb. Per the original
    notes, the list is empty ([]) unless the read is unique mapping (or unique
    and paired mapping):
    single unique mapping read -> [flag, strand, chr, pos, CIGAR, seq, score]
    paired unique mapping read -> [flag, strand, chr, pos1, CIGAR, pos2, insert, seq, score]
    """
    return RI(self.read, self.bsmb).extract_information()
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite
    conversion failures.

    Steps, in order:
      1. Parse and validate the command-line options.
      2. Read the flag of the first record of the first SAM file to decide
         between single-end and paired-end processing.
      3. If filter_dup is set, collect read locations from every SAM file and
         obtain the coverage cut-off from DR.duplicate_report.
      4. Pass every read of every SAM file through NF.nonuniform_filter,
         writing to "<sam minus last 4 chars>_<name>_filter.sam".
      5. Write "<name>_BSeQC_nonuniform_filter_report.txt".

    :param args: object whose parse_args() returns the options namespace
    """

    def _percent(part, whole):
        # Percentage of `part` in `whole`; 0.0 instead of ZeroDivisionError
        # when the input contained no reads / no mapped base pairs.
        if not whole:
            return 0.0
        return float(part) / whole * 100

    options = args.parse_args()

    # Every failed check stops the run. The original only exited after the
    # SAM file-existence check and otherwise carried on with unusable options.
    # NOTE(review): assumes error() only logs and does not exit on its own.
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
        sys.exit(1)
    options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
        sys.exit(1)
    if not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)
        sys.exit(1)

    # make sure the samtools path ends with a separator
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")
        sys.exit(1)

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    # check the input mapping files: a 'p' in the flag column of the first
    # record marks paired-end data
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    max_cov = None  # only consulted when loc_dict has been filled below
    if filter_dup:
        ## if filter_up is TRUE, the duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    # get reference information
    ref = GR.get_ref(ref_file)

    trim_position = []
    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ## filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        # `with` closes the output even if a read fails to process
        with open(out_sam, 'w') as out:
            record_mate = {}
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

            for read in read_inf:
                # sam header lines are copied through untouched
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  ##record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsm).extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads (or not paired mapping)
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                ##record the mapping read basepair (2013-06-20)
                if single_on:
                    all_mapping_bp += len(read_info[5])
                else:
                    all_mapping_bp += len(read_info[7])

                (record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                 filter_duplicate_reads,
                 filter_remove_overlap_bp) = NF.nonuniform_filter(
                    read, out, read_info, ref, remove_overlap, duplicate,
                    single_on, record_mate, trim_position,
                    filter_nonuniform_trim_bp_CG, filter_duplicate_reads,
                    filter_remove_overlap_bp)
        del record_mate

    NR.nonuniform_generator(trim_position, name)

    # each entry is weighted by its index to obtain the total trimmed bases
    # NOTE(review): presumably trim_position[i] counts reads trimmed by i bp - confirm
    filter_nonuniform_trim_bp = sum(
        i * count for i, count in enumerate(trim_position))

    ## produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_nonuniform_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp,
                _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG,
                _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            # The original divided by (all_reads - not_mapping_reads * 100):
            # the "* 100" had slipped inside the parenthesis.
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp,
                _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG,
                _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp,
                _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')
def filter_sam(sam_inf, strand_t, read_l, single_on, name, s_path, auto, remove_overlap, loc_dict, max_cov,
               not_mapping):
    """
    Trim the mapping files with the biased positions of every length in every
    strand, which are saved in the variable strand_t, and write a report.

    For each input file the result goes to
    "<sam minus last 4 chars>_<name>_filter.sam"; the summary goes to
    "<name>_BSeQC_mbias_filter_report.txt".

    :param sam_inf: list of input mapping file paths
    :param strand_t: trimming positions, passed through to SF / PF
    :param read_l: per-file original read lengths, indexed like sam_inf;
                   read_l[0] == '' means no length was given. Paired-end
                   entries are two lengths joined by '-'.
    :param single_on: True for single-end data, False for paired-end data
    :param name: tag used in the output and report file names
    :param s_path: samtools path handed to check.check_mapping_file_header
    :param auto: when true, reads are trimmed through SF (single) / PF (paired)
    :param remove_overlap: paired-end only; also routes reads through PF
    :param loc_dict: location dictionary for duplicate removal; when empty no
                     duplicate test is made
    :param max_cov: coverage cut-off handed to DF
    :param not_mapping: when true, reads without information are kept in the output
    """

    def _percent(part, whole):
        # Percentage of `part` in `whole`; 0.0 instead of ZeroDivisionError
        # when the input contained no reads / no mapped base pairs.
        if not whole:
            return 0.0
        return float(part) / whole * 100

    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    # enumerate supplies the file's position directly; the original looked it
    # up with sam_inf.index(sam) for every read, which is slower and returns
    # the first match when a file name is listed twice.
    for sam_index, sam in enumerate(sam_inf):
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        # `with` closes the output even if a read fails to process
        with open(out_sam, 'w') as out:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

            # scan every read to qc_filter
            for read in read_inf:
                # sam header lines are copied through untouched
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  ##record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read).extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                if single_on:
                    all_mapping_bp += len(read_info[5])  ##record the mapping read basepair (2013-06-20)
                    if auto:
                        if read_l[0] != '':
                            original_length = int(read_l[sam_index])
                        else:
                            original_length = ''
                        filter_mbias_trim_bp, filter_duplicate_reads = SF(
                            read, strand_t, out, read_info, original_length,
                            duplicate, filter_mbias_trim_bp,
                            filter_duplicate_reads)
                    else:
                        # NOTE(review): with an empty loc_dict this branch
                        # counts every read as a duplicate and writes nothing;
                        # kept as in the original - confirm it is intended.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  ##record the duplicate read (2013-06-20)
                else:
                    all_mapping_bp += len(read_info[7])  ##record the mapping read basepair (2013-06-20)
                    if auto or remove_overlap:
                        if read_l[0] != '':
                            original_length = [int(i) for i in read_l[sam_index].split('-')]
                        else:
                            original_length = ''
                        (filter_mbias_trim_bp, filter_duplicate_reads,
                         filter_remove_overlap_bp) = PF(
                            read, strand_t, out, read_info, original_length,
                            auto, remove_overlap, duplicate,
                            filter_mbias_trim_bp, filter_duplicate_reads,
                            filter_remove_overlap_bp)
                    else:
                        # NOTE(review): same condition as the single-end
                        # branch above - confirm it is intended.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  ##record the duplicate read (2013-06-20)

    ## produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_mbias_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
                filter_mbias_trim_bp,
                _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_mbias_trim_bp,
                _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp,
                _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')
def record_site(read, ref, bsm, dige_site):
    '''
    Record the location of the restriction enzyme digestion site in the read,
    and the methylation state of that site.

    :param read: one alignment record; parsed with RI(read, bsm)
    :param ref: reference sequences, indexed as ref[chromosome][start:end]
    :param bsm: passed through to RI
    :param dige_site: not used by this function
    :return: (read_info, site_meth_list). site_meth_list is [] when the read
             has no information, when the chromosome name contains '_', or
             when no site is found in the reference window. Otherwise it is
             [5' meth, 5' all, 3' meth, 3' all, 5' index, 3' index], with the
             3' entries set to 0, 0 and len(seq) when the 3' site is not
             recorded.
    '''
    # Using the read class
    # If the read is unique mapping or unique and paired mapping, we will get a information list.
    # If not, we will get a empty list ([]).
    # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
    # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read, bsm)
    read_info = read_s.extract_information()
    strand = ''
    site_meth_list = []
    reverse_strand = ['-+', '+-']
    if len(read_info) == 0:
        return read_info, site_meth_list
    # pos is the reported position minus one; seq is the second-to-last field
    # in both the single-end and the paired-end layout
    strand, chr, pos, seq = read_info[1], read_info[2], int(read_info[3]) - 1, read_info[-2]
    if len(chr.split('_')) > 1:  # filter some 'chr_' chromosome
        return read_info, site_meth_list
    readlen = len(seq)
    # reference window starting one base before pos and ending one base after
    # the read
    # NOTE(review): for pos == 0 the start index becomes -1 - confirm such
    # reads cannot occur
    refseq = ref[chr][pos - 1: pos + readlen + 1]
    if strand not in reverse_strand:
        # get the location of restriction enzyme digestion site:
        # first and last occurrence of CCGG in the window
        dige_5_index = refseq.find('CCGG')
        dige_3_index = refseq.rfind('CCGG')
    else:
        # if the strand is '-+' (single-end) or '+-', reverse the seq and refseq
        # (plain reversal, so the site is searched for as 'GGCC')
        seq = seq[::-1]
        refseq = refseq[::-1]
        dige_5_index = refseq.find('GGCC')
        dige_3_index = refseq.rfind('GGCC')
    if dige_3_index == -1 or dige_5_index == -1:
        warning("Can't find MspI site in the read: %s" %read.rstrip())
        return read_info, site_meth_list

    # get the methylation state of the restriction enzyme digestion site
    if strand[1] == '-':
        # strand: +- or --, check the third nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq, dige_5_index + 1)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq, dige_3_index + 1)
    else:
        # strand: ++ or -+, check the second nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq, dige_5_index)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq, dige_3_index)

    # default: only the 5' site is recorded; the last entry is the read length
    site_meth_list = [dige_5_meth, dige_5_all, 0, 0, dige_5_index, len(seq)]
    # a distinct 3' site exists only if rfind() found a different, non-zero index
    if dige_3_index != 0 and dige_3_index != dige_5_index:
        # NOTE(review): the indentation of this block was reconstructed from a
        # flattened listing; the trailing else is assumed to pair with
        # `if strand[1] == '+'` - confirm against the original file.
        if strand[1] == '+':
            # check the 3' digestion site for '++' and '-+': the four bases
            # around the site must lie inside the read, and the base at
            # dige_3_index + 2 must be 'A' ('++') or 'T' ('-+')
            if len(seq[(dige_3_index - 1):(dige_3_index + 3)]) == 4:
                if (strand == '++' and seq[dige_3_index + 2] == 'A') or (strand == '-+' and seq[dige_3_index + 2] == 'T'):
                    site_meth_list = [dige_5_meth, dige_5_all, dige_3_meth, dige_3_all, dige_5_index, dige_3_index]
        else:
            # other strands: the 3' site is recorded without the extra check
            site_meth_list = [dige_5_meth, dige_5_all, dige_3_meth, dige_3_all, dige_5_index, dige_3_index]
    return read_info, site_meth_list
def record_site(read, ref, bsm, dige_site):
    '''
    Record the location of the restriction enzyme digestion site in the read,
    and the methylation state of that site.

    :param read: one alignment record; parsed with RI(read, bsm)
    :param ref: reference sequences, indexed as ref[chromosome][start:end]
    :param bsm: passed through to RI
    :param dige_site: not used by this function
    :return: (read_info, site_meth_list). site_meth_list is [] when the read
             has no information, when the chromosome name contains '_', or
             when no site is found in the reference window. Otherwise it is
             [5' meth, 5' all, 3' meth, 3' all, 5' index, 3' index], with the
             3' entries set to 0, 0 and len(seq) when the 3' site is not
             recorded.
    '''
    # Using the read class
    # If the read is unique mapping or unique and paired mapping, we will get a information list.
    # If not, we will get a empty list ([]).
    # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
    # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
    read_s = RI(read, bsm)
    read_info = read_s.extract_information()
    strand = ''
    site_meth_list = []
    reverse_strand = ['-+', '+-']
    if len(read_info) == 0:
        return read_info, site_meth_list
    # pos is the reported position minus one; seq is the second-to-last field
    # in both the single-end and the paired-end layout
    strand, chr, pos, seq = read_info[1], read_info[2], int(
        read_info[3]) - 1, read_info[-2]
    if len(chr.split('_')) > 1:  # filter some 'chr_' chromosome
        return read_info, site_meth_list
    readlen = len(seq)
    # reference window starting one base before pos and ending one base after
    # the read
    # NOTE(review): for pos == 0 the start index becomes -1 - confirm such
    # reads cannot occur
    refseq = ref[chr][pos - 1:pos + readlen + 1]
    if strand not in reverse_strand:
        # get the location of restriction enzyme digestion site:
        # first and last occurrence of CCGG in the window
        dige_5_index = refseq.find('CCGG')
        dige_3_index = refseq.rfind('CCGG')
    else:
        # if the strand is '-+' (single-end) or '+-', reverse the seq and refseq
        # (plain reversal, so the site is searched for as 'GGCC')
        seq = seq[::-1]
        refseq = refseq[::-1]
        dige_5_index = refseq.find('GGCC')
        dige_3_index = refseq.rfind('GGCC')
    if dige_3_index == -1 or dige_5_index == -1:
        warning("Can't find MspI site in the read: %s" % read.rstrip())
        return read_info, site_meth_list

    # get the methylation state of the restriction enzyme digestion site
    if strand[1] == '-':
        # strand: +- or --, check the third nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq,
                                                 dige_5_index + 1)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq,
                                                 dige_3_index + 1)
    else:
        # strand: ++ or -+, check the second nucleotide of C-CGG
        dige_5_meth, dige_5_all = get_meth_state(strand, seq, refseq,
                                                 dige_5_index)
        dige_3_meth, dige_3_all = get_meth_state(strand, seq, refseq,
                                                 dige_3_index)

    # default: only the 5' site is recorded; the last entry is the read length
    site_meth_list = [dige_5_meth, dige_5_all, 0, 0, dige_5_index, len(seq)]
    # a distinct 3' site exists only if rfind() found a different, non-zero index
    if dige_3_index != 0 and dige_3_index != dige_5_index:
        # NOTE(review): the indentation of this block was reconstructed from a
        # flattened listing; the trailing else is assumed to pair with
        # `if strand[1] == '+'` - confirm against the original file.
        if strand[1] == '+':
            # check the 3' digestion site for '++' and '-+': the four bases
            # around the site must lie inside the read, and the base at
            # dige_3_index + 2 must be 'A' ('++') or 'T' ('-+')
            if len(seq[(dige_3_index - 1):(dige_3_index + 3)]) == 4:
                if (strand == '++' and seq[dige_3_index + 2] == 'A') or (
                        strand == '-+' and seq[dige_3_index + 2] == 'T'):
                    site_meth_list = [
                        dige_5_meth, dige_5_all, dige_3_meth, dige_3_all,
                        dige_5_index, dige_3_index
                    ]
        else:
            # other strands: the 3' site is recorded without the extra check
            site_meth_list = [
                dige_5_meth, dige_5_all, dige_3_meth, dige_3_all,
                dige_5_index, dige_3_index
            ]
    return read_info, site_meth_list
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite
    conversion failures.

    Steps, in order:
      1. Parse and validate the command-line options.
      2. Read the flag of the first record of the first SAM file to decide
         between single-end and paired-end processing.
      3. If filter_dup is set, collect read locations from every SAM file and
         obtain the coverage cut-off from DR.duplicate_report.
      4. Pass every read of every SAM file through NF.nonuniform_filter,
         writing to "<sam minus last 4 chars>_<name>_filter.sam".
      5. Write "<name>_BSeQC_nonuniform_filter_report.txt".

    :param args: object whose parse_args() returns the options namespace
    """

    def _percent(part, whole):
        # Percentage of `part` in `whole`; 0.0 instead of ZeroDivisionError
        # when the input contained no reads / no mapped base pairs.
        if not whole:
            return 0.0
        return float(part) / whole * 100

    options = args.parse_args()

    # Every failed check stops the run. The original only exited after the
    # SAM file-existence check and otherwise carried on with unusable options.
    # NOTE(review): assumes error() only logs and does not exit on its own.
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
        sys.exit(1)
    options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
        sys.exit(1)
    if not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)
        sys.exit(1)

    # make sure the samtools path ends with a separator
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")
        sys.exit(1)

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    # check the input mapping files: a 'p' in the flag column of the first
    # record marks paired-end data
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    max_cov = None  # only consulted when loc_dict has been filled below
    if filter_dup:
        ## if filter_up is TRUE, the duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    # get reference information
    ref = GR.get_ref(ref_file)

    trim_position = []
    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ## filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        # `with` closes the output even if a read fails to process
        with open(out_sam, 'w') as out:
            record_mate = {}
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

            for read in read_inf:
                # sam header lines are copied through untouched
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  ##record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsm).extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads (or not paired mapping)
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                ##record the mapping read basepair (2013-06-20)
                if single_on:
                    all_mapping_bp += len(read_info[5])
                else:
                    all_mapping_bp += len(read_info[7])

                (record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                 filter_duplicate_reads,
                 filter_remove_overlap_bp) = NF.nonuniform_filter(
                    read, out, read_info, ref, remove_overlap, duplicate,
                    single_on, record_mate, trim_position,
                    filter_nonuniform_trim_bp_CG, filter_duplicate_reads,
                    filter_remove_overlap_bp)
        del record_mate

    NR.nonuniform_generator(trim_position, name)

    # each entry is weighted by its index to obtain the total trimmed bases
    # NOTE(review): presumably trim_position[i] counts reads trimmed by i bp - confirm
    filter_nonuniform_trim_bp = sum(
        i * count for i, count in enumerate(trim_position))

    ## produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_nonuniform_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp,
                _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG,
                _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads,
                _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            # The original divided by (all_reads - not_mapping_reads * 100):
            # the "* 100" had slipped inside the parenthesis.
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads,
                _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp,
                _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG,
                _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp,
                _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')