def generator(self):
    '''
    The main method in the class.

    1. Draw the Mbias plot and generate the Mbias table for every read
       length on every strand.
    2. Decide the trimming positions based on the Mbias data and generate
       the trimming file.

    If ``self.trim_file`` is non-empty, both steps are skipped and the
    result of ``self.user_defined_trimming()`` is returned directly.
    Otherwise the value returned by ``self.decide_final_trimming`` is
    returned after it has been passed to ``self.produce_final_trim_file``.
    '''
    user_trim_given = len(self.trim_file) != 0
    if user_trim_given:
        info("Used the trimming file from the user defined!! Ignore the step of automatically deciding trimming.")
        return self.user_defined_trimming()

    ref = GR.get_ref(self.ref_file)
    info("Calculate the M fraction for every position...")

    # Single-end input gets two strand keys, paired-end input gets four.
    if self.single_on:
        strand_keys = ('++', '-+')
    else:
        strand_keys = ('++', '-+', '+-', '--')
    strand_p = {}
    for key in strand_keys:
        strand_p[key] = {}

    # parser_sambam replaces the per-strand dict with an indexable result;
    # entry 0 is labelled "_CG" and entry 1 "_nonCG" below.
    strand_p = self.parser_sambam(strand_p, ref)

    # modify in 2013-06-04
    strand_t_raw = []
    context_names = [self.name + '_CG', self.name + '_nonCG']
    for idx in range(len(strand_p)):
        trim_bp = self.decide_trim_bp(strand_p[idx])
        strand_t_raw.append(trim_bp)
        MR.mbias_generator(strand_p[idx], trim_bp, context_names[idx])

    strand_t = self.decide_final_trimming(strand_t_raw)
    self.produce_final_trim_file(strand_t)
    return strand_t
def generator(self):
    '''
    The main method in the class.

    1. Draw the Mbias plot and generate the Mbias table for every read
       length on every strand.
    2. Decide the trimming positions based on the Mbias data and generate
       the trimming file.

    If ``self.trim_file`` is non-empty, both steps are skipped and the
    result of ``self.user_defined_trimming()`` is returned directly.
    Otherwise the value returned by ``self.decide_final_trimming`` is
    returned after it has been passed to ``self.produce_final_trim_file``.
    '''
    user_trim_given = len(self.trim_file) != 0
    if user_trim_given:
        info("Used the trimming file from the user defined!! Ignore the step of automatically deciding trimming.")
        return self.user_defined_trimming()

    ref = GR.get_ref(self.ref_file)
    info("Calculate the M fraction for every position...")

    # Single-end input gets two strand keys, paired-end input gets four.
    if self.single_on:
        strand_keys = ('++', '-+')
    else:
        strand_keys = ('++', '-+', '+-', '--')
    strand_p = {}
    for key in strand_keys:
        strand_p[key] = {}

    # parser_sambam replaces the per-strand dict with an indexable result;
    # entry 0 is labelled "_CG" and entry 1 "_nonCG" below.
    strand_p = self.parser_sambam(strand_p, ref)

    # modify in 2013-06-04
    strand_t_raw = []
    context_names = [self.name + '_CG', self.name + '_nonCG']
    for idx in range(len(strand_p)):
        trim_bp = self.decide_trim_bp(strand_p[idx])
        strand_t_raw.append(trim_bp)
        MR.mbias_generator(strand_p[idx], trim_bp, context_names[idx])

    strand_t = self.decide_final_trimming(strand_t_raw)
    self.produce_final_trim_file(strand_t)
    return strand_t
def run(args):
    """
    Run the MspI filtering step from the command-line options.

    ``args`` must expose ``parse_args()``; the returned options object is
    read for ``sam_file`` (comma-separated paths), ``ref_file``,
    ``samtools``, ``name``, ``bsm``, ``dige_site``, ``remove_overlap`` and
    ``not_mapping``.

    The function checks the options, decides whether the input is
    single-end or paired-end, loads the reference, calls
    ``parser_trim_sambam`` (scan MspI site and trim the end-repaired C),
    then produces the MspI Mbias plot and the filter report.

    Only an unreadable SAM file terminates the process (``sys.exit(1)``);
    the other failed checks are reported through ``error`` only.
    """
    options = args.parse_args()

    # ---- option checks ----
    if len(options.sam_file) != 0:
        options.sam_file = options.sam_file.split(',')
    else:
        error("Missing the SAM file, use -s or --sam option.")
    for sam_path in options.sam_file:
        if os.path.isfile(sam_path):
            continue
        error("Can't open the SAM file: " + sam_path)
        sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    elif not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)

    # Make sure a given samtools directory ends with a slash.
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    # ---- collect the parameters ----
    sam_files = options.sam_file
    reference_path = options.ref_file
    bsm = options.bsm
    samtools_dir = options.samtools
    name = options.name
    dige_site = options.dige_site
    remove_overlap = options.remove_overlap
    not_mapping = options.not_mapping
    info("Get the all parameter!!")

    # ---- single-end or paired-end? ----
    # Decided from the second tab-separated field of the first line read
    # from the first mapping file: a 'p' in it means paired-end.
    _sam_format, flag_stream = check.check_mapping_file_flag(sam_files[0], samtools_dir)
    first_flag = flag_stream.readline().split('\t')[1]
    single_on = 'p' not in first_flag
    if single_on:
        info("The input mapping files are single-end sequencing!")
    else:
        info("The input mapping files are paired-end sequencing!")

    # get reference information
    ref = GR.get_ref(reference_path)

    # scan MspI site and trim the end-repaired C
    (dige_dict,
     all_reads,
     all_mapping_bp,
     not_mapping_reads,
     filter_not_mapping_reads,
     filter_MspI_endrepair_bp,
     filter_remove_overlap_bp) = parser_trim_sambam(
        sam_files, ref, bsm, samtools_dir, dige_site, single_on,
        remove_overlap, not_mapping, name)

    # produce MspI Mbias plot
    RR.generator(dige_dict, single_on, name)

    # produce the filter report
    report(all_reads, all_mapping_bp, not_mapping_reads,
           filter_not_mapping_reads, filter_MspI_endrepair_bp,
           filter_remove_overlap_bp, single_on, name)
def run(args):
    """
    Run the MspI filtering step from the command-line options.

    ``args`` must expose ``parse_args()``; the returned options object is
    read for ``sam_file`` (comma-separated paths), ``ref_file``,
    ``samtools``, ``name``, ``bsm``, ``dige_site``, ``remove_overlap`` and
    ``not_mapping``.

    The function checks the options, decides whether the input is
    single-end or paired-end, loads the reference, calls
    ``parser_trim_sambam`` (scan MspI site and trim the end-repaired C),
    then produces the MspI Mbias plot and the filter report.

    Only an unreadable SAM file terminates the process (``sys.exit(1)``);
    the other failed checks are reported through ``error`` only.
    """
    options = args.parse_args()

    # ---- option checks ----
    if len(options.sam_file) != 0:
        options.sam_file = options.sam_file.split(',')
    else:
        error("Missing the SAM file, use -s or --sam option.")
    for sam_path in options.sam_file:
        if os.path.isfile(sam_path):
            continue
        error("Can't open the SAM file: " + sam_path)
        sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    elif not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)

    # Make sure a given samtools directory ends with a slash.
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    # ---- collect the parameters ----
    sam_files = options.sam_file
    reference_path = options.ref_file
    bsm = options.bsm
    samtools_dir = options.samtools
    name = options.name
    dige_site = options.dige_site
    remove_overlap = options.remove_overlap
    not_mapping = options.not_mapping
    info("Get the all parameter!!")

    # ---- single-end or paired-end? ----
    # Decided from the second tab-separated field of the first line read
    # from the first mapping file: a 'p' in it means paired-end.
    _sam_format, flag_stream = check.check_mapping_file_flag(sam_files[0], samtools_dir)
    first_flag = flag_stream.readline().split('\t')[1]
    single_on = 'p' not in first_flag
    if single_on:
        info("The input mapping files are single-end sequencing!")
    else:
        info("The input mapping files are paired-end sequencing!")

    # get reference information
    ref = GR.get_ref(reference_path)

    # scan MspI site and trim the end-repaired C
    (dige_dict,
     all_reads,
     all_mapping_bp,
     not_mapping_reads,
     filter_not_mapping_reads,
     filter_MspI_endrepair_bp,
     filter_remove_overlap_bp) = parser_trim_sambam(
        sam_files, ref, bsm, samtools_dir, dige_site, single_on,
        remove_overlap, not_mapping, name)

    # produce MspI Mbias plot
    RR.generator(dige_dict, single_on, name)

    # produce the filter report
    report(all_reads, all_mapping_bp, not_mapping_reads,
           filter_not_mapping_reads, filter_MspI_endrepair_bp,
           filter_remove_overlap_bp, single_on, name)
def generator(self):
    '''
    The main method in the class.

    1. Draw the Mbias plot and generate the Mbias table for every read
       length on every strand.
    2. Decide the trimming positions based on the Mbias data and generate
       the trimming file.
    3. Show the duplicate reads distribution.

    Steps 1 and 2 are skipped when ``self.trim_file`` is non-empty; the
    trimming then comes from ``self.user_defined_trimming()``.

    Returns the tuple ``(strand_t, loc_dict, max_cov)`` where ``max_cov``
    is the value returned by ``DR.duplicate_report``.
    '''
    ref = GR.get_ref(self.ref_file)

    # Single-end input gets two strand keys, paired-end input gets four.
    if self.single_on:
        strand_keys = ('++', '-+')
    else:
        strand_keys = ('++', '-+', '+-', '--')
    strand_p = {}
    for key in strand_keys:
        strand_p[key] = {}

    if len(self.trim_file) == 0:
        # Automatic mode: assess Mbias and decide the trimming here.
        # modify in 2013-06-04
        strand_t_raw = []
        context_names = [self.name + '_CG', self.name + '_nonCG']
        strand_p, loc_dict = self.parser_sambam(strand_p, ref)
        for idx in range(len(strand_p)):
            trim_bp = self.decide_trim_bp(strand_p[idx])
            strand_t_raw.append(trim_bp)
            MR.mbias_generator(strand_p[idx], trim_bp, context_names[idx])
        strand_t = self.decide_final_trimming(strand_t_raw)
        self.produce_final_trim_file(strand_t)
    else:
        info("Used the trimming file from the user defined!!")
        info("Ignore both Mbias assessment and trimming decision.")
        # NOTE(review): here the whole return value of parser_sambam is used
        # as loc_dict, while the automatic branch unpacks two values from
        # it -- confirm parser_sambam returns only loc_dict when a trimming
        # file is set.
        loc_dict = self.parser_sambam(strand_p, ref)
        strand_t = self.user_defined_trimming()

    max_cov = DR.duplicate_report(loc_dict, self.gsize, self.p_poisson, self.name)
    return strand_t, loc_dict, max_cov
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures

    ``args`` must expose ``parse_args()``; the returned options object is
    read for ``sam_file`` (comma-separated paths), ``ref_file``,
    ``samtools``, ``name``, ``bsm``, ``remove_overlap``, ``filter_dup``,
    ``p_poisson``, ``gsize`` and ``not_mapping``.

    Steps:
      1. Check the options.
      2. Decide single-end vs paired-end from the first mapping file.
      3. If ``filter_dup`` is set, collect the read locations of every input
         file and obtain ``max_cov`` from ``DR.duplicate_report``.
      4. Filter every input file into
         ``<sam path without its last 4 characters>_<name>_filter.sam``.
      5. Call ``NR.nonuniform_generator`` and write
         ``<name>_BSeQC_nonuniform_filter_report.txt``.

    Returns None.
    """

    def _percent(part, whole):
        # Percentage of `part` in `whole`. Returns 0.0 when `whole` is 0
        # (empty input, or no uniquely mapped read) so that the report can
        # still be written instead of failing with ZeroDivisionError.
        if whole == 0:
            return 0.0
        return float(part) / whole * 100

    options = args.parse_args()

    # NOTE(review): only an unreadable SAM file exits below; the other failed
    # checks are just reported through error() -- confirm whether error()
    # itself is expected to abort.
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    elif not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)

    # Make sure a given samtools directory ends with a slash.
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping
    info("Get the all parameter!!")

    # check the input mapping files
    # A 'p' in the second tab-separated field of the first line read from the
    # first mapping file marks the input as paired-end.
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    max_cov = None  # only set (and only used) when duplicate filtering is on
    if filter_dup:
        # if filter_dup is TRUE, the duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    # get reference information
    ref = GR.get_ref(ref_file)

    trim_position = []
    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    # filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        record_mate = {}
        # `with` guarantees the output file is closed even if a read fails to parse.
        with open(out_sam, 'w') as out:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)
            for read in read_inf:
                # SAM header lines are copied through unchanged.
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  # record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsm)
                read_info = read_info.extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads (or not paired mapping)
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  # record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                # record the mapping read basepair (2013-06-20)
                if single_on:
                    all_mapping_bp += len(read_info[5])
                else:
                    all_mapping_bp += len(read_info[7])

                (record_mate,
                 trim_position,
                 filter_nonuniform_trim_bp_CG,
                 filter_duplicate_reads,
                 filter_remove_overlap_bp) = NF.nonuniform_filter(
                    read, out, read_info, ref, remove_overlap, duplicate, single_on,
                    record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                    filter_duplicate_reads, filter_remove_overlap_bp)
        del record_mate

    NR.nonuniform_generator(trim_position, name)
    # Each entry of trim_position is weighted by its index.
    # presumably trim_position[i] counts reads trimmed by i bp -- TODO confirm
    filter_nonuniform_trim_bp += sum(i * count for i, count in enumerate(trim_position))

    # produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_nonuniform_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp, _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG, _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            # The denominator used to be (all_reads - not_mapping_reads * 100):
            # the "* 100" sat inside the parenthesis, so the value was not a
            # percentage of the uniquely mapped reads.
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp, _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG, _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp, _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')
def filter_sam(sam_inf, ref_file, bsmb, strand_t, read_l, single_on, name, s_path, auto, remove_overlap, loc_dict,
               max_cov, not_mapping):
    '''
    Trim the mapping files with the biased positions of every length in every strand,
    which are saved in the variable: strand_t.

    For every file in ``sam_inf`` a filtered copy is written to
    ``<sam path without its last 4 characters>_<name>_filter.sam``, and a
    summary is written to ``<name>_BSeQC_mbias_filter_report.txt``.

    Parameters (as used by this function):
      sam_inf        -- list of input mapping file paths
      ref_file       -- reference path handed to GR.get_ref
      bsmb           -- passed through to RI together with each read
      strand_t       -- trimming positions, passed through to SF / PF
      read_l         -- original read lengths, indexed like sam_inf;
                        read_l[0] == '' means "not given". Single-end entries
                        are converted with int(); paired-end entries are
                        '_'-separated integers.
      single_on      -- True for single-end input, False for paired-end
      name           -- tag used in the output file names
      s_path         -- passed through to check.check_mapping_file_header
      auto           -- apply the Mbias trimming (SF / PF)
      remove_overlap -- paired-end only: also routes reads through PF
      loc_dict       -- read-location table; non-empty enables the duplicate
                        filter DF
      max_cov        -- passed through to DF
      not_mapping    -- keep reads for which no read information is extracted

    Returns None.
    '''

    def _percent(part, whole):
        # Percentage of `part` in `whole`. Returns 0.0 when `whole` is 0
        # (empty input, or no uniquely mapped read) so that the report can
        # still be written instead of failing with ZeroDivisionError.
        if whole == 0:
            return 0.0
        return float(part) / whole * 100

    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_mbias_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ref = GR.get_ref(ref_file)

    # enumerate() gives the position of the current file directly; looking it
    # up with sam_inf.index(sam) for every read is a linear search each time
    # and returns the wrong entry if the same path is listed twice.
    for sam_index, sam in enumerate(sam_inf):
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        # `with` guarantees the output file is closed even if a read fails to parse.
        with open(out_sam, 'w') as out:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

            # scan every read to qc_filter
            for read in read_inf:
                # SAM header lines are copied through unchanged.
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  # record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsmb)
                read_info = read_info.extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  # record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                if single_on:
                    all_mapping_bp += len(read_info[5])  # record the mapping read basepair (2013-06-20)
                    if auto:
                        if read_l[0] != '':
                            original_length = int(read_l[sam_index])
                        else:
                            original_length = ''
                        filter_mbias_trim_bp, filter_duplicate_reads = SF(
                            read, strand_t, out, read_info, original_length, duplicate,
                            filter_mbias_trim_bp, filter_duplicate_reads)
                    else:
                        # NOTE(review): with an empty loc_dict this branch drops
                        # the read and counts it as a duplicate -- confirm the
                        # caller never gets here without duplicate filtering.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  # record the duplicate read (2013-06-20)
                else:
                    all_mapping_bp += len(read_info[7])  # record the mapping read basepair (2013-06-20)
                    if auto or remove_overlap:
                        if read_l[0] != '':
                            original_length = [int(i) for i in read_l[sam_index].split('_')]
                        else:
                            original_length = ''
                        (filter_mbias_trim_bp,
                         filter_mbias_trim_bp_CG,
                         filter_duplicate_reads,
                         filter_remove_overlap_bp) = PF(
                            read, ref, strand_t, out, read_info, original_length, auto,
                            remove_overlap, duplicate, filter_mbias_trim_bp,
                            filter_mbias_trim_bp_CG, filter_duplicate_reads,
                            filter_remove_overlap_bp)
                    else:
                        # NOTE(review): same caveat as the single-end branch above.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  # record the duplicate read (2013-06-20)

    # produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_mbias_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            # No CpG-specific Mbias line for single-end input: SF does not
            # return a CpG count.
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
                filter_mbias_trim_bp, _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_mbias_trim_bp, _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_mbias_trim_bp_CG, _percent(filter_mbias_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp, _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')
def filter_sam(sam_inf, ref_file, bsmb, strand_t, read_l, single_on, name, s_path, auto, remove_overlap, loc_dict,
               max_cov, not_mapping):
    '''
    Trim the mapping files with the biased positions of every length in every strand,
    which are saved in the variable: strand_t.

    For every file in ``sam_inf`` a filtered copy is written to
    ``<sam path without its last 4 characters>_<name>_filter.sam``, and a
    summary is written to ``<name>_BSeQC_mbias_filter_report.txt``.

    Parameters (as used by this function):
      sam_inf        -- list of input mapping file paths
      ref_file       -- reference path handed to GR.get_ref
      bsmb           -- passed through to RI together with each read
      strand_t       -- trimming positions, passed through to SF / PF
      read_l         -- original read lengths, indexed like sam_inf;
                        read_l[0] == '' means "not given". Single-end entries
                        are converted with int(); paired-end entries are
                        '_'-separated integers.
      single_on      -- True for single-end input, False for paired-end
      name           -- tag used in the output file names
      s_path         -- passed through to check.check_mapping_file_header
      auto           -- apply the Mbias trimming (SF / PF)
      remove_overlap -- paired-end only: also routes reads through PF
      loc_dict       -- read-location table; non-empty enables the duplicate
                        filter DF
      max_cov        -- passed through to DF
      not_mapping    -- keep reads for which no read information is extracted

    Returns None.
    '''

    def _percent(part, whole):
        # Percentage of `part` in `whole`. Returns 0.0 when `whole` is 0
        # (empty input, or no uniquely mapped read) so that the report can
        # still be written instead of failing with ZeroDivisionError.
        if whole == 0:
            return 0.0
        return float(part) / whole * 100

    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_mbias_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ref = GR.get_ref(ref_file)

    # enumerate() gives the position of the current file directly; looking it
    # up with sam_inf.index(sam) for every read is a linear search each time
    # and returns the wrong entry if the same path is listed twice.
    for sam_index, sam in enumerate(sam_inf):
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        # `with` guarantees the output file is closed even if a read fails to parse.
        with open(out_sam, 'w') as out:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

            # scan every read to qc_filter
            for read in read_inf:
                # SAM header lines are copied through unchanged.
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  # record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsmb)
                read_info = read_info.extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  # record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                if single_on:
                    all_mapping_bp += len(read_info[5])  # record the mapping read basepair (2013-06-20)
                    if auto:
                        if read_l[0] != '':
                            original_length = int(read_l[sam_index])
                        else:
                            original_length = ''
                        filter_mbias_trim_bp, filter_duplicate_reads = SF(
                            read, strand_t, out, read_info, original_length, duplicate,
                            filter_mbias_trim_bp, filter_duplicate_reads)
                    else:
                        # NOTE(review): with an empty loc_dict this branch drops
                        # the read and counts it as a duplicate -- confirm the
                        # caller never gets here without duplicate filtering.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  # record the duplicate read (2013-06-20)
                else:
                    all_mapping_bp += len(read_info[7])  # record the mapping read basepair (2013-06-20)
                    if auto or remove_overlap:
                        if read_l[0] != '':
                            original_length = [int(i) for i in read_l[sam_index].split('_')]
                        else:
                            original_length = ''
                        (filter_mbias_trim_bp,
                         filter_mbias_trim_bp_CG,
                         filter_duplicate_reads,
                         filter_remove_overlap_bp) = PF(
                            read, ref, strand_t, out, read_info, original_length, auto,
                            remove_overlap, duplicate, filter_mbias_trim_bp,
                            filter_mbias_trim_bp_CG, filter_duplicate_reads,
                            filter_remove_overlap_bp)
                    else:
                        # NOTE(review): same caveat as the single-end branch above.
                        if not duplicate and len(loc_dict) > 0:
                            out.write(read)  # not trimming, only output not_duplicate reads
                        else:
                            filter_duplicate_reads += 1  # record the duplicate read (2013-06-20)

    # produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_mbias_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            # No CpG-specific Mbias line for single-end input: SF does not
            # return a CpG count.
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
                filter_mbias_trim_bp, _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_mbias_trim_bp, _percent(filter_mbias_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_mbias_trim_bp_CG, _percent(filter_mbias_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp, _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures

    ``args`` must expose ``parse_args()``; the returned options object is
    read for ``sam_file`` (comma-separated paths), ``ref_file``,
    ``samtools``, ``name``, ``bsm``, ``remove_overlap``, ``filter_dup``,
    ``p_poisson``, ``gsize`` and ``not_mapping``.

    Steps:
      1. Check the options.
      2. Decide single-end vs paired-end from the first mapping file.
      3. If ``filter_dup`` is set, collect the read locations of every input
         file and obtain ``max_cov`` from ``DR.duplicate_report``.
      4. Filter every input file into
         ``<sam path without its last 4 characters>_<name>_filter.sam``.
      5. Call ``NR.nonuniform_generator`` and write
         ``<name>_BSeQC_nonuniform_filter_report.txt``.

    Returns None.
    """

    def _percent(part, whole):
        # Percentage of `part` in `whole`. Returns 0.0 when `whole` is 0
        # (empty input, or no uniquely mapped read) so that the report can
        # still be written instead of failing with ZeroDivisionError.
        if whole == 0:
            return 0.0
        return float(part) / whole * 100

    options = args.parse_args()

    # NOTE(review): only an unreadable SAM file exits below; the other failed
    # checks are just reported through error() -- confirm whether error()
    # itself is expected to abort.
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    elif not os.path.isfile(options.ref_file):
        error("Can't open the ref file: " + options.ref_file)

    # Make sure a given samtools directory ends with a slash.
    if len(options.samtools) != 0 and options.samtools[-1] != '/':
        options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping
    info("Get the all parameter!!")

    # check the input mapping files
    # A 'p' in the second tab-separated field of the first line read from the
    # first mapping file marks the input as paired-end.
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    max_cov = None  # only set (and only used) when duplicate filtering is on
    if filter_dup:
        # if filter_dup is TRUE, the duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    # get reference information
    ref = GR.get_ref(ref_file)

    trim_position = []
    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    # filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        record_mate = {}
        # `with` guarantees the output file is closed even if a read fails to parse.
        with open(out_sam, 'w') as out:
            # check the input mapping files
            sam_format, read_inf = check.check_mapping_file_header(sam, s_path)
            for read in read_inf:
                # SAM header lines are copied through unchanged.
                if read.startswith('@'):
                    out.write(read)
                    continue
                all_reads += 1  # record the read number (2013-06-20)

                # Get the read information for trimming
                # If the read isn't unique mapping, we will get a empty list ([]).
                # In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                # In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
                read_info = RI(read, bsm)
                read_info = read_info.extract_information()

                if len(read_info) == 0:
                    not_mapping_reads += 1
                    if not_mapping:
                        # keep the not_unique mapping reads (or not paired mapping)
                        out.write(read)
                    else:
                        filter_not_mapping_reads += 1  # record the not mapping read number (2013-06-20)
                    continue

                if len(loc_dict) > 0:
                    # the --filter_dup has been set True, have to remove duplicate reads
                    duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
                else:
                    duplicate = False

                # record the mapping read basepair (2013-06-20)
                if single_on:
                    all_mapping_bp += len(read_info[5])
                else:
                    all_mapping_bp += len(read_info[7])

                (record_mate,
                 trim_position,
                 filter_nonuniform_trim_bp_CG,
                 filter_duplicate_reads,
                 filter_remove_overlap_bp) = NF.nonuniform_filter(
                    read, out, read_info, ref, remove_overlap, duplicate, single_on,
                    record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                    filter_duplicate_reads, filter_remove_overlap_bp)
        del record_mate

    NR.nonuniform_generator(trim_position, name)
    # Each entry of trim_position is weighted by its index.
    # presumably trim_position[i] counts reads trimmed by i bp -- TODO confirm
    filter_nonuniform_trim_bp += sum(i * count for i, count in enumerate(trim_position))

    # produce the filter report
    info('Produce the report file...')
    unique_reads = all_reads - not_mapping_reads
    with open(name + "_BSeQC_nonuniform_filter_report.txt", 'w') as report_out:
        report_out.write('Total reads: %d\n' % all_reads)
        if single_on:
            report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique mapping reads:\n')
            report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp, _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG, _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
        else:
            report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
                not_mapping_reads, _percent(not_mapping_reads, all_reads), "%"))
            report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
                unique_reads, _percent(unique_reads, all_reads), "%"))
            report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
                filter_not_mapping_reads, _percent(filter_not_mapping_reads, all_reads), "%"))
            report_out.write('In unique paired mapping reads:\n')
            report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
            # The denominator used to be (all_reads - not_mapping_reads * 100):
            # the "* 100" sat inside the parenthesis, so the value was not a
            # percentage of the uniquely mapped reads.
            report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
                filter_duplicate_reads, _percent(filter_duplicate_reads, unique_reads), "%"))
            report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp, _percent(filter_nonuniform_trim_bp, all_mapping_bp), "%"))
            report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
                filter_nonuniform_trim_bp_CG, _percent(filter_nonuniform_trim_bp_CG, all_mapping_bp), "%"))
            report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
                filter_remove_overlap_bp, _percent(filter_remove_overlap_bp, all_mapping_bp), "%"))
    info('Get the report file!')