def run(options):
    """Combine the first two replicate bedGraph tracks and write the result.

    Every file listed in ``options.ifile`` is read into a bedGraph track,
    but only track #1 and track #2 are combined
    (``reps[0].overlie(reps[1], func=options.method)``).  The combined
    track is written to ``options.outdir``/``options.ofile``.

    options -- raw option object; it is passed through ``opt_validate``
               before any attribute is read.
    """
    options = opt_validate(options)
    info("Read and build bedGraph for each replicate...")
    reps = []
    for i, ifile in enumerate(options.ifile, 1):
        info("Read file #%d" % i)
        reps.append(cBedGraphIO.bedGraphIO(ifile).build_bdgtrack())

    # NOTE: replicates beyond the second are loaded but never combined.
    info("combining #1 and #2 with method '%s'" % options.method)
    cmbtrack = reps[0].overlie(reps[1], func=options.method)

    ofile = os.path.join(options.outdir, options.ofile)
    info("Write bedGraph of combined scores...")
    method_label = options.method.upper()
    # The context manager closes the output file even if writing fails.
    with open(ofile, "wb") as ofhd:
        cmbtrack.write_bedGraph(
            ofhd,
            name="%s_combined_scores" % method_label,
            description="Scores calculated by %s" % method_label,
        )
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
def run( options ):
    """Modify the scores of one bedGraph file and write the result.

    The track built from ``options.ifile`` is changed in place according to
    ``options.method`` (compared case-insensitively):

      p2q      -- btrack.p2q()
      analen   -- btrack.analen()
      multiply -- x * extraparam
      add      -- x + extraparam
      max      -- x if x > extraparam else extraparam
      min      -- x if x < extraparam else extraparam

    ``extraparam`` is ``float(options.extraparam[0])`` and is only read when
    the method is neither p2q nor analen.  Any other method name leaves the
    track untouched.  Output goes to ``options.outdir``/``options.ofile``.

    options -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate( options )
    info("Read and build bedGraph...")
    bio = BedGraphIO.bedGraphIO(options.ifile)
    btrack = bio.build_bdgtrack(baseline_value=0)

    info("Modify bedGraph...")
    method = options.method.lower()
    if method == "p2q":
        btrack.p2q()
    elif method == "analen":
        btrack.analen()
    else:
        extraparam = float(options.extraparam[0])
        if method == "multiply":
            btrack.apply_func( lambda x: x * extraparam )
        elif method == "add":
            btrack.apply_func( lambda x: x + extraparam )
        elif method == "max":
            btrack.apply_func( lambda x: x if x > extraparam else extraparam )
        elif method == "min":
            btrack.apply_func( lambda x: x if x < extraparam else extraparam )

    ofile = os.path.join( options.outdir, options.ofile )
    info("Write bedGraph of modified scores...")
    method_label = options.method.upper()
    # Close the output deterministically, also when write_bedGraph raises.
    with open(ofile, "wb") as ofhd:
        btrack.write_bedGraph(ofhd,
                              name="%s_modified_scores" % method_label,
                              description="Scores calculated by %s" % method_label)
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
def run( o_options ):
    """Refine the peaks of a BED file and write them to <oprefix>_refinepeak.bed.

    Each line of ``options.bedfile`` is split on whitespace; columns 1-4 are
    used as chromosome, start, end and name of a peak.  The tag track loaded
    by ``load_tag_files_options`` then computes one record per peak through
    ``compute_region_tags_from_peaks`` (with ``find_summit``,
    ``options.windowsize`` and ``options.cutoff``).  Records are written
    tab-separated, joined by newlines, without a trailing newline.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    # The output is created before any input is read, as before; the context
    # managers make sure both files are closed even when parsing fails.
    with open(options.oprefix+"_refinepeak.bed", "w") as outputfile:
        peaks = PeakIO()
        # open() instead of the Python-2-only file() builtin.
        with open(options.bedfile) as peakio:
            for l in peakio:
                fs = l.rstrip().split()
                peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )
        peaks.sort()

        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options (options)

        retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
        outputfile.write( "\n".join( [ "%s\t%d\t%d\t%s\t%.2f" % x for x in retval ] ) )
    info("Done!")
    info("Check output file: %s" % options.oprefix+"_refinepeak.bed")
def run(options):
    """Combine all replicate bedGraph tracks and write the combined scores.

    Every file in ``options.ifile`` is read into a bedGraph track.  The
    first track is overlaid with the list of all remaining tracks using
    ``options.method``; the result is written to
    ``options.outdir``/``options.ofile``.

    options -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate(options)
    info("Read and build bedGraph for each replicate...")
    reps = []
    for i, ifile in enumerate(options.ifile, 1):
        info("Read file #%d" % i)
        reps.append(BedGraphIO.bedGraphIO(ifile).build_bdgtrack())

    info("combining tracks 1-%i with method '%s'" % (len(reps), options.method))
    # reps[1:] is every track after the first one.
    cmbtrack = reps[0].overlie(reps[1:], func=options.method)

    ofile = os.path.join(options.outdir, options.ofile)
    info("Write bedGraph of combined scores...")
    method_label = options.method.upper()
    # The context manager closes the output file even if writing fails.
    with open(ofile, "w") as ofhd:
        cmbtrack.write_bedGraph(
            ofhd,
            name="%s_combined_scores" % method_label,
            description="Scores calculated by %s" % method_label)
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
def run( options ):
    """Modify the scores of one bedGraph file and write the result.

    The track built from ``options.ifile`` is changed in place according to
    ``options.method`` (compared case-insensitively):

      p2q      -- btrack.p2q()
      multiply -- x * extraparam
      add      -- x + extraparam
      max      -- x if x > extraparam else extraparam
      min      -- x if x < extraparam else extraparam

    ``extraparam`` is ``float(options.extraparam[0])`` and is only read when
    the method is not p2q.  Any other method name leaves the track
    untouched.  Output goes to ``options.outdir``/``options.ofile``.

    options -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate( options )
    info("Read and build bedGraph...")
    bio = BedGraphIO.bedGraphIO(options.ifile)
    btrack = bio.build_bdgtrack(baseline_value=0)

    info("Modify bedGraph...")
    method = options.method.lower()
    if method == "p2q":
        btrack.p2q()
    else:
        extraparam = float(options.extraparam[0])
        if method == "multiply":
            btrack.apply_func( lambda x: x * extraparam )
        elif method == "add":
            btrack.apply_func( lambda x: x + extraparam )
        elif method == "max":
            btrack.apply_func( lambda x: x if x > extraparam else extraparam )
        elif method == "min":
            btrack.apply_func( lambda x: x if x < extraparam else extraparam )

    ofile = os.path.join( options.outdir, options.ofile )
    info("Write bedGraph of modified scores...")
    method_label = options.method.upper()
    # Close the output deterministically, also when write_bedGraph raises.
    with open(ofile, "w") as ofhd:
        btrack.write_bedGraph(ofhd,
                              name="%s_modified_scores" % method_label,
                              description="Scores calculated by %s" % method_label)
    info("Finished '%s'! Please check '%s'!" % (options.method, ofile))
def run(o_options):
    """The Main function/pipeline for duplication filter.

    Loads the tag track, optionally limits the number of tags allowed at the
    same location and strand, and writes the track in BED format.

    * ``options.keepduplicates == "all"``  -- no filtering.
    * ``options.keepduplicates == "auto"`` -- the limit comes from
      ``cal_max_dup_tags(options.gsize, total_tags)``.
    * anything else                        -- ``int(options.keepduplicates)``.

    With ``options.dryrun`` set only the number of surviving tags is
    reported (``filter_dup_dryrun``) and no BED output is written.  Output
    goes to ``options.outdir``/``options.outputfile``, or to standard output
    when ``options.outputfile`` is ``"stdout"``.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate(o_options)
    info = options.info

    if options.outputfile != "stdout":
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options(options)

        info("tag size = %d" % options.tsize)
        fwtrack.fw = options.tsize

        t0 = fwtrack.total
        info(" total tags in alignment file: %d" % (t0))
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("calculate max duplicate tags in single position based on binomal distribution...")
                max_dup_tags = cal_max_dup_tags(options.gsize, t0)
                info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            else:
                info("user defined the maximum tags...")
                max_dup_tags = int(options.keepduplicates)
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

            if not options.dryrun:
                fwtrack = fwtrack.filter_dup(max_dup_tags)
                t1 = fwtrack.total
            else:
                t1 = fwtrack.filter_dup_dryrun(max_dup_tags)

            info(" tags after filtering in alignment file: %d" % (t1))
            info(" Redundant rate of alignment file: %.2f" % (float(t0 - t1) / t0))

        if not options.dryrun:
            info("Write to BED file")
            fwtrack.print_to_bed(fhd=outfhd)
            info("finished! Check %s." % options.outputfile)
        else:
            info("Dry-run is finished!")
    finally:
        # Close the file we opened ourselves; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( options ):
    """Compare a treatment and a control bedGraph and write one score track per method.

    Both bedGraph files are turned into tracks and merged into a score track
    (``make_scoreTrackII_for_macs``).  For every distinct entry of
    ``options.method`` the score method is switched and the scores are
    written to ``options.ofile[i]`` (when ``options.ofile`` is given) or to
    ``<oprefix>_<method>.bdg``, both inside ``options.outdir``.

    options -- raw option object; passed through ``opt_validate`` first.

    Raises Exception when a method name is not one of the known ones.
    """
    options = opt_validate( options )
    scaling_factor = options.sfactor
    pseudo_depth = 1.0/scaling_factor   # not an actual depth, but its reciprocal, a trick to override SPMR while necessary.

    info("Read and build treatment bedGraph...")
    tbio = BedGraphIO.bedGraphIO(options.tfile)
    tbtrack = tbio.build_bdgtrack()

    info("Read and build control bedGraph...")
    cbio = BedGraphIO.bedGraphIO(options.cfile)
    cbtrack = cbio.build_bdgtrack()

    info("Build scoreTrackII...")
    sbtrack = tbtrack.make_scoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
    if abs(scaling_factor-1) > 1e-6:
        # Only for the case while your input is SPMR from MACS2 callpeak; Let's override SPMR.
        info("Values in your input bedGraph files will be multiplied by %f ..." % scaling_factor)
        sbtrack.change_normalization_method( ord('M') ) # a hack to override SPMR
    sbtrack.set_pseudocount( options.pseudocount )

    # Method name -> single-character code handed to change_score_method().
    score_method_codes = {
        'ppois':    'p',
        'qpois':    'q',
        'subtract': 'd',
        'logFE':    'f',
        'FE':       'F',
        'logLR':    'l',   # log likelihood
        'slogLR':   's',   # log likelihood
        'max':      'M',
    }

    already_processed_method_list = []
    for (i, method) in enumerate(options.method):
        if method in already_processed_method_list:
            continue
        already_processed_method_list.append( method )

        info("Calculate scores comparing treatment and control by '%s'..." % method)
        if options.ofile:
            ofile = os.path.join( options.outdir, options.ofile[ i ] )
        else:
            ofile = os.path.join( options.outdir, options.oprefix + "_" + method + ".bdg" )

        # build score track
        if method not in score_method_codes:
            raise Exception("Can't reach here!")
        sbtrack.change_score_method( ord( score_method_codes[ method ] ) )

        info("Write bedGraph of scores...")
        # One output per method: close each before moving on to the next.
        with open(ofile,"wb") as ofhd:
            sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
        info("Finished '%s'! Please check '%s'!" % (method, ofile))
def run( options ):
    """Compare a treatment and a control bedGraph and write one score track per method.

    Both bedGraph files are turned into tracks and merged into a score track
    (``make_scoreTrackII_for_macs``).  For every distinct entry of
    ``options.method`` the score method is switched and the scores are
    written to ``options.ofile[i]`` (when ``options.ofile`` is given) or to
    ``<oprefix>_<method>.bdg``, both inside ``options.outdir``.

    options -- raw option object; passed through ``opt_validate`` first.

    Raises Exception when a method name is not one of the known ones.
    """
    options = opt_validate( options )
    scaling_factor = options.sfactor
    pseudo_depth = 1.0/scaling_factor   # not an actual depth, but its reciprocal, a trick to override SPMR while necessary.

    info("Read and build treatment bedGraph...")
    tbio = BedGraphIO.bedGraphIO(options.tfile)
    tbtrack = tbio.build_bdgtrack()

    info("Read and build control bedGraph...")
    cbio = BedGraphIO.bedGraphIO(options.cfile)
    cbtrack = cbio.build_bdgtrack()

    info("Build scoreTrackII...")
    sbtrack = tbtrack.make_scoreTrackII_for_macs( cbtrack, depth1 = pseudo_depth, depth2 = pseudo_depth )
    if abs(scaling_factor-1) > 1e-6:
        # Only for the case while your input is SPMR from MACS2 callpeak; Let's override SPMR.
        info("Values in your input bedGraph files will be multiplied by %f ..." % scaling_factor)
        sbtrack.change_normalization_method( ord('M') ) # a hack to override SPMR
    sbtrack.set_pseudocount( options.pseudocount )

    # Method name -> single-character code handed to change_score_method().
    score_method_codes = {
        'ppois':    'p',
        'qpois':    'q',
        'subtract': 'd',
        'logFE':    'f',
        'FE':       'F',
        'logLR':    'l',   # log likelihood
        'slogLR':   's',   # log likelihood
        'max':      'M',
    }

    already_processed_method_list = []
    for (i, method) in enumerate(options.method):
        if method in already_processed_method_list:
            continue
        already_processed_method_list.append( method )

        info("Calculate scores comparing treatment and control by '%s'..." % method)
        if options.ofile:
            ofile = os.path.join( options.outdir, options.ofile[ i ] )
        else:
            ofile = os.path.join( options.outdir, options.oprefix + "_" + method + ".bdg" )

        # build score track
        if method not in score_method_codes:
            raise Exception("Can't reach here!")
        sbtrack.change_score_method( ord( score_method_codes[ method ] ) )

        info("Write bedGraph of scores...")
        # One output per method: close each before moving on to the next.
        with open(ofile,"w") as ofhd:
            sbtrack.write_bedGraph(ofhd,name="%s_Scores" % (method.upper()),description="Scores calculated by %s" % (method.upper()), column = 3)
        info("Finished '%s'! Please check '%s'!" % (method, ofile))
def run( o_options ):
    """The Main function/pipeline for duplication filter.

    Loads the alignment either as fragments (``options.format`` is BAMPE or
    BEDPE) or as tags, optionally limits the number of duplicates, and
    writes the track in BED format.

    * ``options.keepduplicates == "all"``  -- no filtering.
    * ``options.keepduplicates == "auto"`` -- the limit comes from
      ``cal_max_dup_tags(options.gsize, total)``.
    * anything else                        -- ``int(options.keepduplicates)``.

    ``filter_dup`` is called whether or not ``options.dryrun`` is set; the
    dry-run flag only suppresses the BED output.  Output goes to
    ``options.outdir``/``options.outputfile``, or to standard output when
    ``options.outputfile`` is ``"stdout"``.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    options.PE_MODE = options.format in ('BAMPE','BEDPE')

    if options.outputfile != "stdout":
        outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        if options.PE_MODE:
            info("# read input file in Paired-end mode.")
            inputtrack = load_frag_files_options ( options ) # return PETrackI object
            t0 = inputtrack.total # total fragments
            info("# total fragments/pairs in alignment file: %d" % (t0) )
        else:
            info("read tag files...")
            inputtrack = load_tag_files_options (options)

            info("tag size = %d" % options.tsize)
            inputtrack.fw = options.tsize

            t0 = inputtrack.total
            info(" total tags in alignment file: %d" % (t0))

        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("calculate max duplicate tags in single position based on binomal distribution...")
                max_dup_tags = cal_max_dup_tags(options.gsize,t0)
                info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            else:
                info("user defined the maximum tags...")
                max_dup_tags = int(options.keepduplicates)
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

            t1 = inputtrack.filter_dup(max_dup_tags)
            info(" tags after filtering in alignment file: %d" % (t1))
            info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))

        if not options.dryrun:
            info( "Write to BED file" )
            inputtrack.print_to_bed( fhd=outfhd )
            info( "finished! Check %s." % options.outputfile )
        else:
            info( "Dry-run is finished!" )
    finally:
        # Close the file we opened ourselves; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run(options0):
    """Randomly sample the alignment and write the sample in BED format.

    The alignment is loaded as fragments when ``options.format`` is BAMPE or
    BEDPE, otherwise as tags.  When ``options.number`` is set it is turned
    into a percentage of the total; asking for more than the total logs an
    error and exits with status 1.  ``sample_percent`` is then called with
    ``options.percentage / 100`` and ``options.seed``.

    Output goes to ``options.outdir``/``options.outputfile``, or to standard
    output when ``options.outputfile`` is empty.

    options0 -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate(options0)
    # end of parsing commandline options
    info = options.info
    error = options.error

    options.PE_MODE = options.format in ('BAMPE', 'BEDPE')

    #0 check output file
    if options.outputfile:
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        if options.PE_MODE:
            info("# read input file in Paired-end mode.")
            treat = load_frag_files_options(options)  # return PETrackI object
            t0 = treat.total  # total fragments
            info("# total fragments/pairs in alignment file: %d" % (t0))
        else:
            info("read tag files...")
            treat = load_tag_files_options(options)

            info("tag size = %d" % options.tsize)
            treat.fw = options.tsize

            t0 = treat.total
            info(" total tags in alignment file: %d" % (t0))

        if options.number:
            if options.number > t0:
                error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
                error(" %.2e > %.2e" % (options.number, t0))
                sys.exit(1)
            info(" Number of tags you want to keep: %.2e" % (options.number))
            options.percentage = float(options.number) / t0 * 100
        info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

        if options.seed >= 0:
            info(" Random seed has been set as: %d" % options.seed)

        treat.sample_percent(options.percentage / 100.0, options.seed)

        info(" tags after random sampling in alignment file: %d" % (treat.total))

        info("Write to BED file")
        treat.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    finally:
        # Runs on the sys.exit(1) path as well; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( o_options ):
    """Predict the fragment length from the alignment file.

    Paired-end input (``options.format`` is BAMPE or BEDPE): the average
    template length of the fragment track is reported and the function
    returns.  Otherwise a ``PeakModel`` is built from the tag track, its
    predicted and alternative fragment lengths are reported, an R script is
    generated through ``model2r_script`` and ``options.d`` is set.  A
    ``NotEnoughPairsException`` is turned into a warning.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate( o_options )
    info, warn, debug, error = options.info, options.warn, options.debug, options.error

    options.PE_MODE = options.format in ('BAMPE','BEDPE')

    if options.PE_MODE:
        # Paired-end: nothing to model, the template length is the answer.
        info("# read input file in Paired-end mode.")
        fragments = load_frag_files_options ( options ) # return PETrackI object
        total = fragments.total
        info("# total fragments/pairs in alignment file: %d" % (total) )
        info("# Build Peak Model...")
        d = fragments.average_template_length
        info("# Average insertion length of all pairs is %d bps" % d)
        return

    info("# read alignment files...")
    tags = load_tag_files_options (options)
    total = tags.total
    info("# tag size = %d" % options.tsize)
    tags.fw = options.tsize
    info("# total tags in alignment file: %d", total)

    info("# Build Peak Model...")
    try:
        model = PeakModel(treatment = tags,
                          max_pairnum = MAX_PAIRNUM,
                          opt = options )
        info("# finished!")
        debug("# Summary Model:")
        debug("# min_tags: %d" % (model.min_tags))
        debug("# d: %d" % (model.d))
        info("# predicted fragment length is %d bps" % model.d)
        alternatives = ','.join( str(alt) for alt in model.alternative_d )
        info("# alternative fragment length(s) may be %s bps" % alternatives)
        info("# Generate R script for model : %s" % (options.modelR))
        model2r_script(model, options.modelR, options.rfile)
        options.d = model.d
    except NotEnoughPairsException:
        warn("# Can't find enough pairs of symmetric peaks to build model!")
def run( options0 ):
    """Randomly sample the alignment and write the sample in BED format.

    The alignment is loaded as fragments when ``options.format`` is BAMPE or
    BEDPE, otherwise as tags.  When ``options.number`` is set it is turned
    into a percentage of the total; asking for more than the total logs an
    error and exits with status 1.  ``sample_percent`` is then called with
    ``options.percentage / 100`` and ``options.seed``.

    Output goes to ``options.outdir``/``options.outputfile``, or to standard
    output when ``options.outputfile`` is empty.

    options0 -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate( options0 )
    # end of parsing commandline options
    info = options.info
    error = options.error

    options.PE_MODE = options.format in ('BAMPE','BEDPE')

    #0 check output file
    if options.outputfile:
        outfhd = open( os.path.join( options.outdir, options.outputfile ), "w" )
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        if options.PE_MODE:
            info("# read input file in Paired-end mode.")
            treat = load_frag_files_options ( options ) # return PETrackI object
            t0 = treat.total # total fragments
            info("# total fragments/pairs in alignment file: %d" % (t0) )
        else:
            info("read tag files...")
            treat = load_tag_files_options (options)

            info("tag size = %d" % options.tsize)
            treat.fw = options.tsize

            t0 = treat.total
            info(" total tags in alignment file: %d" % (t0))

        if options.number:
            if options.number > t0:
                error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
                error(" %.2e > %.2e" % (options.number, t0))
                sys.exit(1)
            info(" Number of tags you want to keep: %.2e" % (options.number))
            options.percentage = float(options.number)/t0*100
        info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

        if options.seed >= 0:
            info(" Random seed has been set as: %d" % options.seed )

        treat.sample_percent(options.percentage/100.0, options.seed )

        info(" tags after random sampling in alignment file: %d" % (treat.total))

        info("Write to BED file")
        treat.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    finally:
        # Runs on the sys.exit(1) path as well; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run(o_options):
    """Pile up the aligned reads and write the result to the output file.

    An already existing output file is removed first.  Reads are extended by
    ``options.extsize`` towards the downstream direction, or -- with
    ``options.bothdirection`` -- in both directions (the extension handed to
    ``pileup_and_write`` is then ``2 * options.extsize``, non-directional).

    o_options -- raw option object; passed through ``opt_validate`` first.

    Raises AssertionError when ``options.format`` is 'BAMPE'.
    """
    options = opt_validate(o_options)
    info, warn, debug, error = options.info, options.warn, options.debug, options.error

    # Paired-end BAM is not supported by this variant of the command.
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option currently doesn't work with pileup command. You can pretend your data to be single-end with -f BAM. Please try again!"

    # Start from a clean slate: drop a previous result at the same path.
    outfile = os.path.join(options.outdir, options.outputfile)
    if os.path.isfile(outfile):
        info("# Existing file %s will be replaced!" % outfile)
        os.unlink(outfile)

    info("# read alignment files...")
    tsize, tags = load_tag_files_options(options)

    info("# tag size = %d", tsize)

    total_tags = tags.total
    info("# total tags in alignment file: %d", total_tags)

    if not options.bothdirection:
        info("# Pileup alignment file, extend each read towards downstream direction with %d bps" % options.extsize)
        pileup_and_write(tags, outfile, options.extsize, 1,
                         directional=True, halfextension=False)
    else:
        info("# Pileup alignment file, extend each read towards up/downstream direction with %d bps" % options.extsize)
        pileup_and_write(tags, outfile, options.extsize * 2, 1,
                         directional=False, halfextension=False)

    info("# Done! Check %s" % options.outputfile)
def run( o_options ):
    """Pile up the aligned reads or fragments and write the result to the output file.

    An already existing output file is removed first.  Paired-end input
    (``options.format`` is BAMPE or BEDPE) is piled up with
    ``pileup_and_write_pe``.  Otherwise reads are extended by
    ``options.extsize`` towards the downstream direction, or -- with
    ``options.bothdirection`` -- in both directions (the extension handed to
    ``pileup_and_write`` is then ``2 * options.extsize``, non-directional).

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate( o_options )
    info, warn, debug, error = options.info, options.warn, options.debug, options.error

    options.PE_MODE = options.format in ('BAMPE','BEDPE')

    # Start from a clean slate: drop a previous result at the same path.
    outfile = os.path.join( options.outdir, options.outputfile )
    if os.path.isfile( outfile ):
        info("# Existing file %s will be replaced!" % outfile )
        os.unlink( outfile )

    info("# read alignment files...")
    if not options.PE_MODE:
        tsize, tags = load_tag_files_options (options)

        info("# tag size = %d", tsize)

        total_tags = tags.total
        info("# total tags in alignment file: %d", total_tags)

        if not options.bothdirection:
            info("# Pileup alignment file, extend each read towards downstream direction with %d bps" % options.extsize)
            pileup_and_write(tags, outfile, options.extsize, 1,
                             directional=True, halfextension=False)
        else:
            info("# Pileup alignment file, extend each read towards up/downstream direction with %d bps" % options.extsize)
            pileup_and_write(tags, outfile, options.extsize * 2, 1,
                             directional=False, halfextension=False)
    else:
        info("# read input file in Paired-end mode.")
        fragments = load_frag_files_options ( options ) # return PETrackI object
        total_fragments = fragments.total # total fragments
        info("# total fragments/pairs in alignment file: %d" % (total_fragments) )

        info("# Pileup paired-end alignment file.")
        pileup_and_write_pe(fragments, outfile )

    info("# Done! Check %s" % options.outputfile)
def run( o_options ):
    """Refine the peaks of a BED file and write them to <oprefix>_refinepeak.bed.

    Each line of ``options.bedfile`` is split on whitespace; columns 1-4 are
    used as chromosome, start, end and name of a peak.  The tag track loaded
    by ``load_tag_files_options`` then computes one record per peak through
    ``compute_region_tags_from_peaks`` (with ``find_summit``,
    ``options.windowsize`` and ``options.cutoff``).  Records are written
    tab-separated, joined by newlines, without a trailing newline.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    # The output is created before any input is read, as before; the context
    # managers make sure both files are closed even when parsing fails.
    with open(options.oprefix+"_refinepeak.bed", "w") as outputfile:
        peaks = PeakIO()
        # open() instead of the Python-2-only file() builtin.
        with open(options.bedfile) as peakio:
            for l in peakio:
                fs = l.rstrip().split()
                peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )
        peaks.sort()

        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options (options)

        retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
        outputfile.write( "\n".join( [ "%s\t%d\t%d\t%s\t%.2f" % x for x in retval ] ) )
    info("Done!")
    info("Check output file: %s" % options.oprefix+"_refinepeak.bed")
def run( o_options ):
    """The Main function/pipeline for duplication filter.

    Loads the tag track, optionally limits the number of tags allowed at the
    same location and strand, and writes the track in BED format.

    * ``options.keepduplicates == "all"``  -- no filtering.
    * ``options.keepduplicates == "auto"`` -- the limit comes from
      ``cal_max_dup_tags(options.gsize, total_tags)``.
    * anything else                        -- ``int(options.keepduplicates)``.

    Output goes to ``options.outputfile``, or to standard output when that
    is ``"stdout"``.

    o_options -- raw option object; passed through ``opt_validate`` first.

    Raises AssertionError when the output file already exists.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    if options.outputfile != "stdout":
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options (options)

        info("tag size = %d" % options.tsize)
        fwtrack.fw = options.tsize

        t0 = fwtrack.total
        info(" total tags in alignment file: %d" % (t0))
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("calculate max duplicate tags in single position based on binomal distribution...")
                max_dup_tags = cal_max_dup_tags(options.gsize,t0)
                info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
            else:
                info("user defined the maximum tags...")
                max_dup_tags = int(options.keepduplicates)
            info("filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (max_dup_tags))

            fwtrack = fwtrack.filter_dup(max_dup_tags)
            t1 = fwtrack.total
            info(" tags after filtering in alignment file: %d" % (t1))
            info(" Redundant rate of alignment file: %.2f" % (float(t0-t1)/t0))

        info("Write to BED file")
        fwtrack.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    finally:
        # Close the file we opened ourselves; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( o_options ):
    """Predict the fragment length from the alignment file.

    A ``PeakModel`` is built from the tag track; its predicted and
    alternative fragment lengths are reported, an R script is generated
    through ``model2r_script`` and ``options.d`` is set.  A
    ``NotEnoughPairsException`` is turned into a warning.

    o_options -- raw option object; passed through ``opt_validate`` first.

    Raises AssertionError when ``options.format`` is 'BAMPE'.
    """
    options = opt_validate( o_options )
    info, warn, debug, error = options.info, options.warn, options.debug, options.error

    # Paired-end BAM is not supported by this variant of the command.
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option doesn't work with predictd command. You can pretend your data to be single-end with -f BAM. Please try again!"

    info("# read alignment files...")
    tags = load_tag_files_options (options)

    info("# tag size = %d", options.tsize)

    total_tags = tags.total
    info("# total tags in alignment file: %d", total_tags)

    info("# Build Peak Model...")
    try:
        model = PeakModel(treatment = tags,
                          max_pairnum = MAX_PAIRNUM,
                          opt = options )
        info("# finished!")
        debug("# Summary Model:")
        debug("# min_tags: %d" % (model.min_tags))
        debug("# d: %d" % (model.d))
        info("# predicted fragment length is %d bps" % model.d)
        alternatives = ','.join( str(alt) for alt in model.alternative_d )
        info("# alternative fragment length(s) may be %s bps" % alternatives)
        info("# Generate R script for model : %s" % (options.modelR))
        model2r_script(model, options.modelR, options.rfile)
        options.d = model.d
    except NotEnoughPairsException:
        warn("# Can't find enough pairs of symmetric peaks to build model!")
def run(options0):
    """Randomly sample the tag track and write the sample in BED format.

    When ``options.number`` is set it is turned into a percentage of the
    total; asking for more than the total logs an error and exits with
    status 1.  ``sample_percent`` is then called with
    ``options.percentage / 100`` and ``options.seed``.

    Output goes to ``options.outdir``/``options.outputfile``, or to standard
    output when ``options.outputfile`` is empty.

    options0 -- raw option object; passed through ``opt_validate`` first.
    """
    options = opt_validate(options0)
    # end of parsing commandline options
    info = options.info
    error = options.error

    #0 check output file
    if options.outputfile:
        outfhd = open(os.path.join(options.outdir, options.outputfile), "w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options(options)

        info("tag size = %d" % options.tsize)
        fwtrack.fw = options.tsize

        t0 = fwtrack.total
        info(" total tags in alignment file: %d" % (t0))
        if options.number:
            if options.number > t0:
                error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
                error(" %.2e > %.2e" % (options.number, t0))
                sys.exit(1)
            info(" Number of tags you want to keep: %.2e" % (options.number))
            options.percentage = float(options.number) / t0 * 100
        info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

        if options.seed >= 0:
            info(" Random seed has been set as: %d" % options.seed)

        fwtrack.sample_percent(options.percentage / 100.0, options.seed)

        info(" tags after random sampling in alignment file: %d" % (fwtrack.total))

        info("Write to BED file")
        fwtrack.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    finally:
        # Runs on the sys.exit(1) path as well; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( o_options ):
    """Pile up the aligned reads and write the pileup as bedGraph.

    Reads are extended by ``options.extsize`` towards the downstream
    direction, or -- with ``options.bothdirection`` -- in both directions
    (the extension handed to ``unified_pileup_bdg`` is then
    ``2 * options.extsize``, non-directional).  The track is written without
    a track line to ``options.outputfile``, or to standard output when that
    is ``"stdout"``.

    o_options -- raw option object; passed through ``opt_validate`` first.

    Raises AssertionError when ``options.format`` is 'BAMPE' or when the
    output file already exists.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    #0 output arguments
    assert options.format != 'BAMPE', "Pair-end data with BAMPE option currently doesn't work with pileup command. You can pretend your data to be single-end with -f BAM. Please try again!"

    #0 prepare output file
    if options.outputfile != "stdout":
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        info("# read alignment files...")
        (tsize, treat) = load_tag_files_options (options)

        info("# tag size = %d", tsize)

        t0 = treat.total
        info("# total tags in alignment file: %d", t0)

        if options.bothdirection:
            info("# Pileup alignment file, extend each read towards up/downstream direction with %d bps" % options.extsize)
            treat_btrack = unified_pileup_bdg(treat, options.extsize * 2, 1, directional=False, halfextension=False)
            info("# save bedGraph to %s" % options.outputfile)
            treat_btrack.write_bedGraph( outfhd, "Pileup", "Pileup track with extsize %d on both directions" % options.extsize, trackline=False )
        else:
            info("# Pileup alignment file, extend each read towards downstream direction with %d bps" % options.extsize)
            treat_btrack = unified_pileup_bdg(treat, options.extsize, 1, directional=True, halfextension=False)
            info("# save bedGraph to %s" % options.outputfile)
            treat_btrack.write_bedGraph( outfhd, "Pileup", "Pileup track with extsize %d on 5' directions" % options.extsize, trackline=False )
    finally:
        # Close the file we opened ourselves; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()

    info("# Done! Check %s" % options.outputfile)
def run( options0 ):
    """Randomly sample the tag track and write the sample in BED format.

    When ``options.number`` is set it is turned into a percentage of the
    total; asking for more than the total logs an error and exits with
    status 1.  ``sample_percent`` is then called with
    ``options.percentage / 100`` and ``options.seed``.

    Output goes to ``options.outputfile``, or to standard output when
    ``options.outputfile`` is empty.

    options0 -- raw option object; passed through ``opt_validate`` first.

    Raises AssertionError when the output file already exists.
    """
    options = opt_validate( options0 )
    # end of parsing commandline options
    info = options.info
    error = options.error

    #0 check output file
    if options.outputfile:
        assert not os.path.exists(options.outputfile), "%s already exists, please check!" % options.outputfile
        outfhd = open(options.outputfile,"w")
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options (options)

        info("tag size = %d" % options.tsize)
        fwtrack.fw = options.tsize

        t0 = fwtrack.total
        info(" total tags in alignment file: %d" % (t0))
        if options.number:
            if options.number > t0:
                error(" Number you want is bigger than total number of tags in alignment file! Please specify a smaller number and try again!")
                error(" %.2e > %.2e" % (options.number, t0))
                sys.exit(1)
            info(" Number of tags you want to keep: %.2e" % (options.number))
            options.percentage = float(options.number)/t0*100
        info(" Percentage of tags you want to keep: %.2f%%" % (options.percentage))

        if options.seed >= 0:
            info(" Random seed has been set as: %d" % options.seed )

        fwtrack.sample_percent(options.percentage/100.0, options.seed )

        info(" tags after random sampling in alignment file: %d" % (fwtrack.total))

        info("Write to BED file")
        fwtrack.print_to_bed(fhd=outfhd)
        info("finished! Check %s." % options.outputfile)
    finally:
        # Runs on the sys.exit(1) path as well; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( o_options ):
    """The calculation based on the binomial distribution for how many tags to allow at one site.

    The total number of tags is taken from the alignment files when
    ``options.ifile`` is set, otherwise from ``options.numTags``.  The
    result of ``cal_max_dup_tags(options.gsize, total)`` is reported through
    ``info`` or, when ``options.quiet`` is set, printed on standard output.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    # NOTE(review): the output file is created here but nothing in this
    # function writes to it -- confirm whether the result was meant to go
    # there.  The creation is kept so existing side effects do not change.
    if options.outputfile != "stdout":
        outfhd = open( os.path.join( options.outdir, options.outputfile ) ,"w" )
    else:
        outfhd = sys.stdout

    try:
        #1 Read tag files
        # NOTE(review): t0 stays undefined when neither options.ifile nor
        # options.numTags is set; presumably opt_validate rules that out.
        if options.ifile:
            if not options.quiet:
                info("counting tags from input files...")
            fwtrack = load_tag_files_options (options)
            t0 = fwtrack.total
            if not options.quiet:
                info(" total tags in alignment file: %d" % (t0))
                info("tag size = %d" % options.tsize)
            fwtrack.fw = options.tsize
        elif options.numTags:
            t0 = options.numTags

        if not options.quiet:
            info("calculate max duplicate tags in single position based on binomal distribution...")
        max_dup_tags = cal_max_dup_tags(options.gsize,t0)
        if not options.quiet:
            info(" max_dup_tags based on binomal = %d" % (max_dup_tags))
        else:
            # Function-call form works on Python 2 and Python 3 alike.
            print(max_dup_tags)
    finally:
        # Close the file we opened ourselves; never close sys.stdout.
        if outfhd is not sys.stdout:
            outfhd.close()
def run( o_options ):
    """Refine the peaks of a BED file and write them to the output file.

    The output is ``options.outdir``/``options.ofile`` when ``options.ofile``
    is set (``options.oprefix`` is then overwritten with it), otherwise
    ``options.outdir``/``<oprefix>_refinepeak.bed``.

    ``options.bedfile`` is read in binary mode; each line is split on
    whitespace and columns 1-4 are used as chromosome, start, end and name
    of a peak.  The tag track loaded by ``load_tag_files_options`` then
    computes one record per peak through ``compute_region_tags_from_peaks``
    (with ``find_summit``, ``options.windowsize`` and ``options.cutoff``).
    Records are formatted as bytes, joined by newlines, decoded and written
    without a trailing newline.

    o_options -- raw option object; passed through ``opt_validate`` first.
    """
    # Parse options...
    options = opt_validate( o_options )
    info = options.info

    if options.ofile:
        outputfile = open( os.path.join( options.outdir, options.ofile ), 'w' )
        options.oprefix = options.ofile
    else:
        outputfile = open( os.path.join( options.outdir, "%s_refinepeak.bed" % options.oprefix), "w" )

    # The context managers close both files even when parsing the peaks or
    # computing the refined regions raises.
    with outputfile:
        peaks = PeakIO()
        with open(options.bedfile,"rb") as peakio:
            for l in peakio:
                fs = l.rstrip().split()
                peaks.add( fs[0], int(fs[1]), int(fs[2]), name=fs[3] )
        peaks.sort()

        #1 Read tag files
        info("read tag files...")
        fwtrack = load_tag_files_options (options)

        retval = fwtrack.compute_region_tags_from_peaks( peaks, find_summit, window_size = options.windowsize, cutoff = options.cutoff )
        outputfile.write( (b"\n".join( [b"%s\t%d\t%d\t%s\t%.2f" % x for x in retval] )).decode() )
    info("Done!")
def run( args ):
    """The Main function/pipeline for MACS.

    The steps follow the numbering used in the log messages:

    #1 load treatment (and optional control) tags/fragments and filter
       duplicates according to ``options.keepduplicates``;
    #2 build the peak model, or use ``options.extsize`` / ``options.tsize``
       as fragment length when modelling is skipped or fails;
    #3 balance treatment against control (random sampling or linear
       scaling), call peaks and drop peaks below ``options.fecutoff``;
    #4 write the xls report and either narrowPeak + summits BED, or
       broadPeak + gappedPeak files.

    Args:
        args: raw command-line options, validated by ``opt_validate``.

    Raises:
        SystemExit: with status 1 when the model cannot be built and
            ``options.onauto`` is not set.
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    #0 output arguments
    info("\n"+options.argtxt)
    options.PE_MODE = options.format in ('BAMPE',)
    if options.PE_MODE:
        tag = 'fragment' # call things fragments not tags
    else:
        tag = 'tag'

    tempfile.tempdir = options.tempdir

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE:
        (treat, control) = load_frag_files_options (options)
    else:
        (treat, control) = load_tag_files_options (options)
    if control is not None:
        check_names(treat, control, error)

    info("#1 %s size = %d", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.separate_dups(treatment_max_dup_tags) # changed 5-29
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            # NOTE(review): control_max_dup_tags is computed and logged
            # above, but the filtering and the report below use
            # treatment_max_dup_tags -- confirm this is intended.
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            control.separate_dups(treatment_max_dup_tags) # changed 5-29
            c1 = control.total
            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)
            info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    # Stays None when modelling is skipped or fails, so the xls header can
    # test for it explicitly instead of relying on a bare ``except``.
    peakmodel = None
    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.d = options.tsize
        else:
            options.d=options.extsize
        if options.shift > 0:
            info("#2 Sequencing ends will be shifted towards 3' by %d bp(s)" % (options.shift))
        elif options.shift < 0:
            info("#2 Sequencing ends will be shifted towards 5' by %d bp(s)" % (options.shift * -1))
        info("#2 Use %d as fragment length" % (options.d))
        options.scanwindow=2*options.d # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.extsize
                    options.scanwindow=2*options.d
                    warn("#2 MACS will use %d as EXTSIZE/fragment length d. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --extsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.extsize
            options.scanwindow=2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as fragment length" % (options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control and options.PE_MODE:
        c1 = c1 * 2

    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    # filter out low FE peaks
    peakdetect.peaks.filter_fc( fc_low = options.fecutoff )

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    with open( options.peakxls, "w" ) as ofhd_xls:
        ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
        ofhd_xls.write(options.argtxt+"\n")
        ofhd_xls.write(tagsinfo)
        if options.shift > 0:
            ofhd_xls.write("#2 Sequencing ends will be shifted towards 3' by %d bp(s)\n" % (options.shift))
        elif options.shift < 0:
            ofhd_xls.write("#2 Sequencing ends will be shifted towards 5' by %d bp(s)\n" % (options.shift * -1))
        ofhd_xls.write("# d = %d\n" % (options.d))
        if peakmodel is not None:
            # when --nomodel is used (or modelling failed), there is no
            # peakmodel object and this line is skipped.
            ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
        if options.nolambda:
            ofhd_xls.write("# local lambda is disabled!\n")
        # pass write method so we can print too, and include name
        peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name)

    #4.2 peaks in BED
    # NOTE(review): score_column stays unbound if neither flag is set;
    # presumably opt_validate guarantees one of them -- confirm.
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"

    if not options.broad:
        #4.2 peaks in narrowPeak
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        with open( options.peakNarrowPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        with open( options.summitbed, "w" ) as ofhd_summits:
            peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                                  description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                                  score_column=score_column, trackline=options.trackline )
    else:
        #4.2 broad peaks in bed12 or gappedPeak
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        with open( options.peakBroadPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        with open( options.peakGappedPeak, "w" ) as ofhd_bed:
            peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)

    info("Done!")
def run( args ):
    """The Main function/pipeline for MACS.

    The steps follow the numbering used in the log messages:

    #1 load treatment (and optional control) tags/fragments and filter
       duplicates according to ``options.keepduplicates``;
    #2 build the peak model, or use ``2 * options.shiftsize`` /
       ``options.tsize`` as fragment length when modelling is skipped or
       fails;
    #3 balance treatment against control (random sampling or linear
       scaling) and call peaks;
    #4 write the xls report and either narrowPeak + summits BED, or
       broadPeak + gappedPeak files.

    Args:
        args: raw command-line options, validated by ``opt_validate``.

    Raises:
        SystemExit: with status 1 when the model cannot be built and
            ``options.onauto`` is not set.
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    #0 output arguments
    info("\n"+options.argtxt)
    options.PE_MODE = options.format in ('BAMPE',)
    if options.PE_MODE:
        tag = 'fragment' # call things fragments not tags
    else:
        tag = 'tag'

    #1 Read tag files
    info("#1 read %s files...", tag)
    if options.PE_MODE:
        (treat, control) = load_frag_files_options (options)
    else:
        (treat, control) = load_tag_files_options (options)
    if control is not None:
        check_names(treat, control, error)

    info("#1 %s size = %d", tag, options.tsize)
    tagsinfo = "# %s size is determined as %d bps\n" % (tag, options.tsize)

    t0 = treat.total
    tagsinfo += "# total %ss in treatment: %d\n" % (tag, t0)
    info("#1 total %ss in treatment: %d", tag, t0)
    if options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate %ss in single position based on binomial distribution...", tag)
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1 max_dup_tags based on binomial = %d" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum %ss...", tag)
            treatment_max_dup_tags = int(options.keepduplicates)
        if options.PE_MODE:
            info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
        else:
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
        treat.separate_dups(treatment_max_dup_tags) # changed 5-29
        t1 = treat.total
        info("#1 %ss after filtering in treatment: %d", tag, t1)
        tagsinfo += "# %ss after filtering in treatment: %d\n" % (tag, t1)
        if options.PE_MODE:
            tagsinfo += "# maximum duplicate fragments in treatment = %d\n" % (treatment_max_dup_tags)
        else:
            tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f", float(t0 - t1) / t0)
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = t0

    if control is not None:
        c0 = control.total
        tagsinfo += "# total %ss in control: %d\n" % (tag, c0)
        info("#1 total %ss in control: %d", tag, c0)
        if options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate %ss in single position based on binomial distribution...", tag)
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1 max_dup_tags based on binomial = %d" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum %ss...", tag)
                control_max_dup_tags = int(options.keepduplicates)
            # NOTE(review): control_max_dup_tags is computed and logged
            # above, but the filtering and the report below use
            # treatment_max_dup_tags -- confirm this is intended.
            if options.PE_MODE:
                info("#1 filter out redundant fragments by allowing at most %d identical fragment(s)", treatment_max_dup_tags)
            else:
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)", treatment_max_dup_tags)
            control.separate_dups(treatment_max_dup_tags) # changed 5-29
            c1 = control.total
            info("#1 %ss after filtering in control: %d", tag, c1)
            tagsinfo += "# %ss after filtering in control: %d\n" % (tag, c1)
            if options.PE_MODE:
                tagsinfo += "# maximum duplicate fragments in control = %d\n" % (treatment_max_dup_tags)
            else:
                tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (treatment_max_dup_tags)
            info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = c0
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    # Stays None when modelling is skipped or fails, so the xls header can
    # test for it explicitly instead of relying on a bare ``except``.
    peakmodel = None
    if options.nomodel:
        info("#2 Skipped...")
        if options.PE_MODE:
            options.shiftsize = 0
            options.d = options.tsize
        else:
            options.d=options.shiftsize*2
        info("#2 Use %d as fragment length" % (options.d))
        options.scanwindow=2*options.d # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.shiftsize*2
                    options.scanwindow=2*options.d
                    warn("#2 MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.shiftsize,options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --shiftsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.shiftsize*2
            options.scanwindow=2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                treat.sample_num(c1, options.seed)
                info("#3 %d Tags from treatment are kept", treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control %ss...", tag)
                if options.seed < 0:
                    warn("#3 Your results may not be reproducible due to the random sampling!")
                else:
                    info("#3 Random seed (%d) is used." % options.seed)
                control.sample_num(t1, options.seed)
                info("#3 %d %ss from control are kept", control.total, tag)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    with open(options.peakxls,"w") as ofhd_xls:
        ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
        ofhd_xls.write(options.argtxt+"\n")
        ofhd_xls.write(tagsinfo)
        ofhd_xls.write("# d = %d\n" % (options.d))
        if peakmodel is not None:
            # when --nomodel is used (or modelling failed), there is no
            # peakmodel object and this line is skipped.
            ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
        if options.nolambda:
            ofhd_xls.write("# local lambda is disabled!\n")
        # pass write method so we can print too, and include name
        peakdetect.peaks.write_to_xls(ofhd_xls, name = options.name)

    #4.2 peaks in BED
    # NOTE(review): score_column stays unbound if neither flag is set;
    # presumably opt_validate guarantees one of them -- confirm.
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"

    if not options.broad:
        #4.2 peaks in narrowPeak
        info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
        with open(options.peakNarrowPeak,"w") as ofhd_bed:
            peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )
        #4.2-2 summits in BED
        info("#4 Write summits bed file... %s" % (options.summitbed))
        with open(options.summitbed,"w") as ofhd_summits:
            peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                                  description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                                  score_column=score_column, trackline=options.trackline )
    else:
        #4.2 broad peaks in bed12 or gappedPeak
        info("#4 Write broad peak in broadPeak format file... %s" % (options.peakBroadPeak))
        with open(options.peakBroadPeak,"w") as ofhd_bed:
            peakdetect.peaks.write_to_broadPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)
        info("#4 Write broad peak in bed12/gappedPeak format file... %s" % (options.peakGappedPeak))
        with open(options.peakGappedPeak,"w") as ofhd_bed:
            peakdetect.peaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)

    info("Done!")
def run( args ):
    """The Main function/pipeline for MACS.

    The steps follow the numbering used in the log messages:

    #1 load treatment (and optional control) tags, verify that both share
       at least one chromosome name, and filter duplicates according to
       ``options.keepduplicates`` (skipped for the ``BAMPE`` format);
    #2 build the peak model, or use ``2 * options.shiftsize`` /
       ``options.tsize`` as fragment length when modelling is skipped or
       fails;
    #3 balance treatment against control (random sampling or linear
       scaling) and call peaks;
    #4 write the xls report, peak BED, narrowPeak, optional broad peak and
       summits BED files.

    Args:
        args: raw command-line options, validated by ``opt_validate``.

    Raises:
        SystemExit: with status 1 when treatment and control share no
            chromosome name, or when the model cannot be built and
            ``options.onauto`` is not set.
    """
    # Parse options...
    options = opt_validate( args )
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    #0 output arguments
    info("\n"+options.argtxt)

    #1 Read tag files
    info("#1 read tag files...")
    (treat, control) = load_tag_files_options (options)

    # check common chromosome names
    if control:
        tchrnames = set(treat.get_chr_names())
        cchrnames = set(control.get_chr_names())
        commonnames = tchrnames.intersection(cchrnames)
        if len(commonnames)==0:
            error("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
            error("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
            error("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
            # This is a failure: exit non-zero (a bare sys.exit() reports
            # success to the calling shell).
            sys.exit(1)

    info("#1 tag size = %d" % options.tsize)
    tagsinfo = "# tag size is determined as %d bps\n" % (options.tsize)

    t0 = treat.total
    tagsinfo += "# total tags in treatment: %d\n" % (t0)
    info("#1 total tags in treatment: %d" % (t0))

    if options.format == 'BAMPE':
        info("#1 BAMPE mode does not filter duplicates. Use samtools rmdup if needed ...")
        t1 = treat.total
    elif options.keepduplicates != "all":
        if options.keepduplicates == "auto":
            info("#1 calculate max duplicate tags in single position based on binomal distribution...")
            treatment_max_dup_tags = cal_max_dup_tags(options.gsize,t0)
            info("#1 max_dup_tags based on binomal = %d" % (treatment_max_dup_tags))
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))
        else:
            info("#1 user defined the maximum tags...")
            treatment_max_dup_tags = int(options.keepduplicates)
            info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (treatment_max_dup_tags))
        treat.filter_dup(treatment_max_dup_tags)
        t1 = treat.total
        info("#1 tags after filtering in treatment: %d" % (t1))
        tagsinfo += "# tags after filtering in treatment: %d\n" % (t1)
        tagsinfo += "# maximum duplicate tags at the same position in treatment = %d\n" % (treatment_max_dup_tags)
        info("#1 Redundant rate of treatment: %.2f" % (float(t0-t1)/t0))
        tagsinfo += "# Redundant rate in treatment: %.2f\n" % (float(t0-t1)/t0)
    else:
        t1 = treat.total

    if control:
        c0 = control.total
        tagsinfo += "# total tags in control: %d\n" % (c0)
        info("#1 total tags in control: %d" % (c0))
        if options.format == 'BAMPE':
            info("#1 BAMPE mode does not filter duplicates. Use samtools rmdup if needed ...")
            c1 = control.total
        elif options.keepduplicates != "all":
            if options.keepduplicates == "auto":
                info("#1 for control, calculate max duplicate tags in single position based on binomal distribution...")
                control_max_dup_tags = cal_max_dup_tags(options.gsize,c0)
                info("#1 max_dup_tags based on binomal = %d" % (control_max_dup_tags))
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags))
            else:
                info("#1 user defined the maximum tags...")
                control_max_dup_tags = int(options.keepduplicates)
                # Report the threshold that is actually applied to the
                # control below (the treatment variable was logged before;
                # both hold the same value in this branch).
                info("#1 filter out redundant tags at the same location and the same strand by allowing at most %d tag(s)" % (control_max_dup_tags))
            control.filter_dup(control_max_dup_tags)
            c1 = control.total
            info("#1 tags after filtering in control: %d" % (c1))
            tagsinfo += "# tags after filtering in control: %d\n" % (c1)
            tagsinfo += "# maximum duplicate tags at the same position in control = %d\n" % (control_max_dup_tags)
            info("#1 Redundant rate of control: %.2f" % (float(c0-c1)/c0))
            tagsinfo += "# Redundant rate in control: %.2f\n" % (float(c0-c1)/c0)
        else:
            c1 = control.total
    info("#1 finished!")

    #2 Build Model
    info("#2 Build Peak Model...")

    # Stays None when modelling is skipped or fails, so the xls header can
    # test for it explicitly instead of relying on a bare ``except``.
    peakmodel = None
    if options.nomodel:
        info("#2 Skipped...")
        if options.format == 'BAMPE':
            options.shiftsize = 0
            options.d = options.tsize
        else:
            options.d=options.shiftsize*2
        info("#2 Use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))
        options.scanwindow=2*options.d # remove the effect of --bw
    else:
        try:
            peakmodel = PeakModel(treatment = treat,
                                  max_pairnum = MAX_PAIRNUM,
                                  opt = options
                                  )
            info("#2 finished!")
            debug("#2 Summary Model:")
            debug("#2 min_tags: %d" % (peakmodel.min_tags))
            debug("#2 d: %d" % (peakmodel.d))
            debug("#2 scan_window: %d" % (peakmodel.scan_window))
            info("#2 predicted fragment length is %d bps" % peakmodel.d)
            info("#2 alternative fragment length(s) may be %s bps" % ','.join(map(str,peakmodel.alternative_d)))
            info("#2.2 Generate R script for model : %s" % (options.modelR))
            model2r_script(peakmodel,options.modelR,options.name)
            options.d = peakmodel.d
            options.scanwindow= 2*options.d
            if options.d <= 2*options.tsize:
                warn("#2 Since the d (%.0f) calculated from paired-peaks are smaller than 2*tag length, it may be influenced by unknown sequencing problem!" % (options.d))
                if options.onauto:
                    options.d=options.shiftsize*2
                    options.scanwindow=2*options.d
                    warn("#2 MACS will use %d as shiftsize, %d as fragment length. NOTE: if the d calculated is still acceptable, please do not use --fix-bimodal option!" % (options.shiftsize,options.d))
                else:
                    warn("#2 You may need to consider one of the other alternative d(s): %s" % ','.join(map(str,peakmodel.alternative_d)))
                    warn("#2 You can restart the process with --nomodel --shiftsize XXX with your choice or an arbitrary number. Nontheless, MACS will continute computing.")
        except NotEnoughPairsException:
            if not options.onauto:
                sys.exit(1)
            warn("#2 Skipped...")
            options.d=options.shiftsize*2
            options.scanwindow=2*options.d
            warn("#2 Since --fix-bimodal is set, MACS will use %d as shiftsize, %d as fragment length" % (options.shiftsize,options.d))

    #3 Call Peaks
    info("#3 Call peaks...")
    if options.nolambda:
        info("# local lambda is disabled!")

    # decide options.tocontrol according to options.tolarge
    if control:
        if options.downsample:
            # use random sampling to balance treatment and control
            info("#3 User prefers to use random sampling instead of linear scaling.")
            if t1 > c1:
                info("#3 MACS is random sampling treatment tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")
                treat.sample_num(c1)
                info("#3 %d tags from treatment are kept" % treat.total)
            elif c1 > t1:
                info("#3 MACS is random sampling control tags...")
                warn("#3 Your results may not be reproducible due to the random sampling!")
                control.sample_num(t1)
                info("#3 %d tags from control are kept" % control.total)
            # set options.tocontrol although it wouldn't matter now
            options.tocontrol = False
        else:
            if options.tolarge:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # true, we will scale control to treatment.
                    options.tocontrol = False
                else:
                    # treatment has less tags than control, since tolarge is
                    # true, we will scale treatment to control.
                    options.tocontrol = True
            else:
                if t1 > c1:
                    # treatment has more tags than control, since tolarge is
                    # false, we will scale treatment to control.
                    options.tocontrol = True
                else:
                    # treatment has less tags than control, since tolarge is
                    # false, we will scale control to treatment.
                    options.tocontrol = False

    peakdetect = PeakDetect(treat = treat,
                            control = control,
                            opt = options
                            )
    peakdetect.call_peaks()

    #4 output
    #4.1 peaks in XLS
    info("#4 Write output xls file... %s" % (options.peakxls))
    with open(options.peakxls,"w") as ofhd_xls:
        ofhd_xls.write("# This file is generated by MACS version %s\n" % (MACS_VERSION))
        ofhd_xls.write(options.argtxt+"\n")
        ofhd_xls.write(tagsinfo)
        ofhd_xls.write("# d = %d\n" % (options.d))
        if peakmodel is not None:
            # when --nomodel is used (or modelling failed), there is no
            # peakmodel object and this line is skipped.
            ofhd_xls.write("# alternative fragment length(s) may be %s bps\n" % ','.join(map(str,peakmodel.alternative_d)))
        if options.nolambda:
            ofhd_xls.write("# local lambda is disabled!\n")
        peakdetect.toxls(ofhd_xls, name = options.name)

    #4.2 peaks in BED
    # NOTE(review): score_column stays unbound if neither flag is set;
    # presumably opt_validate guarantees one of them -- confirm.
    if options.log_pvalue:
        score_column = "pscore"
    elif options.log_qvalue:
        score_column = "qscore"
    info("#4 Write peak bed file... %s" % (options.peakbed))
    with open(options.peakbed,"w") as ofhd_bed:
        peakdetect.peaks.write_to_bed (ofhd_bed, name_prefix="%s_peak_", name = options.name,
                                       description="Peaks for %s (Made with MACS v2, " + strftime("%x") + ")",
                                       score_column=score_column, trackline=options.trackline)

    #4.2 peaks in narrowPeak
    info("#4 Write peak in narrowPeak format file... %s" % (options.peakNarrowPeak))
    with open(options.peakNarrowPeak,"w") as ofhd_bed:
        peakdetect.peaks.write_to_narrowPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, score_column=score_column, trackline=options.trackline )

    #4.2 broad peaks in bed12
    if options.broad:
        info("#4 Write broad peak in bed12 format file... %s" % (options.peakBroadPeak))
        with open(options.peakBroadPeak,"w") as ofhd_bed:
            peakdetect.broadpeaks.write_to_gappedPeak (ofhd_bed, name_prefix="%s_peak_", name=options.name, description=options.name, trackline=options.trackline)

    #4.2-2 summits in BED
    info("#4 Write summits bed file... %s" % (options.summitbed))
    with open(options.summitbed,"w") as ofhd_summits:
        peakdetect.peaks.write_to_summit_bed (ofhd_summits, name_prefix="%s_peak_", name=options.name,
                                              description="Summits for %s (Made with MACS v2, " + strftime("%x") + ")",
                                              score_column=score_column, trackline=options.trackline )