def runM3D(infile, outfile, root, design):
    job_options = "-l mem_free=4G -pe dedicated 1"
    groups = [x for x in itertools.combinations(EXPERIMENTS, 2)]
    # **code repeated - refactor**
    for pair in groups:
        pair = [re.sub("-agg", "", str(x)) for x in pair]
        pair1, pair2 = pair
        pair1_split = pair1.split("-")
        pair2_split = pair2.split("-")
        # only want pairs with one difference,
        # e.g. treatment or tissue but not both
        if not (pair1_split[0] != pair2_split[0] and
                pair1_split[1] != pair2_split[1]):
            outfile = "%(root)s%(pair1)s_vs_%(pair2)s.tsv" % locals()
            if pair1_split[0] != pair2_split[0]:
                groups = [pair1_split[0], pair2_split[0]]
            elif pair1_split[1] != pair2_split[1]:
                groups = [pair1_split[1], pair2_split[1]]
            else:
                E.error("This pair does not contain any comparisons: "
                        "%(pair)s" % locals())
                # nothing to compare - skip this pair
                continue
            RRBS.calculateM3DStat(infile, outfile, design,
                                  pair=pair, groups=groups,
                                  submit=True, job_options=job_options)
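
# A minimal, self-contained sketch of the pair-selection rule used in
# runM3D() above: experiments named "<tissue>-<treatment>[-agg]" are
# compared only when they differ in exactly one of the two fields.
# The EXPERIMENTS values below are illustrative only, not taken from
# any real pipeline configuration.
def _example_m3d_pairs():
    import itertools
    import re

    EXPERIMENTS = ["liver-saline-agg", "liver-dex-agg", "brain-saline-agg"]

    selected = []
    for pair in itertools.combinations(EXPERIMENTS, 2):
        pair1, pair2 = [re.sub("-agg", "", x) for x in pair]
        f1, f2 = pair1.split("-"), pair2.split("-")
        # keep pairs that differ in tissue or treatment, but not both
        if not (f1[0] != f2[0] and f1[1] != f2[1]):
            selected.append((pair1, pair2))
    return selected

# _example_m3d_pairs() returns
#   [('liver-saline', 'liver-dex'), ('liver-saline', 'brain-saline')]
# while ('liver-dex', 'brain-saline') is skipped because both fields differ.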
def config_to_dictionary(config):
    """convert the contents of a :py:class:`ConfigParser.ConfigParser`
    object to a dictionary

    This method works by iterating over all configuration values in a
    :py:class:`ConfigParser.ConfigParser` object and inserting values
    into a dictionary. Section names are prefixed using an underscore.
    Thus::

        [sample]
        name=12

    is entered as ``sample_name=12`` into the dictionary. The sections
    ``general`` and ``DEFAULT`` are treated specially in that both the
    prefixed and the unprefixed values are inserted: ::

        [general]
        genome=hg19

    will be added as ``general_genome=hg19`` and ``genome=hg19``.

    Numbers will be automatically recognized as such and converted into
    integers or floats.

    Returns
    -------
    config : dict
        A dictionary of configuration values

    """
    p = defaultdict(lambda: defaultdict(TriggeredDefaultFactory()))
    for section in config.sections():
        for key, value in config.items(section):
            try:
                v = IOTools.str2val(value)
            except TypeError:
                E.error("error converting key %s, value %s" % (key, value))
                E.error("Possible multiple concurrent attempts to "
                        "read configuration")
                raise

            p["%s_%s" % (section, key)] = v

            # IMS: new hierarchical format
            try:
                p[section][key] = v
            except TypeError:
                # fails with things like genome_dir=abc
                # if [genome] does not exist.
                continue

            if section in ("general", "DEFAULT"):
                p["%s" % (key)] = v

    for key, value in config.defaults().items():
        p["%s" % (key)] = IOTools.str2val(value)

    return p
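
# Usage sketch for config_to_dictionary(), assuming Python's standard
# configparser module; the sections, keys and values are illustrative only.
def _example_config_to_dictionary():
    import configparser

    config = configparser.ConfigParser()
    config.read_string("[general]\ngenome=hg19\n\n[sample]\nname=12\n")

    params = config_to_dictionary(config)
    # Per the docstring above:
    #   params["sample_name"]    -> 12      (prefixed key, "12" converted to int)
    #   params["general_genome"] -> "hg19"  (prefixed key)
    #   params["genome"]         -> "hg19"  (unprefixed key, [general] only)
    # and, via the hierarchical format noted in the code:
    #   params["sample"]["name"] -> 12
    return params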
def main(argv=None):

    parser = getOptionParser()

    (options, args) = E.Start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:
        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd, shell=True,
                                  stdin=sys.stdin, stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.Stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1,
                    options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex),
                    0,
                    options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex),
                    0,
                    options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.Stop()
            sys.exit(0)

        if options.method == "multiprocessing":
            pool = Pool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)
        elif options.method == "drmaa":
            results = []
            runDRMAA(data, environment=options.environment)
        elif options.method == "threads":
            pool = ThreadPool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)

        niterations = 0
        for retcode, filename, cmd, logfile, iterations in results:
            niterations += iterations
            if not hasFinished(retcode, filename, options.output_tag, logfile):
                failed_requests.append((filename, cmd))

    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests),
                                                tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:
            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex
                          )(started_requests, options.stdout, options)

        # deal with logfiles : combine them into a single file
        rr = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.openFile(rr.groups()[0], "a")
            ResultBuilderLog()(
                [(x[0], "%s.log" % x[0]) for x in started_requests],
                logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".psl", ):
                    builder = ResultBuilderPSL(mapper=mapper)
                elif filetype in (".gtf", ".gff"):
                    builder = ResultBuilderGFF(
                        mapper=mapper, field_index=index, field_name=name)
                elif filetype in (".png", ):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(
                        mapper=mapper, field_index=index, field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.openFile(
                    options.output_pattern % filename, "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests),
            len(started_requests) - len(failed_requests),
            len(failed_requests),
            niterations))

    E.Stop()
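
# For reference, the temporary-directory layout assumed by the collection
# logic above (chunk names are illustrative; the actual names come from
# the chunk_iterator_* functions):
#
#   <tmpdir>/<chunk>        input chunk handed to one invocation of cmd
#   <tmpdir>/<chunk>.out    stdout of that invocation, merged into options.stdout
#   <tmpdir>/<chunk>.log    per-chunk log, merged by ResultBuilderLog
#   <tmpdir>/<chunk>.dir/   extra output files when options.subdirs is set,
#                           merged per basename by the builders chosen above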
def main(options, args, pipeline=None):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    options: object
        Container for command line arguments.
    args : list
        List of command line arguments.
    pipeline: object
        Pipeline to run. If not given, all ruffus pipelines are run.

    """
    global GLOBAL_OPTIONS
    global GLOBAL_ARGS
    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args

    logger = logging.getLogger("daisy.pipeline")

    logger.info("started in workingdir: {}".format(PARAMS.get("workingdir")))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.
    update_params_with_commandline_options(PARAMS, options)

    version = get_version()

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    if options.force_run:
        if options.force_run == "all":
            forcedtorun_tasks = pipeline_get_task_names()
        else:
            forcedtorun_tasks = options.pipeline_targets
    else:
        forcedtorun_tasks = []

    # create local scratch if it does not already exist. Note that the
    # directory itself will not be deleted, while its contents should
    # be cleaned up.
    if not os.path.exists(PARAMS["tmpdir"]):
        logger.warn(
            "local temporary directory {} did not exist - created".format(
                PARAMS["tmpdir"]))
        try:
            os.makedirs(PARAMS["tmpdir"])
        except OSError:
            # file exists
            pass

    logger.debug("temporary directory is {}".format(PARAMS["tmpdir"]))

    # set multiprocess to a sensible setting if there is no cluster
    run_on_cluster = HAS_DRMAA is True and not options.without_cluster
    if options.multiprocess is None:
        if not run_on_cluster:
            options.multiprocess = int(
                math.ceil(multiprocessing.cpu_count() / 2.0))
        else:
            options.multiprocess = 40

    # see inputValidation function in Parameters.py
    if options.input_validation:
        input_validation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            logger.info("\t".join(map(str, requirement)))
        logger.info("version check summary: %s" % str(counter))
        E.stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        start_session()

        method_name = options.pipeline_targets[0]
        caller = get_caller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "state", "svg", "plot",
                                     "dot", "touch", "regenerate"):

        messenger = None
        try:
            with cache_os_functions():
                if options.pipeline_action == "make":

                    # get tasks to be done. This essentially replicates
                    # the state information within ruffus.
                    stream = StringIO()
                    ruffus.pipeline_printout(
                        stream,
                        options.pipeline_targets,
                        verbose=5,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                    messenger = LoggingFilterProgress(stream.getvalue())
                    logger.addFilter(messenger)

                    global task
                    if options.without_cluster:
                        # use ThreadPool to avoid taking multiple CPU for
                        # the pipeline controller.
                        ruffus.task.Pool = ThreadPool
                    else:
                        # use cooperative multitasking instead of
                        # multiprocessing.
                        ruffus.task.Pool = EventPool
                        ruffus.task.queue = gevent.queue

                    # create the session proxy
                    start_session()

                    logger.info("code location: {}".format(
                        PARAMS["scriptsdir"]))
                    logger.info("code version: {}".format(version))
                    logger.info("working directory is: {}".format(
                        PARAMS["workingdir"]))

                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        multiprocess=options.multiprocess,
                        logger=logger,
                        verbose=options.loglevel,
                        log_exceptions=options.log_exceptions,
                        exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                        checksum_level=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        one_second_per_job=False,
                    )

                    close_session()

                elif options.pipeline_action == "show":
                    ruffus.pipeline_printout(
                        options.stdout,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "touch":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=True,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "regenerate":
                    ruffus.pipeline_run(
                        options.pipeline_targets,
                        touch_files_only=options.ruffus_checksums_level,
                        pipeline=pipeline,
                        verbose=options.loglevel)

                elif options.pipeline_action == "svg":
                    ruffus.pipeline_printout_graph(
                        options.stdout.buffer,
                        options.pipeline_format,
                        options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "state":
                    ruffus.ruffus_return_dag(
                        options.stdout,
                        target_tasks=options.pipeline_targets,
                        forcedtorun_tasks=forcedtorun_tasks,
                        verbose=options.loglevel,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)

                elif options.pipeline_action == "plot":
                    outf, filename = tempfile.mkstemp()
                    ruffus.pipeline_printout_graph(
                        os.fdopen(outf, "wb"),
                        options.pipeline_format,
                        options.pipeline_targets,
                        pipeline=pipeline,
                        checksum_level=options.ruffus_checksums_level)
                    execute("inkscape %s" % filename)
                    os.unlink(filename)

        except ruffus.ruffus_exceptions.RethrownJobError as ex:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(ex.args))
                for idx, e in enumerate(ex.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.pipeline_logfile)

                logger.error("start of all error messages")
                logger.error(ex)
                logger.error("end of all error messages")

                raise ValueError("pipeline failed with %i errors" %
                                 len(ex.args)) from ex
            else:
                raise

    elif options.pipeline_action == "dump":
        options.stdout.write((json.dumps(PARAMS)) + "\n")

    elif options.pipeline_action == "printconfig":
        E.info("printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        print_config_files()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        write_config_files(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clone_pipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.stop(logger=get_logger())
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """
    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommitted changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set parameter values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums "
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run "
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser, add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = \
            options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = \
            options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                # create the session proxy
                startSession()

                # make sure we are not logging at the same time in
                # different processes
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the
                # stdout handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError("pipeline failed with %i errors" %
                                 len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
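
# A minimal pipeline-script sketch following the docstring usage above.
# The ruffus task, file names and the P.run() call are hypothetical
# illustrations of the usual CGAT conventions; only the final
# P.main(sys.argv) call is taken from the documented usage.
#
#   # pipeline_example.py
#   import sys
#   from ruffus import transform, suffix
#   import CGAT.Pipeline as P
#
#   @transform("*.input.tsv", suffix(".input.tsv"), ".counts.tsv")
#   def countRows(infile, outfile):
#       statement = "wc -l < %(infile)s > %(outfile)s"
#       P.run()
#
#   if __name__ == "__main__":
#       sys.exit(P.main(sys.argv))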