def configToDictionary(config):
    """convert the contents of a :py:class:`ConfigParser.ConfigParser`
    object to a dictionary

    This method works by iterating over all configuration values in a
    :py:class:`ConfigParser.ConfigParser` object and inserting values
    into a dictionary. Section names are prefixed using an underscore.
    Thus::

        [sample]
        name=12

    is entered as ``sample_name=12`` into the dictionary. The sections
    ``general`` and ``DEFAULT`` are treated specially in that both the
    prefixed and the unprefixed values are inserted::

        [general]
        genome=hg19

    will be added as ``general_genome=hg19`` and ``genome=hg19``.

    Numbers will be automatically recognized as such and converted into
    integers or floats.

    Returns
    -------
    config : dict
        A dictionary of configuration values

    """
    p = {}
    for section in config.sections():
        for key, value in config.items(section):
            try:
                v = IOTools.str2val(value)
            except TypeError:
                E.error("error converting key %s, value %s" % (key, value))
                E.error("Possible multiple concurrent attempts to "
                        "read configuration")
                raise
            p["%s_%s" % (section, key)] = v
            if section in ("general", "DEFAULT"):
                p["%s" % (key)] = v

    for key, value in config.defaults().items():
        p["%s" % (key)] = IOTools.str2val(value)

    return p
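
# Illustrative sketch (not part of the original module): how configToDictionary
# might be driven from a configuration file. The file name "pipeline.ini" and
# the use of Python 3's configparser module are assumptions for this example.
def _example_config_to_dictionary():
    import configparser
    config = configparser.ConfigParser()
    config.read("pipeline.ini")  # hypothetical configuration file
    params = configToDictionary(config)
    # given "[general]\ngenome=hg19", both keys below would hold the same value
    return params.get("general_genome"), params.get("genome")
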
def conv(v):
    return IOTools.str2val(re.sub(",", "", v.strip()))
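
# conv() strips commas (e.g. thousands separators) before handing the string to
# IOTools.str2val, so a value like "12,345" would be converted to the number
# 12345 rather than failing to parse.
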
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommitted changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set parameter values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums "
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run "
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser,
                              add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))

    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = \
            options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = \
            options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                elif not options.without_cluster and not HAS_DRMAA:
                    E.critical(
                        "DRMAA API not found so cannot talk to a cluster.")
                    E.critical("Please use --local to run the pipeline"
                               " on this host: {}".format(os.uname()[1]))
                    sys.exit(-1)

                #
                # make sure we are not logging at the same time in
                # different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # this seems to be errors originating within ruffus
                        # such as a missing dependency
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                        pass
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub(r"\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError(
                    "pipeline failed with %i errors" % len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))
elif options.pipeline_action == "printconfig": print("Printing out pipeline parameters: ") for k in sorted(PARAMS): print(k, "=", PARAMS[k]) printConfigFiles() elif options.pipeline_action == "config": f = sys._getframe(1) caller = f.f_globals["__file__"] pipeline_path = os.path.splitext(caller)[0] general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") writeConfigFiles(pipeline_path, general_path) elif options.pipeline_action == "clone": clonePipeline(options.pipeline_targets[0]) else: raise ValueError("unknown pipeline action %s" % options.pipeline_action) E.Stop()