def auto_config(extension, inputs, job_post_process_config_file, config_file, verbose):
    """Automatically create a configuration."""
    level = logging.DEBUG if verbose else logging.WARNING
    setup_logging("auto_config", None, console_level=level)

    if job_post_process_config_file is not None:
        module, class_name, data = JobPostProcess.load_config_from_file(
            job_post_process_config_file)
        JobPostProcess(module, class_name, data)  # ensure everything ok
        job_post_process_config = {
            "module": module,
            "class": class_name,
            "data": data,
        }
    else:
        job_post_process_config = None

    # User extension
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
    config = cli.auto_config(*inputs, job_post_process_config=job_post_process_config)
    print(f"Created configuration with {config.get_num_jobs()} jobs.")
    config.dump(config_file)
    print(f"Dumped configuration to {config_file}.\n")
def test_registry__add_logger(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    package = "test-package"
    registry.add_logger(package)
    assert package in registry.list_loggers()

    registry.remove_logger(package)
    assert package not in registry.list_loggers()
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    extension = data["extension"]
    cls = registry.get_extension_class(extension, ExtensionClassType.CONFIGURATION)
    return cls.deserialize(data, **kwargs)
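# Illustrative usage sketch (not part of the library): rebuild a configuration
# from a previously dumped config file via the registry-based factory above.
# The file name "config.json" is hypothetical; load_data is the same helper
# used elsewhere in these modules.
def example_load_config(path="config.json"):
    data = load_data(path)
    config = deserialize_config(data)
    print(f"Loaded {config.get_num_jobs()} jobs from {path}")
    return config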
def test_registry__reset_defaults(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    clear_extensions(registry)
    registry.reset_defaults()
    assert len(registry.list_extensions()) == len(
        DEFAULT_REGISTRY["extensions"])
    assert registry.list_loggers() == DEFAULT_REGISTRY["logging"]


def test_registry__show_extensions(capsys, registry_fixture):
    """Test functionality of show_extensions."""
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    registry.show_extensions()
    captured = capsys.readouterr()
    for extension in DEFAULT_REGISTRY["extensions"]:
        assert extension["name"] in captured.out
def deserialize_config(data, **kwargs):
    """Create instance of a JobConfiguration from a dict.

    Parameters
    ----------
    data : dict
        Dictionary loaded from a serialized config file.

    Returns
    -------
    JobConfiguration

    """
    registry = Registry()
    config_module = data["configuration_module"]
    config_class = data["configuration_class"]
    for ext in registry.iter_extensions():
        ext_cfg_class = ext[ExtensionClassType.CONFIGURATION]
        if ext_cfg_class.__module__ == config_module and ext_cfg_class.__name__ == config_class:
            return ext_cfg_class.deserialize(data, **kwargs)

    raise InvalidParameter(
        f"Cannot deserialize {config_module}.{config_class}")
def remove_logger(package_name):
    """Remove logging for a package."""
    registry = Registry()
    registry.remove_logger(package_name)


def add_logger(package_name):
    """Add logging for a package."""
    registry = Registry()
    registry.add_logger(package_name)


def unregister(extension):
    """Unregister an extension."""
    registry = Registry()
    registry.unregister_extension(extension)


def reset_defaults():
    """Reset registry to its default values."""
    Registry().reset_defaults()


def register(extension_file):
    """Register one or more extensions."""
    registry = Registry()
    for extension in load_data(extension_file):
        registry.register_extension(extension)
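# Illustrative sketch (assumption, not library code): the CLI wrappers above
# are thin shims over Registry, so the same operations can be scripted
# directly. The package name below is hypothetical.
def example_manage_loggers():
    registry = Registry()
    registry.add_logger("my_extension_pkg")
    assert "my_extension_pkg" in registry.list_loggers()
    registry.remove_logger("my_extension_pkg")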
def setup_logging(name, filename, console_level=logging.INFO,
                  file_level=logging.INFO, packages=None):
    """Configures logging to file and console.

    Parameters
    ----------
    name : str
        logger name
    filename : str | None
        log filename
    console_level : int, optional
        console log level
    file_level : int, optional
        file log level
    packages : list, optional
        enable logging for these package names

    """
    log_config = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": {
            "basic": {
                "format": "%(message)s"
            },
            "short": {
                "format": "%(asctime)s - %(levelname)s [%(name)s "
                          "%(filename)s:%(lineno)d] : %(message)s",
            },
            "detailed": {
                "format": "%(asctime)s - %(levelname)s [%(name)s "
                          "%(filename)s:%(lineno)d] : %(message)s",
            },
        },
        "handlers": {
            "console": {
                "level": console_level,
                "formatter": "short",
                "class": "logging.StreamHandler",
            },
            "file": {
                "class": "logging.FileHandler",
                "level": file_level,
                "filename": filename,
                "mode": "w",
                "formatter": "detailed",
            },
            "structured_file": {
                "class": "logging.FileHandler",
                "level": file_level,
                "filename": filename,
                "mode": "a",
                "formatter": "basic"
            }
        },
        "loggers": {
            name: {
                "handlers": ["console", "file"],
                "level": "DEBUG",
                "propagate": False
            },
            "event": {
                "handlers": ["console", "structured_file"],
                "level": "DEBUG",
                "propagate": False
            }
        },
        #"root": {
        #    "handlers": ["console", "file"],
        #    "level": "WARN",
        #},
    }

    logging_packages = set(Registry().list_loggers())
    if packages is not None:
        for package in packages:
            logging_packages.add(package)

    for package in logging_packages:
        log_config["loggers"][package] = {
            "handlers": ["console", "file"],
            "level": "DEBUG",
            "propagate": False,
        }

    if filename is None:
        log_config["handlers"].pop("file")
        log_config["loggers"][name]["handlers"].remove("file")
        for package in logging_packages:
            log_config["loggers"][package]["handlers"].remove("file")

    # For event logging
    if name == "event":
        log_config["handlers"].pop("file")
        for package in logging_packages:
            log_config["loggers"].pop(package)
    else:
        log_config["handlers"].pop("structured_file")
        log_config["loggers"]["event"]["handlers"].remove("structured_file")

    logging.config.dictConfig(log_config)
    logger = logging.getLogger(name)

    return logger
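# Illustrative usage sketch (assumption, not library code): configure console
# plus file logging for a run and enable DEBUG output for one extra package on
# top of the registry defaults. The logger and package names are hypothetical.
def example_setup_logging(output_dir="output"):
    log_file = os.path.join(output_dir, "run.log")
    logger = setup_logging(
        "my_tool",
        log_file,
        console_level=logging.INFO,
        file_level=logging.DEBUG,
        packages=["my_extension_pkg"],
    )
    logger.info("logging configured")
    return logger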
def test_registry__is_registered(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    assert registry.is_registered(DEFAULT_REGISTRY["extensions"][0]["name"])


def test_registry__list_extensions(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    registry.reset_defaults()
    assert len(registry.list_extensions()) == len(
        DEFAULT_REGISTRY["extensions"])
def run(extension, **kwargs):
    """Run an individual job."""
    registry = Registry()
    if not registry.is_registered(extension):
        raise InvalidExtension(f"Extension '{extension}' is not registered.")

    # Parse arguments
    config_file = kwargs["config_file"]
    name = kwargs["name"]
    output = kwargs["output"]
    output_format = kwargs["output_format"]
    verbose = kwargs["verbose"]
    level = logging.DEBUG if verbose else logging.INFO

    # Create directory for current job
    job_dir = os.path.join(output, name)
    os.makedirs(job_dir, exist_ok=True)

    # Structured (event) logging setup
    event_file = os.path.join(job_dir, "events.log")
    setup_event_logging(event_file)

    # General logging setup
    log_file = os.path.join(job_dir, "run.log")
    general_logger = setup_logging(
        extension,
        log_file,
        console_level=logging.ERROR,
        file_level=level,
    )
    general_logger.info(get_cli_string())

    # Create config for run
    try:
        cli = registry.get_extension_class(extension, ExtensionClassType.CLI)
        ret = cli.run(config_file, name, output, output_format, verbose)
    except Exception as err:
        msg = f"unexpected exception in run '{extension}' job={name} - {err}"
        general_logger.exception(msg)
        event = StructuredErrorLogEvent(
            source=name,
            category=EVENT_CATEGORY_ERROR,
            name=EVENT_NAME_UNHANDLED_ERROR,
            message=msg,
        )
        log_event(event)
        ret = 1

    if ret == 0:
        try:
            config = load_data(config_file)
            if "job_post_process_config" in config.keys():
                post_process = JobPostProcess(
                    module_name=config["job_post_process_config"]["module"],
                    class_name=config["job_post_process_config"]["class"],
                    data=config["job_post_process_config"]["data"],
                    job_name=name,
                    output=output,
                )
                post_process.run(config_file=config_file, output=output)
        except Exception as err:
            msg = f"unexpected exception in post-process '{extension}' job={name} - {err}"
            general_logger.exception(msg)
            event = StructuredErrorLogEvent(
                source=name,
                category=EVENT_CATEGORY_ERROR,
                name=EVENT_NAME_UNHANDLED_ERROR,
                message=msg,
            )
            log_event(event)
            ret = 1

    sys.exit(ret)
class JobConfiguration(abc.ABC):
    """Base class for any simulation configuration."""

    FILENAME_DELIMITER = "_"

    def __init__(self, inputs, container, job_parameters_class, extension_name,
                 job_global_config=None, job_post_process_config=None,
                 batch_post_process_config=None, **kwargs):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        inputs : JobInputsInterface
        container : JobContainerInterface

        """
        self._extension_name = extension_name
        self._inputs = inputs
        self._jobs = container
        self._job_parameters_class = job_parameters_class
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._batch_post_process_config = batch_post_process_config

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)

    def __repr__(self):
        """Concisely display all instance information."""
        return self.dumps()

    def _deserialize_jobs(self, jobs):
        for job_ in jobs:
            job = self._job_parameters_class.deserialize(job_)
            self.add_job(job)

    def _deserialize_jobs_from_names(self, job_names):
        for name in job_names:
            job = self._get_job_by_name(name)
            self.add_job(job)

    def _dump(self, stream=sys.stdout, fmt=".json", indent=2):
        # Note: the default is JSON here because parsing 100 MB .toml files
        # is an order of magnitude slower.
        data = self.serialize()
        if fmt == ".json":
            json.dump(data, stream, indent=indent, cls=ExtendedJSONEncoder)
        elif fmt == ".toml":
            toml.dump(data, stream)
        else:
            assert False, fmt

    def _get_job_by_name(self, name):
        assert self._jobs_directory is not None
        filename = os.path.join(self._jobs_directory, name) + ".json"
        assert os.path.exists(filename), filename
        return self._job_parameters_class.deserialize(load_data(filename))

    @abc.abstractmethod
    def _serialize(self, data):
        """Create implementation-specific data for serialization."""

    def check_job_dependencies(self):
        """Check for impossible conditions with job dependencies.

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.
        job_names = set()
        blocking_jobs = set()
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")

    @abc.abstractmethod
    def create_from_result(self, job, output_dir):
        """Create an instance from a result file.

        Parameters
        ----------
        job : JobParametersInterface
        output_dir : str

        Returns
        -------
        class

        """

    @property
    def extension_name(self):
        """Return the extension name for the configuration."""
        return self._extension_name

    @abc.abstractmethod
    def get_job_inputs(self):
        """Return the inputs required to run a job."""

    def add_job(self, job):
        """Add a job to the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        self._jobs.add_job(job)

    def clear(self):
        """Clear all configured jobs."""
        self._jobs.clear()

    @timed_debug
    def dump(self, filename=None, stream=sys.stdout, indent=2):
        """Convert the configuration to structured text format.

        Parameters
        ----------
        filename : str | None
            Write configuration to this file (must be .json or .toml).
            If None, write the text to stream.
            Recommend using .json for large files. .toml is much slower.
        stream : file
            File-like interface that supports write().
        indent : int
            If JSON, use this indentation.

        Raises
        ------
        InvalidParameter
            Raised if filename does not have a supported extension.

        """
        if filename is None and stream is None:
            raise InvalidParameter("must set either filename or stream")

        if filename is not None:
            ext = os.path.splitext(filename)[1]
            if ext not in (".json", ".toml"):
                raise InvalidParameter("Only .json and .toml are supported")

            with open(filename, "w") as f_out:
                self._dump(f_out, fmt=ext, indent=indent)
        else:
            self._dump(stream, indent=indent)

        logger.info("Dumped configuration to %s", filename)

    def dumps(self, fmt_module=toml, **kwargs):
        """Dump the configuration to a formatted string."""
        return fmt_module.dumps(self.serialize(), **kwargs)

    @classmethod
    def deserialize(cls, filename_or_data, do_not_deserialize_jobs=False):
        """Create a class instance from a saved configuration file.

        Parameters
        ----------
        filename_or_data : str | dict
            path to configuration file or that file loaded as a dict
        do_not_deserialize_jobs : bool
            Set to True to avoid the overhead of loading all jobs from disk.
            Job names will be stored instead of jobs.

        Returns
        -------
        class

        Raises
        ------
        InvalidParameter
            Raised if the config file has invalid parameters.

        """
        if isinstance(filename_or_data, str):
            data = load_data(filename_or_data)
        else:
            data = filename_or_data

        # Don't create an inputs object. It can be very expensive and we don't
        # need it unless the user wants to change the config.
        # TODO: implement user-friendly error messages when they try to access
        # inputs.
        inputs = None
        data["do_not_deserialize_jobs"] = do_not_deserialize_jobs
        return cls(inputs, **data)

    def get_job(self, name):
        """Return the job matching name.

        Returns
        -------
        namedtuple

        """
        if self.get_num_jobs() == 0 and self._job_names is not None:
            # We loaded from a config file with names only.
            return self._get_job_by_name(name)
        return self._jobs.get_job(name)

    def get_parameters_class(self):
        """Return the class used for job parameters."""
        return self._job_parameters_class

    def get_num_jobs(self):
        """Return the number of jobs in the configuration.

        Returns
        -------
        int

        """
        return self._jobs.get_num_jobs()

    @property
    def job_global_config(self):
        """Return the global configs applied to all jobs."""
        return self._job_global_config

    @property
    def job_post_process_config(self):
        """Return the post-process config for jobs."""
        return self._job_post_process_config

    @property
    def batch_post_process_config(self):
        """Return the batch post-process config for the task."""
        return self._batch_post_process_config

    @batch_post_process_config.setter
    def batch_post_process_config(self, data):
        self._batch_post_process_config = data

    @property
    def inputs(self):
        """Return the instance of JobInputsInterface for the job."""
        return self._inputs

    def iter_jobs(self):
        """Yields a generator over all jobs.

        Yields
        ------
        iterator over JobParametersInterface

        """
        return self._jobs.iter_jobs()

    @timed_debug
    def list_jobs(self):
        """Return a list of all jobs.

        Returns
        -------
        list
            list of JobParametersInterface

        """
        return list(self.iter_jobs())

    @timed_debug
    def reconfigure_jobs(self, jobs):
        """Reconfigure with a list of jobs.

        Parameters
        ----------
        jobs : list
            list of DistributionConfiguration.parameter_type

        """
        self.clear()
        for job in jobs:
            self.add_job(job)

        logger.info("Reconfigured jobs.")

    def remove_job(self, job):
        """Remove a job from the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        return self._jobs.remove_job(job)

    def run_job(self, job, output, **kwargs):
        """Run the job.

        Parameters
        ----------
        job : JobParametersInterface
        output : str
            output directory

        Returns
        -------
        int

        """
        logger.debug("job=%s kwargs=%s", job, kwargs)
        cls = self.job_execution_class()
        job_execution = cls.create(self.get_job_inputs(), job, output)
        return job_execution.run(**kwargs)

    def serialize(self, include=ConfigSerializeOptions.JOBS):
        """Create data for serialization."""
        data = {
            "class": self.__class__.__name__,
            "extension": self.extension_name,
            "jobs_directory": self._jobs_directory,
        }

        if self._job_global_config:
            data["job_global_config"] = self._job_global_config

        if self._job_post_process_config:
            data["job_post_process_config"] = self._job_post_process_config

        if self._batch_post_process_config:
            data["batch_post_process_config"] = self._batch_post_process_config

        if include == ConfigSerializeOptions.JOBS:
            data["jobs"] = [x.serialize() for x in self.iter_jobs()]
        elif include == ConfigSerializeOptions.JOB_NAMES:
            data["job_names"] = [x.name for x in self.iter_jobs()]

        # Fill in instance-specific information.
        self._serialize(data)
        return data

    def serialize_jobs(self, directory):
        """Serializes main job data to job-specific files.

        Parameters
        ----------
        directory : str

        """
        for job in self.iter_jobs():
            basename = job.name + ".json"
            job_filename = os.path.join(directory, basename)
            dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)

        # We will need this to deserialize from a filename that includes only
        # job names.
        self._jobs_directory = directory

    def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
        """Serialize config data for efficient execution.

        Parameters
        ----------
        scratch_dir : str
            Temporary storage space on the local system.
        are_inputs_local : bool
            Whether the existing input data is local to this system. For many
            configurations, accessing the input data across the network by many
            concurrent workers can cause a bottleneck, and so implementations
            may wish to copy the data locally before execution starts. If the
            storage access time is very fast the question is irrelevant.

        Returns
        -------
        str
            Name of serialized config file in scratch directory.

        """
        self._transform_for_local_execution(scratch_dir, are_inputs_local)

        # Split up the jobs to individual files so that each worker can just
        # read its own info.
        self.serialize_jobs(scratch_dir)
        data = self.serialize(ConfigSerializeOptions.JOB_NAMES)
        config_file = os.path.join(scratch_dir, CONFIG_FILE)
        dump_data(data, config_file)
        logger.info("Dumped config file locally to %s", config_file)

        return config_file

    def _transform_for_local_execution(self, scratch_dir, are_inputs_local):
        """Transform data for efficient execution in a local environment.

        Default implementation is a no-op. Derived classes can override.

        """

    def show_jobs(self):
        """Show the configured jobs."""
        for job in self.iter_jobs():
            print(job)

    def job_execution_class(self):
        """Return the class used for job execution.

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(self.extension_name,
                                                  ExtensionClassType.EXECUTION)
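# Illustrative sketch (assumption, not library code): dumping a configuration
# built from any concrete JobConfiguration subclass. Writing a .json file is
# the fast path for large configs; a stream dump is handy for quick
# inspection. The file name is hypothetical.
def example_dump(config):
    config.dump("config.json")      # serialize jobs and metadata to a file
    config.dump(stream=sys.stdout)  # or print the JSON to the console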
def show():
    """Show the available extensions (job types)."""
    print("Extensions:")
    Registry().show_extensions()

    print("Logging enabled for packages: ", end="")
    Registry().show_loggers()
def submit_jobs(self, name="job", per_node_batch_size=DEFAULTS["per_node_batch_size"], max_nodes=DEFAULTS["max_nodes"], force_local=False, verbose=False, poll_interval=DEFAULTS["poll_interval"], num_processes=None, previous_results=None, reports=True, try_add_blocked_jobs=False): """Submit simulations. Auto-detect whether the current system is an HPC and submit to its queue. Otherwise, run locally. Parameters ---------- name : str batch name, applies to HPC job submission only per_node_batch_size : int Number of jobs to run on one node in one batch. max_nodes : int Max number of node submission requests to make in parallel. force_local : bool If on HPC, run jobs through subprocess as if local. wait : bool Don't return until HPC jobs have finished. verbose : bool Enable debug logging. poll_interval : int Inteval in seconds on which to poll jobs. num_processes : int Number of processes to run in parallel; defaults to num CPUs Returns ------- Status """ logger.info("Submit %s jobs for execution.", self._config.get_num_jobs()) logger.info("JADE version %s", jade.version.__version__) registry = Registry() loggers = registry.list_loggers() logger.info("Registered modules for logging: %s", ", ".join(loggers)) self._save_repository_info(registry) self._config.check_job_dependencies() self._hpc = HpcManager(self._hpc_config_file, self._output) result = Status.GOOD # If an events summary file exists, it is invalid. events_file = os.path.join(self._output, EVENTS_FILENAME) if os.path.exists(events_file): os.remove(events_file) start_time = time.time() if self._hpc.hpc_type == HpcType.LOCAL or force_local: runner = JobRunner(self._config_file, output=self._output) result = runner.run_jobs(verbose=verbose, num_processes=num_processes) else: self._submit_to_hpc(name, max_nodes, per_node_batch_size, verbose, poll_interval, num_processes, try_add_blocked_jobs) results_summary = ResultsAggregatorSummary(self._results_dir) self._results = results_summary.get_results() if len(self._results) != self._config.get_num_jobs(): logger.error( "Number of results doesn't match number of jobs: " "results=%s jobs=%s. Check for process crashes " "or HPC timeouts.", len(self._results), self._config.get_num_jobs()) result = Status.ERROR if previous_results: self._results += previous_results self.write_results(RESULTS_FILE) results_summary.delete_files() shutil.rmtree(self._results_dir) self._log_error_log_messages(self._output) bytes_consumed = get_directory_size_bytes(self._output, recursive=False) event = StructuredLogEvent( source="submitter", category=EVENT_CATEGORY_RESOURCE_UTIL, name=EVENT_NAME_BYTES_CONSUMED, message="main output directory size", bytes_consumed=bytes_consumed, ) log_event(event) event = StructuredLogEvent( source="submitter", category=EVENT_CATEGORY_RESOURCE_UTIL, name=EVENT_NAME_CONFIG_EXEC_SUMMARY, message="config execution summary", config_execution_time=time.time() - start_time, num_jobs=self.get_num_jobs(), ) log_event(event) if reports: self.generate_reports(self._output) return result
class JobConfiguration(abc.ABC):
    """Base class for any simulation configuration."""

    FILENAME_DELIMITER = "_"
    FORMAT_VERSION = "v0.2.0"

    def __init__(
        self,
        container=None,
        job_global_config=None,
        job_post_process_config=None,
        user_data=None,
        submission_groups=None,
        setup_command=None,
        teardown_command=None,
        node_setup_command=None,
        node_teardown_command=None,
        **kwargs,
    ):
        """
        Constructs JobConfiguration.

        Parameters
        ----------
        container : JobContainerInterface

        """
        self._jobs = container or JobContainerByName()
        self._job_names = None
        self._jobs_directory = kwargs.get("jobs_directory")
        self._registry = Registry()
        self._job_global_config = job_global_config
        self._job_post_process_config = job_post_process_config
        self._user_data = user_data or {}
        self._submission_groups = [
            SubmissionGroup(**x) for x in submission_groups or []
        ]
        self._setup_command = setup_command
        self._teardown_command = teardown_command
        self._node_setup_command = node_setup_command
        self._node_teardown_command = node_teardown_command

        if kwargs.get("do_not_deserialize_jobs", False):
            assert "job_names" in kwargs, str(kwargs)
            self._job_names = kwargs["job_names"]
            return

        if "jobs" in kwargs:
            self._deserialize_jobs(kwargs["jobs"])
        elif "job_names" in kwargs:
            assert self._jobs_directory is not None, str(kwargs)
            names = kwargs["job_names"]
            self._deserialize_jobs_from_names(names)

    def __repr__(self):
        """Concisely display all instance information."""
        return self.dumps()

    def _deserialize_jobs(self, jobs):
        for _job in jobs:
            param_class = self.job_parameters_class(_job["extension"])
            job = param_class.deserialize(_job)
            self.add_job(job)

    def _deserialize_jobs_from_names(self, job_names):
        for name in job_names:
            job = self._get_job_by_name(name)
            self.add_job(job)

    def _dump(self, stream=sys.stdout, fmt=".json", indent=2):
        # Note: the default is JSON here because parsing 100 MB .toml files
        # is an order of magnitude slower.
        data = self.serialize()
        if fmt == ".json":
            json.dump(data, stream, indent=indent, cls=ExtendedJSONEncoder)
        elif fmt == ".toml":
            toml.dump(data, stream)
        else:
            assert False, fmt

    def _get_job_by_name(self, name):
        assert self._jobs_directory is not None
        filename = os.path.join(self._jobs_directory, name) + ".json"
        assert os.path.exists(filename), filename
        job = load_data(filename)
        param_class = self.job_parameters_class(job["extension"])
        return param_class.deserialize(job)

    @abc.abstractmethod
    def _serialize(self, data):
        """Create implementation-specific data for serialization."""

    def add_user_data(self, key, data):
        """Add user data referenced by a key. Must be JSON-serializable.

        Parameters
        ----------
        key : str
        data : any

        Raises
        ------
        InvalidParameter
            Raised if the key is already stored.

        """
        if key in self._user_data:
            raise InvalidParameter(
                f"{key} is already stored. Call remove_user_data first")

        self._user_data[key] = data

    def get_user_data(self, key):
        """Get the user data associated with key.

        Parameters
        ----------
        key : str

        Returns
        -------
        any

        """
        data = self._user_data.get(key)
        if data is None:
            raise InvalidParameter(f"{key} is not stored.")

        return data

    def remove_user_data(self, key):
        """Remove the key from the user data config.

        Parameters
        ----------
        key : str

        """
        self._user_data.pop(key, None)

    def list_user_data_keys(self):
        """List the stored user data keys.

        Returns
        -------
        list
            list of str

        """
        return sorted(list(self._user_data.keys()))

    def check_job_dependencies(self, submitter_params):
        """Check for impossible conditions with job dependencies.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if job dependencies have an impossible condition.

        """
        requires_estimated_time = submitter_params.per_node_batch_size == 0

        # This currently only checks that all jobs defined as blocking exist.
        # It does not look for deadlocks.
        job_names = set()
        blocking_jobs = set()
        missing_estimate = []
        for job in self.iter_jobs():
            job_names.add(job.name)
            blocking_jobs.update(job.get_blocking_jobs())
            if requires_estimated_time and job.estimated_run_minutes is None:
                missing_estimate.append(job.name)

        missing_jobs = blocking_jobs.difference(job_names)
        if missing_jobs:
            for job in missing_jobs:
                logger.error("%s is blocking a job but does not exist", job)
            raise InvalidConfiguration("job ordering definitions are invalid")

        if missing_estimate:
            for job in missing_estimate:
                logger.error("Job %s does not define estimated_run_minutes", job)
            raise InvalidConfiguration(
                "Submitting batches by time requires that each job define estimated_run_minutes"
            )

    def check_job_runtimes(self):
        """Check for any job with a longer estimated runtime than the walltime.

        Raises
        ------
        InvalidConfiguration
            Raised if any job is too long.

        """
        wall_times = {
            x.name: x.submitter_params.get_wall_time()
            for x in self.submission_groups
        }
        for job in self.iter_jobs():
            wall_time = wall_times[job.submission_group]
            if job.estimated_run_minutes is not None:
                estimate = timedelta(minutes=job.estimated_run_minutes)
                if estimate > wall_time:
                    raise InvalidConfiguration(
                        f"job {job.name} has estimated_run_minutes={estimate} longer than wall_time={wall_time}"
                    )

    def check_spark_config(self):
        """If Spark jobs are present in the config, configure the params to
        run one job at a time.

        """
        groups_with_spark_jobs = set()
        for job in self.iter_jobs():
            if job.is_spark_job():
                groups_with_spark_jobs.add(job.submission_group)

        for group_name in groups_with_spark_jobs:
            for group in self._submission_groups:
                if group.name == group_name and group.submitter_params.num_processes != 1:
                    group.submitter_params.num_processes = 1
                    logger.info(
                        "Set num_processes=1 for group=%s for Spark jobs.",
                        group_name)

    def check_submission_groups(self, submitter_params):
        """Check for invalid job submission group assignments. Make a default
        group if none are defined and assign it to each job.

        Parameters
        ----------
        submitter_params : SubmitterParams

        Raises
        ------
        InvalidConfiguration
            Raised if submission group assignments are invalid.

        """
        groups = self.submission_groups
        if not groups:
            self._assign_default_submission_group(submitter_params)
            return

        first_group = next(iter(groups))
        group_params = (
            "try_add_blocked_jobs",
            "time_based_batching",
            "num_processes",
            "hpc_config",
            "per_node_batch_size",
            "singularity_params",
            "distributed_submitter",
        )
        user_overrides = (
            "distributed_submitter",
            "generate_reports",
            "resource_monitor_interval",
            "resource_monitor_type",
            "dry_run",
            "verbose",
        )
        user_override_if_not_set = ("node_setup_script", "node_shutdown_script")
        must_be_same = ("max_nodes", "poll_interval")
        all_params = (must_be_same, group_params, user_overrides,
                      user_override_if_not_set)
        fields = {item for params in all_params for item in params}
        assert sorted(list(fields)) == sorted(
            SubmitterParams.__fields__), sorted(list(fields))

        hpc_type = first_group.submitter_params.hpc_config.hpc_type
        group_names = set()
        for group in groups:
            if group.name in group_names:
                raise InvalidConfiguration(
                    f"submission group {group.name} is listed twice")
            group_names.add(group.name)

            if group.submitter_params.hpc_config.hpc_type != hpc_type:
                raise InvalidConfiguration(
                    "hpc_type values must be the same in all groups")

            for param in must_be_same:
                first_val = getattr(first_group.submitter_params, param)
                this_val = getattr(group.submitter_params, param)
                if this_val != first_val:
                    raise InvalidConfiguration(
                        f"{param} must be the same in all groups")

            for param in user_overrides:
                user_val = getattr(submitter_params, param)
                setattr(group.submitter_params, param, user_val)

            for param in user_override_if_not_set:
                user_val = getattr(submitter_params, param)
                group_val = getattr(group.submitter_params, param)
                if group_val is None:
                    setattr(group.submitter_params, param, user_val)

        jobs_by_group = defaultdict(list)
        for job in self.iter_jobs():
            if job.submission_group is None:
                raise InvalidConfiguration(
                    f"Job {job.name} does not have a submission group assigned"
                )
            if job.submission_group not in group_names:
                raise InvalidConfiguration(
                    f"Job {job.name} has an invalid submission group: {job.submission_group}"
                )
            jobs_by_group[job.submission_group].append(job.name)

        group_counts = {}
        for name, jobs in jobs_by_group.items():
            if not jobs:
                logger.warning(
                    "Submission group %s does not have any jobs defined", name)
            group_counts[name] = len(jobs)

        for name, count in sorted(group_counts.items()):
            logger.info("Submission group %s has %s jobs", name, count)

    def _assign_default_submission_group(self, submitter_params):
        default_name = "default"
        group = SubmissionGroup(name=default_name,
                                submitter_params=submitter_params)
        for job in self.iter_jobs():
            job.submission_group = group.name

        self.append_submission_group(group)

    @abc.abstractmethod
    def create_from_result(self, job, output_dir):
        """Create an instance from a result file.

        Parameters
        ----------
        job : JobParametersInterface
        output_dir : str

        Returns
        -------
        class

        """

    def add_job(self, job):
        """Add a job to the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        self._jobs.add_job(job)

    def clear(self):
        """Clear all configured jobs."""
        self._jobs.clear()

    @timed_debug
    def dump(self, filename=None, stream=sys.stdout, indent=2):
        """Convert the configuration to structured text format.

        Parameters
        ----------
        filename : str | None
            Write configuration to this file (must be .json or .toml).
            If None, write the text to stream.
            Recommend using .json for large files. .toml is much slower.
        stream : file
            File-like interface that supports write().
        indent : int
            If JSON, use this indentation.

        Raises
        ------
        InvalidParameter
            Raised if filename does not have a supported extension.

        """
        if filename is None and stream is None:
            raise InvalidParameter("must set either filename or stream")

        if filename is not None:
            ext = os.path.splitext(filename)[1]
            if ext not in (".json", ".toml"):
                raise InvalidParameter("Only .json and .toml are supported")

            with open(filename, "w") as f_out:
                self._dump(f_out, fmt=ext, indent=indent)
        else:
            self._dump(stream, indent=indent)

        logger.info("Dumped configuration to %s", filename)

    def dumps(self, fmt_module=toml, **kwargs):
        """Dump the configuration to a formatted string."""
        return fmt_module.dumps(self.serialize(), **kwargs)

    @classmethod
    def deserialize(cls, filename_or_data, do_not_deserialize_jobs=False):
        """Create a class instance from a saved configuration file.

        Parameters
        ----------
        filename_or_data : str | dict
            path to configuration file or that file loaded as a dict
        do_not_deserialize_jobs : bool
            Set to True to avoid the overhead of loading all jobs from disk.
            Job names will be stored instead of jobs.

        Returns
        -------
        class

        Raises
        ------
        InvalidParameter
            Raised if the config file has invalid parameters.

        """
        if isinstance(filename_or_data, str):
            data = load_data(filename_or_data)
        else:
            data = filename_or_data

        data["do_not_deserialize_jobs"] = do_not_deserialize_jobs
        return cls(**data)

    def get_job(self, name):
        """Return the job matching name.

        Returns
        -------
        namedtuple

        """
        if self.get_num_jobs() == 0 and self._job_names is not None:
            # We loaded from a config file with names only.
            return self._get_job_by_name(name)
        return self._jobs.get_job(name)

    def get_num_jobs(self):
        """Return the number of jobs in the configuration.

        Returns
        -------
        int

        """
        return len(self._jobs)

    @property
    def job_global_config(self):
        """Return the global configs applied to all jobs."""
        return self._job_global_config

    def iter_jobs(self):
        """Yields a generator over all jobs.

        Yields
        ------
        iterator over JobParametersInterface

        """
        return iter(self._jobs)

    @timed_debug
    def list_jobs(self):
        """Return a list of all jobs.

        Returns
        -------
        list
            list of JobParametersInterface

        """
        return list(self.iter_jobs())

    def append_submission_group(self, submission_group):
        """Append a submission group.

        Parameters
        ----------
        submission_group : SubmissionGroup

        """
        self._submission_groups.append(submission_group)
        logger.info("Added submission group %s", submission_group.name)

    def get_default_submission_group(self):
        """Return the default submission group.

        Returns
        -------
        SubmissionGroup

        """
        name = next(iter(self.iter_jobs())).submission_group
        return self.get_submission_group(name)

    def get_submission_group(self, name):
        """Return the submission group matching name.

        Parameters
        ----------
        name : str

        Returns
        -------
        SubmissionGroup

        """
        for group in self.submission_groups:
            if group.name == name:
                return group

        raise InvalidParameter(f"submission group {name} is not stored")

    @property
    def submission_groups(self):
        """Return the submission groups.

        Returns
        -------
        list

        """
        return self._submission_groups

    @timed_debug
    def reconfigure_jobs(self, jobs):
        """Reconfigure with a list of jobs.

        Parameters
        ----------
        jobs : list
            list of DistributionConfiguration.parameter_type

        """
        self.clear()
        for job in jobs:
            self.add_job(job)

        logger.info("Reconfigured jobs.")

    def remove_job(self, job):
        """Remove a job from the configuration.

        Parameters
        ----------
        job : JobParametersInterface

        """
        return self._jobs.remove_job(job)

    def serialize(self, include=ConfigSerializeOptions.JOBS):
        """Create data for serialization."""
        data = {
            "jobs_directory": self._jobs_directory,
            "configuration_module": self.__class__.__module__,
            "configuration_class": self.__class__.__name__,
            "format_version": self.FORMAT_VERSION,
            "user_data": self._user_data,
            "submission_groups": [x.dict() for x in self.submission_groups],
            "setup_command": self.setup_command,
            "teardown_command": self.teardown_command,
            "node_setup_command": self.node_setup_command,
            "node_teardown_command": self.node_teardown_command,
        }

        if self._job_global_config:
            data["job_global_config"] = self._job_global_config

        if self._job_post_process_config:
            data["job_post_process_config"] = self._job_post_process_config

        if include == ConfigSerializeOptions.JOBS:
            data["jobs"] = [x.serialize() for x in self.iter_jobs()]
        elif include == ConfigSerializeOptions.JOB_NAMES:
            data["job_names"] = [x.name for x in self.iter_jobs()]

        # Fill in instance-specific information.
        self._serialize(data)
        return data

    def serialize_jobs(self, directory):
        """Serializes main job data to job-specific files.

        Parameters
        ----------
        directory : str

        """
        for job in self.iter_jobs():
            basename = job.name + ".json"
            job_filename = os.path.join(directory, basename)
            dump_data(job.serialize(), job_filename, cls=ExtendedJSONEncoder)

        # We will need this to deserialize from a filename that includes only
        # job names.
        self._jobs_directory = directory

    def serialize_for_execution(self, scratch_dir, are_inputs_local=True):
        """Serialize config data for efficient execution.

        Parameters
        ----------
        scratch_dir : str
            Temporary storage space on the local system.
        are_inputs_local : bool
            Whether the existing input data is local to this system. For many
            configurations, accessing the input data across the network by many
            concurrent workers can cause a bottleneck, and so implementations
            may wish to copy the data locally before execution starts. If the
            storage access time is very fast the question is irrelevant.

        Returns
        -------
        str
            Name of serialized config file in scratch directory.

        """
        self._transform_for_local_execution(scratch_dir, are_inputs_local)

        # Split up the jobs to individual files so that each worker can just
        # read its own info.
        self.serialize_jobs(scratch_dir)
        data = self.serialize(ConfigSerializeOptions.JOB_NAMES)
        config_file = os.path.join(scratch_dir, CONFIG_FILE)
        dump_data(data, config_file, cls=ExtendedJSONEncoder)
        logger.info("Dumped config file locally to %s", config_file)

        return config_file

    @property
    def setup_command(self):
        """Command run by the submitter before submitting jobs."""
        return self._setup_command

    @setup_command.setter
    def setup_command(self, cmd):
        """Set the command run by the submitter before submitting jobs."""
        self._setup_command = cmd

    @property
    def teardown_command(self):
        """Command run by the last node before completing jobs."""
        return self._teardown_command

    @teardown_command.setter
    def teardown_command(self, cmd):
        """Set the command run by the last node before completing jobs."""
        self._teardown_command = cmd

    @property
    def node_setup_command(self):
        """Command run on each node before starting jobs."""
        return self._node_setup_command

    @node_setup_command.setter
    def node_setup_command(self, cmd):
        """Set the command run on each node before starting jobs."""
        self._node_setup_command = cmd

    @property
    def node_teardown_command(self):
        """Command run on each node after completing jobs."""
        return self._node_teardown_command

    @node_teardown_command.setter
    def node_teardown_command(self, cmd):
        """Set the command run on each node after completing jobs."""
        self._node_teardown_command = cmd

    def _transform_for_local_execution(self, scratch_dir, are_inputs_local):
        """Transform data for efficient execution in a local environment.

        Default implementation is a no-op. Derived classes can override.

        """

    def shuffle_jobs(self):
        """Shuffle the job order."""
        self._jobs.shuffle()

    def show_jobs(self):
        """Show the configured jobs."""
        for job in self.iter_jobs():
            print(job)

    def job_execution_class(self, extension_name):
        """Return the class used for job execution.

        Parameters
        ----------
        extension_name : str

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(extension_name,
                                                  ExtensionClassType.EXECUTION)

    def job_parameters_class(self, extension_name):
        """Return the class used for job parameters.

        Parameters
        ----------
        extension_name : str

        Returns
        -------
        class

        """
        return self._registry.get_extension_class(
            extension_name, ExtensionClassType.PARAMETERS)
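# Illustrative sketch (assumption, not library code): attach arbitrary
# JSON-serializable user data to a configuration so that downstream code can
# retrieve it later. The key and payload are hypothetical.
def example_user_data(config):
    config.add_user_data("scenario_info", {"scenario": "base", "year": 2020})
    info = config.get_user_data("scenario_info")
    assert info["scenario"] == "base"
    config.remove_user_data("scenario_info")
    return info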
def test_registry__register_extensions(registry_fixture):
    registry = Registry(registry_filename=TEST_FILENAME)
    clear_extensions(registry)
    extension = DEFAULT_REGISTRY["extensions"][0]
    registry.register_extension(extension)
    extensions = registry.list_extensions()
    assert len(extensions) == 1
    ext = extensions[0]
    assert ext["name"] == extension["name"]

    cfg_class = registry.get_extension_class(ext["name"],
                                             ExtensionClassType.CONFIGURATION)
    assert cfg_class == GenericCommandConfiguration
    exec_class = registry.get_extension_class(ext["name"],
                                              ExtensionClassType.EXECUTION)
    assert exec_class == GenericCommandExecution
    cli_mod = registry.get_extension_class(ext["name"], ExtensionClassType.CLI)
    assert cli_mod == cli

    # Test that the changes are reflected with a new instance.
    registry2 = Registry(registry_filename=TEST_FILENAME)
    extensions1 = registry.list_extensions()
    extensions2 = registry2.list_extensions()
    for ext1, ext2 in zip(extensions1, extensions2):
        for field in DEFAULT_REGISTRY["extensions"][0]:
            assert ext1[field] == ext2[field]
def submit_jobs(self, cluster, force_local=False):
    """Submit simulations. Auto-detect whether the current system is an HPC
    and submit to its queue. Otherwise, run locally.

    Parameters
    ----------
    cluster : Cluster
    force_local : bool
        If on HPC, run jobs through subprocess as if local.

    Returns
    -------
    Status

    """
    if self._is_new:
        logger.info("Submit %s jobs for execution.",
                    self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)

        ResultsAggregator.create(self._output)

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission started",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        if self._config.setup_command is not None:
            cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
            logger.info("Running setup command: %s", cmd)
            check_run_command(self._config.setup_command)
    else:
        self._handle_submission_groups()

    result = Status.IN_PROGRESS
    group = self._config.get_default_submission_group()
    groups = make_submission_group_lookup(cluster.config.submission_groups)
    self._hpc = HpcManager(groups, self._output)

    if self._hpc.hpc_type == HpcType.LOCAL or force_local:
        runner = JobRunner(self._config_file, output=self._output)
        num_processes = group.submitter_params.num_processes
        verbose = group.submitter_params.verbose
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
        agg = ResultsAggregator.load(self._output)
        agg.process_results()
        is_complete = True
    else:
        is_complete = self._submit_to_hpc(cluster)

    if is_complete:
        result = self._handle_completion(cluster)

    return result
import os
import sys

import pytest

from jade.extensions.registry import Registry

if os.environ.get("LOCAL_SUBMITTER") is not None:
    print("You must unset the environment variable LOCAL_SUBMITTER.")
    sys.exit(1)

registry = Registry()
if not registry.is_registered("demo"):
    registry.register_demo_extension()


@pytest.fixture
def test_data_dir():
    """The path to the directory that contains the fixture data"""
    return os.path.join(os.path.dirname(__file__), "data")


@pytest.fixture
def example_output():
    return os.path.join(os.path.dirname(__file__), "data", "example_output")