def test_job_submitter(self):
    config = get_config()
    config.dump(CONFIG_FILE)
    try:
        mgr = JobSubmitter(CONFIG_FILE)
        assert not mgr.get_completed_results()
        assert config.get_num_jobs() == mgr.get_num_jobs()
    finally:
        os.remove(CONFIG_FILE)
def submit_jobs(config_file, per_node_batch_size, hpc_config, local, max_nodes, output,
                poll_interval, num_processes, rotate_logs, verbose, restart_failed,
                restart_missing, reports, try_add_blocked_jobs):
    """Submits jobs for execution, locally or on HPC."""
    os.makedirs(output, exist_ok=True)

    previous_results = []

    if restart_failed:
        failed_job_config = create_config_from_previous_run(
            config_file, output, result_type='failed')
        previous_results = ResultsSummary(output).get_successful_results()
        config_file = "failed_job_inputs.json"
        failed_job_config.dump(config_file)

    if restart_missing:
        missing_job_config = create_config_from_previous_run(
            config_file, output, result_type='missing')
        config_file = "missing_job_inputs.json"
        missing_job_config.dump(config_file)
        previous_results = ResultsSummary(output).list_results()

    if rotate_logs:
        rotate_filenames(output, ".log")

    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level)
    logger.info(get_cli_string())

    event_file = os.path.join(output, "submit_jobs_events.log")
    # This effectively means no console logging.
    setup_logging("event", event_file, console_level=logging.ERROR,
                  file_level=logging.INFO)

    mgr = JobSubmitter(config_file, hpc_config=hpc_config, output=output)
    ret = mgr.submit_jobs(
        per_node_batch_size=per_node_batch_size,
        max_nodes=max_nodes,
        force_local=local,
        verbose=verbose,
        num_processes=num_processes,
        poll_interval=poll_interval,
        previous_results=previous_results,
        reports=reports,
        try_add_blocked_jobs=try_add_blocked_jobs,
    )
    sys.exit(ret.value)
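# A minimal usage sketch for the submitter above. Every argument value here is
# an assumption chosen for illustration; only the parameter names come from the
# function signature.
if __name__ == "__main__":
    submit_jobs(
        config_file="config.json",
        per_node_batch_size=32,
        hpc_config="hpc_config.toml",
        local=False,
        max_nodes=4,
        output="output",
        poll_interval=60,
        num_processes=None,
        rotate_logs=True,
        verbose=False,
        restart_failed=False,
        restart_missing=False,
        reports=True,
        try_add_blocked_jobs=False,
    )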
def _submit_next_stage(self, stage_num, return_code=None):
    if return_code is None:
        # First stage: there is no previous stage to record.
        assert stage_num == 1, str(stage_num)
    else:
        if stage_num != self.stage_num + 1:
            raise InvalidParameter(
                f"expected stage_num {self.stage_num + 1}, received {stage_num}"
            )
        # Record the outcome of the stage that just finished, then advance.
        self._config.stages[stage_num - 2].return_code = return_code
        self._config.stage_num += 1

    if self._config.stage_num == len(self._config.stages) + 1:
        logger.info("Pipeline is complete")
        self._config.is_complete = True
        self._serialize()
        return

    logger.info("Start execution pipeline stage %s/%s", stage_num,
                len(self._config.stages))

    self._serialize()
    stage = self._config.stages[self.stage_num - 1]
    os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
    self._run_auto_config(stage)
    output = self.get_stage_output_path(self.path, self.stage_num)
    ret = JobSubmitter.run_submit_jobs(
        stage.config_file,
        output,
        stage.submitter_params,
        pipeline_stage_num=self.stage_num,
    )
    if ret != 0:
        raise ExecutionError(f"stage {self.stage_num} failed")
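# Worked trace of the bookkeeping above, assuming a hypothetical two-stage
# pipeline manager `mgr`. The first call seeds stage 1, each later call records
# the previous stage's return code, and completion triggers once stage_num
# passes len(stages):
#
#   mgr._submit_next_stage(1)                  # runs stage 1
#   mgr._submit_next_stage(2, return_code=0)   # records stages[0], runs stage 2
#   mgr._submit_next_stage(3, return_code=0)   # stage_num == len(stages) + 1 -> complete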
def cancel_jobs(output, verbose):
    """Cancels jobs."""
    filename = os.path.join(output, "cancel_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")
    logger.info(get_cli_string())

    for _ in range(60):
        cluster, promoted = Cluster.deserialize(
            output,
            try_promote_to_submitter=True,
            deserialize_jobs=True,
        )
        if not promoted:
            logger.info("Did not get promoted. Sleep.")
            time.sleep(1)
            continue
        if cluster.is_complete():
            cluster.demote_from_submitter()
            logger.info("All jobs are already finished.")
            sys.exit(0)
        submitter = JobSubmitter.load(output)
        submitter.cancel_jobs(cluster)
        sys.exit(0)

    logger.error("Failed to get promoted to submitter.")
    sys.exit(1)
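# Design note: Cluster.deserialize with try_promote_to_submitter=True lets only
# one process act as the submitter at a time, so the loop above retries once
# per second and gives up after roughly 60 seconds with exit code 1.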
def test_jobs_submitter__find_error_log_messages(example_output):
    events = list(JobSubmitter.find_error_log_messages(example_output))
    assert len(events) == 4
    assert events[0].data["error"] == "Traceback"
    assert events[0].data["filename"] == f"{example_output}/job_output_2741821.e"
    assert events[0].data["line_number"] == 2
    assert events[0].data["text"] == "Traceback (most recent call last):"
    assert events[1].data["error"] == "DUE TO TIME LIMIT"
    assert events[1].data["line_number"] == 43
    assert events[2].data["error"] == "slurmstepd"
    assert events[2].data["line_number"] == 44
    assert events[3].data["error"] == "srun"
    assert events[3].data["line_number"] == 45
def resubmit_jobs(output, failed, missing, verbose):
    """Resubmit failed and missing jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete",
              file=sys.stderr)
        sys.exit(1)
    assert promoted

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()
        sys.exit(ret)
def resubmit_jobs(output, failed, missing, successful, submission_groups_file, verbose):
    """Resubmit jobs."""
    event_file = os.path.join(output, "submit_jobs_events.log")
    setup_event_logging(event_file, mode="a")
    filename = os.path.join(output, "submit_jobs.log")
    level = logging.DEBUG if verbose else logging.INFO
    setup_logging(__name__, filename, file_level=level, console_level=level, mode="a")

    cluster, promoted = Cluster.deserialize(
        output,
        try_promote_to_submitter=True,
        deserialize_jobs=True,
    )
    if not cluster.is_complete():
        cluster.demote_from_submitter()
        print("resubmit-jobs requires that the existing submission be complete",
              file=sys.stderr)
        sys.exit(1)
    assert promoted

    if submission_groups_file is not None:
        groups = load_data(submission_groups_file)
        cur = len(groups)
        orig = len(cluster.config.submission_groups)
        if cur != orig:
            print(
                f"Length of submission_groups ({cur}) must be identical to the original ({orig})",
                file=sys.stderr,
            )
            cluster.demote_from_submitter()
            sys.exit(1)
        for _group in groups:
            group = SubmissionGroup(**_group)
            found = False
            for i, orig_group in enumerate(cluster.config.submission_groups):
                if group.name == orig_group.name:
                    cluster.config.submission_groups[i] = group
                    found = True
                    break
            if not found:
                print(
                    f"submission group {group.name} does not exist in the original",
                    file=sys.stderr,
                )
                cluster.demote_from_submitter()
                sys.exit(1)
        logger.info("Updated submitter parameters from %s", submission_groups_file)

    jobs_to_resubmit = _get_jobs_to_resubmit(cluster, output, failed, missing, successful)
    updated_blocking_jobs_by_name = _update_with_blocking_jobs(jobs_to_resubmit, output)
    _reset_results(output, jobs_to_resubmit)
    cluster.prepare_for_resubmission(jobs_to_resubmit, updated_blocking_jobs_by_name)

    ret = 1
    try:
        mgr = JobSubmitter.load(output)
        status = mgr.submit_jobs(cluster)
        if status == Status.IN_PROGRESS:
            print(f"Resubmitted {len(jobs_to_resubmit)} jobs in {output}")
            ret = 0
        else:
            ret = status.value
    except Exception:
        logger.exception("Failed to resubmit jobs")
        raise
    finally:
        cluster.demote_from_submitter()
        sys.exit(ret)
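# Hypothetical shape of a submission_groups_file accepted above. load_data()
# must return a list of dicts, each unpacked into SubmissionGroup(**group).
# "name" is the only field the code relies on and must match an existing group;
# every other field here is an illustrative assumption.
#
# [
#     {"name": "group1", "submitter_params": {"max_nodes": 8}},
#     {"name": "group2", "submitter_params": {"max_nodes": 2}}
# ]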
def submit_jobs(
    config_file=None,
    per_node_batch_size=None,
    dry_run=None,
    force=None,
    hpc_config=None,
    local=None,
    max_nodes=None,
    output=None,
    poll_interval=None,
    resource_monitor_interval=None,
    resource_monitor_type=None,
    num_processes=None,
    verbose=None,
    reports=None,
    enable_singularity=None,
    container=None,
    try_add_blocked_jobs=None,
    time_based_batching=None,
    node_setup_script=None,
    node_shutdown_script=None,
    submitter_params=None,
    no_distributed_submitter=None,
):
    """Submits jobs for execution, locally or on HPC."""
    if os.path.exists(output):
        if force:
            shutil.rmtree(output)
        else:
            print(
                f"{output} already exists. Delete it or use '--force' to overwrite.",
                file=sys.stderr,
            )
            sys.exit(1)

    if submitter_params is not None:
        params = SubmitterParams(**load_data(submitter_params))
    else:
        params = make_submitter_params(
            per_node_batch_size=per_node_batch_size,
            dry_run=dry_run,
            hpc_config=hpc_config,
            local=local,
            max_nodes=max_nodes,
            poll_interval=poll_interval,
            resource_monitor_interval=resource_monitor_interval,
            resource_monitor_type=resource_monitor_type,
            num_processes=num_processes,
            verbose=verbose,
            reports=reports,
            enable_singularity=enable_singularity,
            container=container,
            try_add_blocked_jobs=try_add_blocked_jobs,
            time_based_batching=time_based_batching,
            node_setup_script=node_setup_script,
            node_shutdown_script=node_shutdown_script,
            no_distributed_submitter=no_distributed_submitter,
        )
    if params.time_based_batching and params.num_processes is None:
        print("Error: num_processes must be set with time-based batching", file=sys.stderr)
        sys.exit(1)

    os.makedirs(output)
    filename = os.path.join(output, "submit_jobs.log")
    event_filename = os.path.join(output, "submit_jobs_events.log")
    level = logging.DEBUG if verbose else logging.INFO
    # For some reason event logging must be set up before general logging.
    # Otherwise, the first event doesn't show up in the log.
    setup_event_logging(event_filename)
    logger = setup_logging(__name__, filename, file_level=level, console_level=level,
                           mode="w")
    logger.info(get_cli_string())

    try:
        ret = JobSubmitter.run_submit_jobs(config_file, output, params)
        sys.exit(ret)
    except Exception:
        logger.exception("Failed to run submit_jobs")
        raise
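# Minimal sketch of the two entry paths above: pass individual options, or
# point submitter_params at a serialized SubmitterParams file, which
# short-circuits them. File names and flag values are assumptions for
# illustration.
if __name__ == "__main__":
    submit_jobs(
        config_file="config.json",
        output="output",
        force=True,
        submitter_params="submitter_params.json",
    )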
def test_jobs_submitter__generate_reports(example_output, cleanup):
    ret = JobSubmitter.generate_reports(example_output, True)
    assert ret == 0
    for filename in ("errors.txt", "results.txt", "stats.txt"):
        path = os.path.join(example_output, filename)
        assert os.path.exists(path)