def _update_completed_jobs(self):
    newly_completed = set()
    canceled_jobs = []
    # If jobs fail and are configured to cancel blocked jobs, we may need to run this
    # loop many times to cancel the entire chain.
    aggregator = ResultsAggregator.load(self._output)
    need_to_rerun = True
    new_results = []
    while need_to_rerun:
        need_to_rerun = False
        failed_jobs = set()
        for result in itertools.chain(aggregator.process_results(), new_results):
            newly_completed.add(result.name)
            if result.return_code != 0:
                failed_jobs.add(result.name)
        new_results.clear()
        logger.debug("Detected completion of jobs: %s", newly_completed)
        logger.debug("Detected failed jobs: %s", failed_jobs)
        for job in self._cluster.iter_jobs(state=JobState.NOT_SUBMITTED):
            if job.blocked_by:
                if job.cancel_on_blocking_job_failure and job.blocked_by.intersection(
                    failed_jobs
                ):
                    result = self._cancel_job(job, aggregator)
                    canceled_jobs.append(job)
                    new_results.append(result)
                    need_to_rerun = True
                else:
                    job.blocked_by.difference_update(newly_completed)

    return newly_completed, canceled_jobs

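# A minimal, self-contained sketch (hypothetical; not part of JADE) of the
# fixed-point pattern used in _update_completed_jobs above: canceling one
# blocked job counts as a failure for anything it blocks, so the loop must
# repeat until no new cancellations occur.

def cancel_blocked_chains(blocked_by, failed):
    """Return the jobs canceled because a transitive blocker failed.

    blocked_by maps job name -> set of blocking job names; failed is the
    initial set of failed job names. Both structures are toy stand-ins.
    """
    canceled = set()
    need_to_rerun = True
    while need_to_rerun:
        need_to_rerun = False
        for job, blockers in blocked_by.items():
            if job not in failed and blockers & failed:
                canceled.add(job)
                failed = failed | {job}  # propagate the failure downstream
                need_to_rerun = True
    return canceled

# Job 1 fails; 2 is blocked by 1 and 3 by 2, so the whole chain is canceled.
assert cancel_blocked_chains({"2": {"1"}, "3": {"2"}}, {"1"}) == {"2", "3"}
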
def test_resubmit_missing(cleanup):
    """Verify that a job with no recorded result is detected and resubmitted."""
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Drop the last result to simulate a job that never reported back.
    results.pop()
    agg._write_results(results)

    # Mirror the change in the serialized results file.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS

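# The resubmit tests above and below edit the serialized results file by hand.
# A hedged sketch of just the fields they touch (the real file may carry more
# keys; the result entries also hold status, exec_time_s, and completion_time):
_EXAMPLE_RESULTS_FILE = {
    "results": [
        {"name": "1", "return_code": 0},  # one entry per completed job
    ],
    "results_summary": {
        "num_missing": 0,
        "num_failed": 0,
        "num_successful": NUM_COMMANDS,
    },
    "missing_jobs": [],  # names of jobs that never reported a result
}
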
def test_resubmit_failed(cleanup):
    """Verify that a failed job is detected, resubmitted, and succeeds."""
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Rewrite the first job's result with a non-zero return code to simulate a failure.
    x = results[0]
    results[0] = Result(x.name, 1, x.status, x.exec_time_s, x.completion_time)
    agg._write_results(results)

    # Mirror the change in the serialized results file.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == NUM_COMMANDS

def submit_jobs(self, cluster, force_local=False):
    """Submit simulations. Auto-detect whether the current system is an HPC
    and submit to its queue. Otherwise, run locally.

    Parameters
    ----------
    cluster : Cluster
    force_local : bool
        If on HPC, run jobs through subprocess as if local.

    Returns
    -------
    Status

    """
    if self._is_new:
        logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)
        ResultsAggregator.create(self._output)

        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)

        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission started",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)

        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        if self._config.setup_command is not None:
            cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
            logger.info("Running setup command: %s", cmd)
            # JADE_RUNTIME_OUTPUT is already exported above, so the raw command suffices.
            check_run_command(self._config.setup_command)
    else:
        # Resuming an existing submission.
        self._handle_submission_groups()

    result = Status.IN_PROGRESS
    group = self._config.get_default_submission_group()
    groups = make_submission_group_lookup(cluster.config.submission_groups)
    self._hpc = HpcManager(groups, self._output)

    if self._hpc.hpc_type == HpcType.LOCAL or force_local:
        runner = JobRunner(self._config_file, output=self._output)
        num_processes = group.submitter_params.num_processes
        verbose = group.submitter_params.verbose
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
        agg = ResultsAggregator.load(self._output)
        agg.process_results()
        is_complete = True
    else:
        is_complete = self._submit_to_hpc(cluster)

    if is_complete:
        result = self._handle_completion(cluster)

    return result

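# Hypothetical sketch (not the JADE API) of the dispatch pattern in
# submit_jobs: local runs complete synchronously, so completion handling can
# happen immediately; HPC submissions are only queued, so completion is
# deferred to a later poll.
import subprocess

def _run_local(commands):
    """Run each command in-process and report overall success."""
    return all(subprocess.run(cmd, shell=True).returncode == 0 for cmd in commands)

def dispatch(commands, is_local, enqueue):
    if is_local:
        ok = _run_local(commands)
        return True, ok       # is_complete, success
    enqueue(commands)         # queued on the cluster; a poller finishes up later
    return False, None
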
def _reset_results(output, jobs_to_resubmit):
    """Clear stored results for the jobs that are about to be resubmitted."""
    aggregator = ResultsAggregator.load(output)
    aggregator.clear_results_for_resubmission(jobs_to_resubmit)

def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify that resubmitting a missing job also resubmits the jobs it blocks."""
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")

    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    # Jobs are named by 1-based position, so job 4 is blocked by 5, 5 by 7, and 7 by 6.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)

    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0

    # Remove job 7's result from the aggregator to simulate a missing job.
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)

    # Mirror the change in the serialized results file.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands

    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0

    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands

    # Job 7 plus the two jobs it transitively blocks (5 and 4) must be re-run.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3

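# A hedged reading of the final assertion above: resubmitting missing job "7"
# transitively drags in every job blocked by a resubmitted job, yielding the
# 3-job second batch. Toy code, not the JADE implementation.

def jobs_to_resubmit(blocked_by, missing):
    """blocked_by: job name -> set of blocker names; missing: jobs with no result."""
    resubmit = set(missing)
    changed = True
    while changed:
        changed = False
        for job, blockers in blocked_by.items():
            if job not in resubmit and blockers & resubmit:
                resubmit.add(job)
                changed = True
    return resubmit

# The chain from the test: "4" blocked by "5", "5" by "7", "7" by "6".
assert jobs_to_resubmit({"4": {"5"}, "5": {"7"}, "7": {"6"}}, {"7"}) == {"7", "5", "4"}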