def cancel(self):
    """Mark this job as canceled and append a CANCELED result record."""
    self._return_code = 1
    self._is_complete = True
    canceled_result = Result(
        self._job.name, self._return_code, JobCompletionStatus.CANCELED, 0.0
    )
    ResultsAggregator.append(self._output, canceled_result, batch_id=self._batch_id)
    logger.info("Canceled job %s", self._job.name)
def _complete(self):
    """Record completion of the job.

    Logs the size of the job's output directory as a resource-utilization
    event and appends the job's result to the batch results file.
    """
    ret = self._pipe.returncode
    exec_time_s = time.time() - self._start_time
    # NOTE(review): the original computed a filesystem-safe variant of the job
    # name here (replacing "/", "\\", ":" with "-") but never used it; that
    # dead code has been removed.
    # NOTE(review): sibling code uses JobCompletionStatus.FINISHED for this
    # value — confirm the raw string matches it before unifying.
    status = "finished"
    output_dir = os.path.join(self._output, JOBS_OUTPUT_DIR, self._job.name)
    bytes_consumed = get_directory_size_bytes(output_dir)
    event = StructuredLogEvent(
        source=self._job.name,
        category=EVENT_CATEGORY_RESOURCE_UTIL,
        name=EVENT_NAME_BYTES_CONSUMED,
        message="job output directory size",
        bytes_consumed=bytes_consumed,
    )
    log_event(event)
    result = Result(self._job.name, ret, status, exec_time_s)
    ResultsAggregator.append(self._results_filename, result)
    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s", self._job.name, ret, exec_time_s
    )
def async_cmd():
    """Async CLI command fixture: yields an AsyncCliCommand wired to a
    temporary output directory and removes the directory afterward."""
    mock_job = mock.MagicMock()
    mock_job.name = "Test-Job"
    output = os.path.join(tempfile.gettempdir(), "jade-test-async-cli-job")
    os.makedirs(output, exist_ok=True)
    os.makedirs(os.path.join(output, RESULTS_DIR), exist_ok=True)
    ResultsAggregator.create(output)
    yield AsyncCliCommand(mock_job, "echo 'Hello World'", output, 1, True, "0")
    shutil.rmtree(output)
def cancel(self):
    """Mark this job as canceled; only the manager node appends a result."""
    self._return_code = 1
    self._is_complete = True
    if not self._is_manager_node:
        # Non-manager nodes of a multi-node job must not record results.
        logger.info("Canceled job %s on non-manager node", self._job.name)
        return
    canceled_result = Result(
        self._job.name,
        self._return_code,
        JobCompletionStatus.CANCELED,
        0.0,
        hpc_job_id=self._hpc_job_id,
    )
    ResultsAggregator.append(self._output, canceled_result, batch_id=self._batch_id)
    logger.info("Canceled job %s", self._job.name)
def _update_completed_jobs(self):
    """Detect newly completed jobs and cancel not-yet-submitted jobs whose
    blocking jobs failed.

    Returns
    -------
    tuple
        (set of names of newly completed jobs, list of canceled jobs)
    """
    newly_completed = set()
    canceled_jobs = []
    # If jobs fail and are configured to cancel blocked jobs, we may need to run this
    # loop many times to cancel the entire chain.
    aggregator = ResultsAggregator.load(self._output)
    need_to_rerun = True
    new_results = []
    while need_to_rerun:
        need_to_rerun = False
        failed_jobs = set()
        # Merge results read from the aggregator with results produced by
        # cancellations in the previous pass of this loop.
        for result in itertools.chain(aggregator.process_results(), new_results):
            newly_completed.add(result.name)
            if result.return_code != 0:
                failed_jobs.add(result.name)
        # Reset so the next pass only sees cancellations made in this pass.
        new_results.clear()
        logger.debug("Detected completion of jobs: %s", newly_completed)
        logger.debug("Detected failed jobs: %s", failed_jobs)
        for job in self._cluster.iter_jobs(state=JobState.NOT_SUBMITTED):
            if job.blocked_by:
                if job.cancel_on_blocking_job_failure and job.blocked_by.intersection(
                    failed_jobs
                ):
                    result = self._cancel_job(job, aggregator)
                    canceled_jobs.append(job)
                    new_results.append(result)
                    # A cancellation may cascade to jobs blocked on this one.
                    need_to_rerun = True
                else:
                    # Unblock the job for every blocking job that completed.
                    job.blocked_by.difference_update(newly_completed)
    return newly_completed, canceled_jobs
def _generate_jobs(self, config_file, verbose):
    """Create a DispatchableJob for every job in the configuration, all
    sharing one temporary results file for this batch."""
    exec_class = self._config.job_execution_class()
    results_filename = get_results_temp_filename(self._output, self._batch_id)
    ResultsAggregator(results_filename).create_file()
    dispatchable_jobs = []
    for job in self._config.iter_jobs():
        command = exec_class.generate_command(
            job, self._jobs_output, config_file, verbose=verbose
        )
        dispatchable_jobs.append(
            DispatchableJob(job, command, self._output, results_filename)
        )
    return dispatchable_jobs
def test_resubmit_missing(cleanup):
    """Drop one successful result, mark it missing in the summary, and verify
    that resubmission restores the full set of successful results."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    assert all(x.return_code == 0 for x in results)
    # Simulate one job having never produced a result.
    results.pop()
    agg._write_results(results)
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = final_results["results"].pop()
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert not summary.get_failed_results()
    assert len(summary.get_successful_results()) == NUM_COMMANDS - 1
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
def test_resubmit_failed(cleanup):
    """Flip one result to a nonzero return code and verify that resubmission
    turns it successful again."""
    assert run_command(f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    assert all(x.return_code == 0 for x in results)
    # Rewrite the first result as a failure.
    first = results[0]
    results[0] = Result(first.name, 1, first.status, first.exec_time_s, first.completion_time)
    agg._write_results(results)
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    final_results["results"][0]["return_code"] = 1
    final_results["results_summary"]["num_failed"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert summary.get_failed_results()[0].name == "1"
    assert run_command(f"{RESUBMIT_JOBS} {OUTPUT}") == 0
    assert run_command(f"{WAIT} --output={OUTPUT} -p 0.01") == 0
    assert len(ResultsSummary(OUTPUT).get_successful_results()) == NUM_COMMANDS
def _handle_completion(self, cluster):
    """Finalize a submission: verify all jobs produced results, write the
    summary file, emit resource/completion events, optionally generate
    reports, mark the cluster complete, and kick off the next pipeline stage
    if one is configured. Returns a Status."""
    result = Status.GOOD
    self._results = ResultsAggregator.list_results(self._output)
    missing_jobs = []
    if len(self._results) != self._config.get_num_jobs():
        # Some jobs never reported a result — likely a crash or HPC timeout.
        finished_names = {x.name for x in self._results}
        expected_names = {x.name for x in self._config.iter_jobs()}
        missing_jobs = sorted(expected_names.difference(finished_names))
        logger.error(
            "Error in result totals. num_results=%s total_num_jobs=%s",
            len(self._results),
            self._config.get_num_jobs(),
        )
        logger.error(
            "These jobs did not finish: %s. Check for process crashes or HPC timeouts.",
            missing_jobs,
        )
        result = Status.ERROR
    self.write_results_summary(RESULTS_FILE, missing_jobs)
    self._log_error_log_messages(self._output)
    log_event(
        StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="main output directory size",
            bytes_consumed=get_directory_size_bytes(self._output, recursive=False),
        )
    )
    log_event(
        StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission completed",
            num_jobs=self.get_num_jobs(),
        )
    )
    group = self._config.get_default_submission_group()
    if group.submitter_params.generate_reports:
        self.generate_reports(self._output, group.submitter_params.resource_monitor_type)
    cluster.mark_complete()
    if cluster.config.pipeline_stage_num is not None:
        # The pipeline directory must be the one above this one.
        pipeline_dir = os.path.dirname(self._output)
        next_stage = cluster.config.pipeline_stage_num + 1
        run_command(
            f"jade pipeline submit-next-stage {pipeline_dir} "
            f"--stage-num={next_stage} "
            f"--return-code={result.value}"
        )
    return result
def test_results_aggregator(cleanup):
    """Test ResultsAggregator: two batch files written concurrently from a
    process pool, then combined through ResultsAggregatorSummary."""
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
    os.makedirs(os.path.join(OUTPUT, RESULTS_DIR))
    results = [create_result(i) for i in range(100)]
    pytest.aggregator1 = ResultsAggregator(get_results_temp_filename(OUTPUT, 1))
    pytest.aggregator2 = ResultsAggregator(get_results_temp_filename(OUTPUT, 2))
    for aggregator in (pytest.aggregator1, pytest.aggregator2):
        aggregator.create_file()
        assert os.path.exists(aggregator._filename)
    with ProcessPoolExecutor() as executor:
        executor.map(append, results)
    # Even-named results go to batch 1, odd-named to batch 2.
    final_results1 = sorted(pytest.aggregator1.get_results(), key=lambda x: int(x.name))
    final_results2 = sorted(pytest.aggregator2.get_results(), key=lambda x: int(x.name))
    assert final_results1 == [x for x in results if int(x.name) % 2 == 0]
    assert final_results2 == [x for x in results if int(x.name) % 2 != 0]
    results_dir = os.path.join(OUTPUT, RESULTS_DIR)
    summary = ResultsAggregatorSummary(results_dir)
    combined = sorted(summary.get_results(), key=lambda x: int(x.name))
    assert combined == results
    summary.delete_files()
    assert not [x for x in os.listdir(results_dir) if x.endswith(".csv")]
def _complete(self):
    """Record completion of the job; only the manager node logs directory
    usage and appends the result."""
    self._return_code = self._pipe.returncode
    exec_time_s = time.time() - self._start_time
    if not self._is_manager_node:
        # This will happen on a multi-node job. Don't complete it multiple times.
        logger.info(
            "Job %s completed on non-manager node return_code=%s exec_time_s=%s",
            self._job.name,
            self._return_code,
            exec_time_s,
        )
        return
    job_output_dir = self._output / JOBS_OUTPUT_DIR / self._job.name
    log_event(
        StructuredLogEvent(
            source=self._job.name,
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_BYTES_CONSUMED,
            message="job output directory size",
            bytes_consumed=get_directory_size_bytes(job_output_dir),
        )
    )
    completed_result = Result(
        self._job.name,
        self._return_code,
        JobCompletionStatus.FINISHED,
        exec_time_s,
        hpc_job_id=self._hpc_job_id,
    )
    ResultsAggregator.append(self._output, completed_result, batch_id=self._batch_id)
    logger.info(
        "Job %s completed return_code=%s exec_time_s=%s hpc_job_id=%s",
        self._job.name,
        self._return_code,
        exec_time_s,
        self._hpc_job_id,
    )
def test_results_aggregator(cleanup):
    """Test ResultsAggregator: append even-numbered results and read them back."""
    if os.path.exists(OUTPUT):
        shutil.rmtree(OUTPUT)
    results = [create_result(i) for i in range(100)]
    os.makedirs(OUTPUT)
    pytest.aggregator = ResultsAggregator.create(OUTPUT)
    assert os.path.exists(pytest.aggregator._filename)
    expected = [x for x in results if int(x.name) % 2 == 0]
    for item in expected:
        pytest.aggregator.append_result(item)
    final_results = pytest.aggregator.get_results()
    final_results.sort(key=lambda x: int(x.name))
    assert final_results == expected
def submit_jobs(self, cluster, force_local=False):
    """Submit simulations. Auto-detect whether the current
    system is an HPC and submit to its queue. Otherwise, run locally.

    Parameters
    ----------
    cluster : Cluster
    force_local : bool
        If on HPC, run jobs through subprocess as if local.

    Returns
    -------
    Status

    """
    if self._is_new:
        # First submission: record environment info and initialize storage.
        logger.info("Submit %s jobs for execution.", self._config.get_num_jobs())
        logger.info("JADE version %s", jade.version.__version__)
        registry = Registry()
        loggers = registry.list_loggers()
        logger.info("Registered modules for logging: %s", ", ".join(loggers))
        self._save_repository_info(registry)
        ResultsAggregator.create(self._output)
        # If an events summary file exists, it is invalid.
        events_file = os.path.join(self._output, EVENTS_FILENAME)
        if os.path.exists(events_file):
            os.remove(events_file)
        # NOTE(review): name is EVENT_NAME_SUBMIT_COMPLETED but the message
        # says "started" — confirm this pairing is intentional.
        event = StructuredLogEvent(
            source="submitter",
            category=EVENT_CATEGORY_RESOURCE_UTIL,
            name=EVENT_NAME_SUBMIT_COMPLETED,
            message="job submission started",
            num_jobs=self.get_num_jobs(),
        )
        log_event(event)
        os.environ["JADE_RUNTIME_OUTPUT"] = self._output
        if self._config.setup_command is not None:
            cmd = f"JADE_RUNTIME_OUTPUT={self._output} {self._config.setup_command}"
            logger.info("Running setup command: %s", cmd)
            # NOTE(review): the logged `cmd` embeds the env var but the raw
            # setup_command is what executes; JADE_RUNTIME_OUTPUT was already
            # exported above, so behavior should match — confirm.
            check_run_command(self._config.setup_command)
    else:
        # Resubmission path: reconcile submission groups with the cluster.
        self._handle_submission_groups()
    result = Status.IN_PROGRESS
    group = self._config.get_default_submission_group()
    groups = make_submission_group_lookup(cluster.config.submission_groups)
    self._hpc = HpcManager(groups, self._output)
    if self._hpc.hpc_type == HpcType.LOCAL or force_local:
        # Run everything synchronously in this process instead of queueing.
        runner = JobRunner(self._config_file, output=self._output)
        num_processes = group.submitter_params.num_processes
        verbose = group.submitter_params.verbose
        result = runner.run_jobs(verbose=verbose, num_processes=num_processes)
        agg = ResultsAggregator.load(self._output)
        agg.process_results()
        is_complete = True
    else:
        is_complete = self._submit_to_hpc(cluster)
    if is_complete:
        result = self._handle_completion(cluster)
    return result
def _reset_results(output, jobs_to_resubmit):
    """Clear the stored results for the jobs being resubmitted."""
    ResultsAggregator.load(output).clear_results_for_resubmission(jobs_to_resubmit)
def test_resubmit_with_blocking_jobs(basic_setup):
    """Verify that resubmitting a missing job also resubmits the jobs blocked
    (directly or transitively) by it."""
    num_commands = 7
    commands = ['echo "hello world"'] * num_commands
    with open(TEST_FILENAME, "w") as f_out:
        for command in commands:
            f_out.write(command + "\n")
    inputs = GenericCommandInputs(TEST_FILENAME)
    config = GenericCommandConfiguration(job_inputs=inputs)
    jobs = list(inputs.iter_jobs())
    # Set an inefficient ordering to make sure the resubmit algorithm is recursive.
    for i, job_param in enumerate(jobs):
        if i == 3:
            job_param.blocked_by = set([5])
        elif i == 4:
            job_param.blocked_by = set([7])
        elif i == 6:
            job_param.blocked_by = set([6])
        config.add_job(job_param)
    config.dump(CONFIG_FILE)
    cmd = f"{SUBMIT_JOBS} {CONFIG_FILE} --output={OUTPUT}"
    ret = run_command(cmd)
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    agg = ResultsAggregator.load(OUTPUT)
    results = agg.get_results_unsafe()
    assert results
    for result in results:
        assert result.return_code == 0
    # Remove job 7's result to simulate a job that never finished.
    found = False
    for i, result in enumerate(results):
        if result.name == "7":
            results.pop(i)
            found = True
            break
    assert found
    agg._write_results(results)
    # Mirror the removal in the final results summary file.
    results_filename = os.path.join(OUTPUT, RESULTS_FILE)
    final_results = load_data(results_filename)
    missing = None
    for i, result in enumerate(final_results["results"]):
        if result["name"] == "7":
            missing = result
            final_results["results"].pop(i)
            break
    assert missing is not None
    final_results["results_summary"]["num_missing"] = 1
    final_results["results_summary"]["num_successful"] -= 1
    final_results["missing_jobs"] = [missing["name"]]
    dump_data(final_results, results_filename)
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_failed_results()) == 0
    assert len(summary.get_successful_results()) == num_commands - 1
    first_batch = load_data(Path(OUTPUT) / "config_batch_1.json")
    assert len(first_batch["jobs"]) == num_commands
    ret = run_command(f"{RESUBMIT_JOBS} {OUTPUT}")
    assert ret == 0
    ret = run_command(f"{WAIT} --output={OUTPUT} -p 0.01")
    assert ret == 0
    summary = ResultsSummary(OUTPUT)
    assert len(summary.get_successful_results()) == num_commands
    # The missing job plus the jobs blocked on it should land in batch 2.
    second_batch_file = Path(OUTPUT) / "config_batch_2.json"
    assert second_batch_file.exists()
    second_batch = load_data(second_batch_file)["jobs"]
    assert len(second_batch) == 3