def check_status(self, name=None, job_id=None):
    """Query squeue for the status of one job, selected by name or job ID.

    If both filters are given, ``name`` takes precedence.

    Parameters
    ----------
    name : str, optional
        Job name, passed to ``squeue -n``.
    job_id : str or int, optional
        Job ID, passed to ``squeue -j``.

    Returns
    -------
    HpcJobInfo
        Job ID, job name, and status parsed from the squeue output. The ID
        and name are empty and the status is ``HpcJobStatus.NONE`` when
        squeue prints nothing. States missing from ``self._STATUSES`` map to
        ``HpcJobStatus.UNKNOWN``.

    Raises
    ------
    AssertionError
        Raised if neither name nor job_id is given, or if the squeue output
        does not contain exactly one value per requested field.
    ExecutionError
        Raised if the squeue command returns a non-zero code.

    """
    field_names = ("jobid", "name", "state")
    cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        # Raise explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which would let the unfiltered command run.
        raise AssertionError("either name or job_id must be specified")

    output = {}
    ret = run_command(cmd, output)
    if ret != 0:
        logger.error(
            "Failed to run squeue command=[%s] ret=%s err=%s",
            cmd,
            ret,
            output["stderr"],
        )
        raise ExecutionError(f"squeue command failed: {ret}")

    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)

    # Exactly one whitespace-separated value per requested field is expected.
    # Checked explicitly so that the guard survives `python -O`.
    if len(fields) != len(field_names):
        raise AssertionError(
            f"expected {len(field_names)} fields from squeue, received [{stdout}]"
        )

    status = self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN)
    return HpcJobInfo(fields[0], fields[1], status)
def _submit_next_stage(self, stage_num, return_code=None):
    """Record the previous stage's return code and submit the next stage.

    Parameters
    ----------
    stage_num : int
        1-based number of the stage to submit. Must be 1 when no return code
        is given, otherwise one more than the current stage number.
    return_code : int, optional
        Return code of the stage that just completed. None means that the
        pipeline is being started.

    Raises
    ------
    InvalidParameter
        Raised if stage_num is not the expected next stage.
    ExecutionError
        Raised if submitting the stage's jobs returns a non-zero code.

    """
    if return_code is not None:
        if stage_num != self.stage_num + 1:
            raise InvalidParameter(
                f"expected stage_num {self.stage_num + 1}, received {stage_num}"
            )
        # stage_num is 1-based and refers to the *next* stage, so the stage
        # that just finished lives at index stage_num - 2.
        self._config.stages[stage_num - 2].return_code = return_code
        self._config.stage_num += 1
    else:
        assert stage_num == 1, str(stage_num)

    # Advancing past the final stage means there is nothing left to run.
    if self._config.stage_num == len(self._config.stages) + 1:
        logger.info("Pipeline is complete")
        self._config.is_complete = True
        self._serialize()
        return

    logger.info(
        "Start execution pipeline stage %s/%s",
        stage_num,
        len(self._config.stages),
    )
    self._serialize()

    stage = self._config.stages[self.stage_num - 1]
    os.environ["JADE_PIPELINE_STAGE_ID"] = str(self.stage_num)
    self._run_auto_config(stage)

    output = self.get_stage_output_path(self.path, self.stage_num)
    ret = JobSubmitter.run_submit_jobs(
        stage.config_file,
        output,
        stage.submitter_params,
        pipeline_stage_num=self.stage_num,
    )
    if ret == 0:
        return
    raise ExecutionError(f"stage {self.stage_num} failed")
def get_successful_result(results_file, job_name):
    """Look up a job's result and require that it succeeded.

    A result counts as successful when its return code is 0 and its status
    is "finished".

    Parameters
    ----------
    results_file : str
    job_name : str

    Returns
    -------
    object
        The result returned by get_result for job_name.

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.
    ExecutionError
        Raised if the result was not successful.

    """
    result = get_result(results_file, job_name)
    if result.return_code == 0 and result.status == "finished":
        return result

    raise ExecutionError(f"result was not successful: {result}")
def get_successful_result(results_file, job_name):
    """Look up a job's result and require that it succeeded.

    Success is decided by the result's own ``is_successful()`` method.

    Parameters
    ----------
    results_file : str
    job_name : str

    Returns
    -------
    object
        The result returned by get_result for job_name.

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.
    ExecutionError
        Raised if the result was not successful.

    """
    result = get_result(results_file, job_name)
    if result.is_successful():
        return result

    raise ExecutionError(f"result was not successful: {result}")
def _submit(self, verbose):
    """Run every pipeline stage in order, stopping at the first failure.

    For each stage this records the stage in the status info, runs the
    stage's auto-config, executes the submit command, and stores the
    serialized result before checking the return code.

    Parameters
    ----------
    verbose : bool
        Forwarded to _make_submit_cmd.

    Raises
    ------
    ExecutionError
        Raised if a stage's submit command returns a non-zero code.

    """
    for stage in self._stages:
        stage_id = self._cur_stage_id
        os.environ["JADE_PIPELINE_STAGE_ID"] = str(stage_id)

        # Persist the stage entry before running so that the status file
        # shows the stage even if it fails part-way through.
        self._status_info["stages"].append(
            {
                "stage_id": stage_id,
                "output_directory": self.get_stage_output_path(self._output, stage_id),
            }
        )
        self._dump_status()

        self._run_auto_config(stage)
        cmd = self._make_submit_cmd(stage, verbose)

        started = time.time()
        ret = run_command(cmd)
        finished = time.time()

        result = Result(str(stage_id), ret, "finished", finished - started, finished)
        self._status_info["stages"][-1]["result"] = serialize_result(result)
        self._dump_status()

        if ret != 0:
            raise ExecutionError(f"stage {stage_id} failed")

        self._cur_stage_id += 1

    logger.info("Finished execution pipeline")
def _run_auto_config(self, stage):
    """Regenerate a stage's config file and copy it into the output area.

    Any existing config file is deleted first so that a stale file cannot be
    mistaken for the output of the auto-config command.

    Parameters
    ----------
    stage : object
        Stage with ``config_file`` and ``auto_config_cmd`` attributes.
        ``config_file`` is updated to point at the copied file.

    Raises
    ------
    ExecutionError
        Raised if the auto-config command fails or does not produce the
        config file.

    """
    generated_file = stage.config_file
    if os.path.exists(generated_file):
        os.remove(generated_file)

    ret = run_command(stage.auto_config_cmd)
    if ret != 0:
        raise ExecutionError(f"Failed to auto-config stage {self.stage_num}: {ret}")

    if not os.path.exists(generated_file):
        raise ExecutionError(
            f"auto-config stage {self.stage_num} did not produce {generated_file}"
        )

    destination = self.get_stage_config_file_path(self._output, self.stage_num)
    shutil.copyfile(generated_file, destination)
    stage.config_file = destination
def _submit_request(self, cmd, *args):
    """Send a GET request and return the decoded JSON response.

    Parameters
    ----------
    cmd : str
        Base URL. A trailing slash is added if it is missing.
    *args : str
        Path components appended to the URL, joined with "/".

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the response body.

    Raises
    ------
    ExecutionError
        Raised if the response status code is not 200.

    """
    url = cmd if cmd.endswith("/") else cmd + "/"
    if args:
        url += "/".join(args)

    logger.info("Submitting %s", url)
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()

    raise ExecutionError(f"{url} failed: status_code={response.status_code}")
def _run_auto_config(self, stage):
    """Regenerate a stage's config file and move it into the output area.

    Any existing config file is deleted first so that a stale file cannot be
    mistaken for the output of the auto-config command.

    Parameters
    ----------
    stage : dict
        Stage with "config_file" and "auto_config_cmd" keys. "config_file"
        is updated to point at the moved file.

    Raises
    ------
    ExecutionError
        Raised if the auto-config command fails or does not produce the
        config file.

    """
    generated_file = stage["config_file"]
    if os.path.exists(generated_file):
        os.remove(generated_file)

    ret = run_command(stage["auto_config_cmd"])
    if ret != 0:
        raise ExecutionError(f"Failed to auto-config stage {self._cur_stage_id}: {ret}")

    if not os.path.exists(generated_file):
        raise ExecutionError(
            f"auto-config stage {self._cur_stage_id} did not produce {generated_file}"
        )

    destination = self.get_stage_config_file_path(self._output, self._cur_stage_id)
    shutil.move(generated_file, destination)
    stage["config_file"] = destination
def check_run_command(*args, **kwargs):
    """Run a command through run_command and raise if it fails.

    All arguments are forwarded to run_command unchanged. Nothing is
    returned on success.

    Raises
    ------
    ExecutionError
        Raised if the command returns a non-zero return code.

    """
    return_code = run_command(*args, **kwargs)
    if return_code == 0:
        return

    raise ExecutionError(f"command returned error code: {return_code}")
def check_statuses(self):
    """Query squeue for the job ID and state of all of the user's jobs.

    Returns
    -------
    object
        Whatever _get_statuses_from_output builds from the squeue stdout.

    Raises
    ------
    ExecutionError
        Raised if squeue still returns a non-zero code after the retries.

    """
    columns = ",".join(("jobid", "state"))
    cmd = f'squeue -u {self.USER} --Format "{columns}" -h'

    captured = {}
    # Transient failures could be costly. Retry for up to one minute.
    ret = run_command(cmd, captured, num_retries=6, retry_delay_s=10)
    if ret == 0:
        return self._get_statuses_from_output(captured["stdout"])

    logger.error(
        "Failed to run squeue command=[%s] ret=%s err=%s",
        cmd,
        ret,
        captured["stderr"],
    )
    raise ExecutionError(f"squeue command failed: {ret}")
def _run_command(self, cmd):
    """Run a command from within self._path and return its stripped stdout.

    The previous working directory is restored afterwards, whether or not
    the command succeeds.

    Parameters
    ----------
    cmd : str

    Returns
    -------
    str
        Standard output of the command with surrounding whitespace removed.

    Raises
    ------
    ExecutionError
        Raised if the command returns a non-zero code.

    """
    previous_dir = os.getcwd()
    os.chdir(self._path)
    try:
        captured = {}
        ret = run_command(cmd, output=captured)
        if ret == 0:
            return captured["stdout"].strip()
        raise ExecutionError(f"[{cmd}] failed: {ret}: {captured['stderr']}")
    finally:
        os.chdir(previous_dir)
def run(self):
    """Submit the job through the HPC manager and record the assigned job ID.

    On success the job ID is stored, a structured "HPC job assigned" event
    is logged, and an info message is written.

    Raises
    ------
    ExecutionError
        Raised if the manager does not report Status.GOOD for the submission.

    """
    job_id, result = self._mgr.submit(self._output, self._name, self._run_script)
    # NOTE(review): the pending flag is set before the result is checked, so
    # it stays True even when submission fails. Preserved from the original.
    self._is_pending = True
    if result != Status.GOOD:
        # The f prefix was missing, so the message contained the literal text
        # "{self._name}" instead of the job name.
        raise ExecutionError(f"Failed to submit name={self._name}")

    self._job_id = job_id
    event = StructuredLogEvent(
        source=self._name,
        category=EVENT_CATEGORY_HPC,
        name=EVENT_NAME_HPC_JOB_ASSIGNED,
        message="HPC job assigned",
        job_id=self._job_id,
    )
    log_event(event)
    logger.info("Assigned job_ID=%s name=%s", self._job_id, self._name)
def check_status(self, name=None, job_id=None):
    """Query squeue for the status of one job, selected by name or job ID.

    If both filters are given, ``name`` takes precedence. The squeue call is
    retried on failure.

    Parameters
    ----------
    name : str, optional
        Job name, passed to ``squeue -n``.
    job_id : str or int, optional
        Job ID, passed to ``squeue -j``.

    Returns
    -------
    HpcJobInfo
        Job ID, job name, and status parsed from the squeue output. The ID
        and name are empty and the status is ``HpcJobStatus.NONE`` when
        squeue prints nothing or reports an invalid job ID. States missing
        from ``self._STATUSES`` map to ``HpcJobStatus.UNKNOWN``.

    Raises
    ------
    AssertionError
        Raised if neither name nor job_id is given, or if the squeue output
        does not contain exactly one value per requested field.
    ExecutionError
        Raised if the squeue command fails for any reason other than an
        invalid job ID.

    """
    field_names = ("jobid", "name", "state")
    cmd = f"squeue -u {self.USER} --Format \"{','.join(field_names)}\" -h"
    if name is not None:
        cmd += f" -n {name}"
    elif job_id is not None:
        cmd += f" -j {job_id}"
    else:
        # Mutual exclusivity should be handled in HpcManager.
        # Raise explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which would let the unfiltered command run.
        raise AssertionError("either name or job_id must be specified")

    output = {}
    # Transient failures could be costly. Retry for up to one minute.
    # NOTE(review): error_strings presumably tells run_command which errors
    # are not worth retrying - confirm against run_command.
    invalid_job_id = "Invalid job id specified"
    ret = run_command(
        cmd,
        output,
        num_retries=6,
        retry_delay_s=10,
        error_strings=[invalid_job_id],
    )
    if ret != 0:
        if invalid_job_id in output["stderr"]:
            # squeue does not know the job; report it as having no status.
            return HpcJobInfo("", "", HpcJobStatus.NONE)

        logger.error(
            "Failed to run squeue command=[%s] ret=%s err=%s",
            cmd,
            ret,
            output["stderr"],
        )
        raise ExecutionError(f"squeue command failed: {ret}")

    stdout = output["stdout"]
    logger.debug("squeue output: [%s]", stdout)
    fields = stdout.split()
    if not fields:
        # No jobs are currently running.
        return HpcJobInfo("", "", HpcJobStatus.NONE)

    # Exactly one whitespace-separated value per requested field is expected.
    # Checked explicitly so that the guard survives `python -O`.
    if len(fields) != len(field_names):
        raise AssertionError(
            f"expected {len(field_names)} fields from squeue, received [{stdout}]"
        )

    status = self._STATUSES.get(fields[2], HpcJobStatus.UNKNOWN)
    return HpcJobInfo(fields[0], fields[1], status)
def get_successful_result(self, job_name):
    """Return the result for a job, requiring that it exists and succeeded.

    A result counts as successful when its return code is 0 and its status
    is "finished".

    Parameters
    ----------
    job_name : str

    Returns
    -------
    object
        The result returned by self.get_result for job_name.

    Raises
    ------
    InvalidParameter
        Raised if job_name is not found.
    ExecutionError
        Raised if the result was not successful.

    """
    result = self.get_result(job_name)
    if result is None:
        raise InvalidParameter(f"result not found {job_name}")

    if result.return_code == 0 and result.status == "finished":
        return result

    raise ExecutionError(f"result wasn't successful: {result}")