def is_running(self):
    """Check whether the hindcast run's job is in the RUNNING state.

    While the job is running, its progress is reported by passing the
    contents of the run's time.step file to :py:meth:`_report_progress`.
    The ocean.output file is then searched for "E R R O R" lines:

    * no such lines: the run is healthy
    * any such lines: the job is cancelled via ``scancel``
    * exactly one such line: the run is treated as "stuck";
      after cancellation :py:meth:`_handle_stuck_job` is called,
      the method waits for the job to leave the queue,
      and the temporary run directory and run info are refreshed

    :return: Flag indicating whether or not run is in RUNNING state
    :rtype: boolean
    """
    if self._get_job_state() != "RUNNING":
        return False

    # A missing time.step file is not an error; keep watching until it shows up
    try:
        time_step_contents = ssh_sftp.ssh_exec_command(
            self.ssh_client,
            f"cat {self.tmp_run_dir}/time.step",
            self.host_name,
            logger,
        )
    except ssh_sftp.SSHCommandError:
        logger.info(
            f"{self.run_id} on {self.host_name}: time.step not found; continuing to watch..."
        )
        return True
    self._report_progress(time_step_contents)

    # Look for "E R R O R" lines in the ocean.output file
    try:
        grep_output = ssh_sftp.ssh_exec_command(
            self.ssh_client,
            f"grep 'E R R O R' {self.tmp_run_dir}/ocean.output",
            self.host_name,
            logger,
        )
    except ssh_sftp.SSHCommandError:
        logger.error(f"{self.run_id} on {self.host_name}: ocean.output not found")
        return False

    error_count = len(grep_output.splitlines())
    if error_count == 0:
        return True

    # At least one "E R R O R" line: cancel the job
    logger.error(
        f"{self.run_id} on {self.host_name}: "
        f"found {error_count} 'E R R O R' line(s) in ocean.output"
    )
    cancel_cmd = f"/opt/software/slurm/bin/scancel {self.job_id}"
    self._ssh_exec_command(
        cancel_cmd, f"{self.run_id} on {self.host_name}: cancelled {self.job_id}"
    )

    if error_count > 1:
        # More than one "E R R O R" line means the run failed irrevocably
        return False

    # Exactly one "E R R O R" line means the run is "stuck" and can be re-queued
    self._handle_stuck_job()
    while self.is_queued():
        # Poll the queue every 5 minutes
        time.sleep(60 * 5)
    self.get_tmp_run_dir()
    self.get_run_info()
    return True
def _launch_run(ssh_client, host_name, run_id, prev_job_id, config):
    """Submit the hindcast run to the scheduler on the HPC host.

    The command is assembled from the host's ``salishsea cmd`` configuration:
    any configured environment variables are exported ahead of the executable,
    and the run description file, results directory, and run options are
    appended. When a previous job id is provided, the new job is made to wait
    on it and the initial conditions check is skipped.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str run_id:
    :param int or None prev_job_id:
    :param :py:class:`nemo_nowcast.Config` config:

    :raise: WorkerError if the command fails on the HPC host
    """
    host_config = config["run"]["hindcast hosts"][host_name]
    cmd_config = host_config["salishsea cmd"]
    executable = cmd_config["executable"]
    options = cmd_config["run options"] or ""
    envvars = cmd_config["envvars"] or {}
    # Joining no items yields "", so an empty envvars mapping adds no prefix
    exports = "; ".join(f"export {name}={val}" for name, val in envvars.items())
    if exports:
        executable = f"{exports}; {executable}"
    run_desc = Path(host_config["run prep dir"]) / f"{run_id}.yaml"
    results_dir = Path(host_config["scratch dir"]) / run_id[:7]
    cmd = f"{executable} run {run_desc} {results_dir} {options}"
    if prev_job_id:
        cmd = f"{cmd} --waitjob {prev_job_id} --nocheck-initial-conditions"
    try:
        ssh_sftp.ssh_exec_command(ssh_client, cmd, host_name, logger)
    except ssh_sftp.SSHCommandError as exc:
        # Relay the remote stderr to the log, one line at a time
        for stderr_line in exc.stderr.splitlines():
            logger.error(stderr_line)
        raise WorkerError
    logger.info(f"{run_id} run submitted to scheduler on {host_name}")
def is_running(self):
    """Check whether the hindcast run's job is in the R state.

    While the job is running, its progress is reported by passing the
    contents of the run's time.step file to :py:meth:`_report_progress`.
    If one or more "E R R O R" lines are found in the ocean.output file,
    the job is cancelled via ``qdel``.

    :return: Flag indicating whether or not run is in R state
    :rtype: boolean
    """
    if self._get_job_state() != "R":
        return False

    # A missing time.step file is not an error; keep watching until it shows up
    try:
        time_step_contents = ssh_sftp.ssh_exec_command(
            self.ssh_client,
            f"cat {self.tmp_run_dir}/time.step",
            self.host_name,
            logger,
        )
    except ssh_sftp.SSHCommandError:
        logger.info(
            f"{self.run_id} on {self.host_name}: time.step not found; continuing to watch..."
        )
        return True
    self._report_progress(time_step_contents)

    # Look for "E R R O R" lines in the ocean.output file
    try:
        grep_output = ssh_sftp.ssh_exec_command(
            self.ssh_client,
            f"grep 'E R R O R' {self.tmp_run_dir}/ocean.output",
            self.host_name,
            logger,
        )
    except ssh_sftp.SSHCommandError:
        logger.error(f"{self.run_id} on {self.host_name}: ocean.output not found")
        return False

    error_count = len(grep_output.splitlines())
    if error_count == 0:
        return True

    # At least one "E R R O R" line: cancel the job and report it as not running
    logger.error(
        f"{self.run_id} on {self.host_name}: "
        f"found {error_count} 'E R R O R' line(s) in ocean.output"
    )
    cancel_cmd = f"/usr/bin/qdel {self.job_id}"
    self._ssh_exec_command(
        cancel_cmd, f"{self.run_id} on {self.host_name}: cancelled {self.job_id}"
    )
    return False
def _ssh_exec_command(self, cmd, success_msg=""):
    """Run cmd on the HPC host and return what it wrote to stdout.

    On success, success_msg is logged at the INFO level if it is not empty.
    On failure, each line of stderr from the HPC host is logged at the
    ERROR level, and WorkerError is raised.

    :param str cmd:
    :param str success_msg:

    :raise: WorkerError

    :return: Standard output from the executed command.
    :rtype: str with newline separators
    """
    try:
        cmd_output = ssh_sftp.ssh_exec_command(
            self.ssh_client, cmd, self.host_name, logger
        )
    except ssh_sftp.SSHCommandError as exc:
        for stderr_line in exc.stderr.splitlines():
            logger.error(stderr_line)
        raise WorkerError
    if success_msg:
        logger.info(success_msg)
    return cmd_output
def _get_queue_info(ssh_client, host_name, job_id, ignore_unknown_job=False):
    """Get the full ``qstat`` description of a job from the HPC host.

    When ignore_unknown_job is true and ``qstat`` reports that the job id
    is unknown, a synthetic ``job_state = UNKNOWN`` line is returned instead
    of raising an exception.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str job_id:
    :param boolean ignore_unknown_job:

    :raise: WorkerError if the ``qstat`` command fails for any other reason

    :return: Output from TORQUE/MOAB qstat command that describes the run's state
    :rtype: str
    """
    qstat_cmd = f"/global/system/torque/bin/qstat -f -1 {job_id}"
    try:
        return ssh_sftp.ssh_exec_command(ssh_client, qstat_cmd, host_name, logger)
    except ssh_sftp.SSHCommandError as exc:
        # Must match the exact stderr text that qstat emits for an unknown job
        if (
            ignore_unknown_job
            and exc.stderr == f"qstat: Unknown Job Id {job_id}.orca2.ibb\n"
        ):
            return "job_state = UNKNOWN\n"
        for stderr_line in exc.stderr.splitlines():
            logger.error(stderr_line)
        raise WorkerError
def _get_prev_run_namelist_info(ssh_client, sftp_client, host_name, prev_run_date, config):
    """Read run timing info from the previous run's namelist_cfg file.

    The namelist_cfg file is located on the HPC host by listing the
    directories in the host's scratch directory whose names start with the
    lower-cased ``DDMMMYY`` form of prev_run_date. It is downloaded to a
    local temporary file and parsed.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param :py:class:`paramiko.sftp_client.SFTPClient` sftp_client:
    :param str host_name:
    :param :py:class:`arrow.Arrow` prev_run_date:
    :param :py:class:`nemo_nowcast.Config` config:

    :return: Namespace of run timing info:
               itend: last time step number
               rdt: time step in seconds
    :rtype: :py:class:`types.SimpleNamespace`
    """
    host_config = config["run"]["hindcast hosts"][host_name]
    run_dir_prefix = Path(host_config["scratch dir"]) / (
        prev_run_date.format("DDMMMYY").lower()
    )
    ls_output = ssh_sftp.ssh_exec_command(
        ssh_client, f"ls -d {run_dir_prefix}*/namelist_cfg", host_name, logger
    )
    remote_namelist = ls_output.strip()
    logger.info(f"found previous run namelist: {host_name}:{remote_namelist}")
    # The temporary file only provides a local path for the download;
    # it is removed when the context manager exits
    with tempfile.NamedTemporaryFile("wt") as local_copy:
        sftp_client.get(remote_namelist, local_copy.name)
        parsed = f90nml.read(local_copy.name)
        timing_info = SimpleNamespace(
            itend=parsed["namrun"]["nn_itend"],
            rdt=parsed["namdom"]["rn_rdt"],
        )
    return timing_info
def _launch_run(ssh_client, host_name, run_id, config):
    """Launch the run on the HPC host and pick out its run directory and job id.

    The host's ``salishsea cmd`` is executed with the ``run`` sub-command and
    the ``--debug`` option. The temporary run directory is taken from the
    last word of the 3rd from last line of the command's output, and the job
    id from the last word of the 2nd from last line.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str run_id:
    :param :py:class:`nemo_nowcast.Config` config:

    :raise: WorkerError if the command fails on the HPC host

    :returns: Temporary run directory, and job id from TORQUE/MOAB resource
              manager
    :rtype: 2-tuple of str
    """
    host_config = config["run"]["enabled hosts"][host_name]
    salishsea_cmd = host_config["salishsea cmd"]
    run_desc = Path(host_config["run prep dir"]) / f"{run_id}.yaml"
    results_dir = Path(host_config["scratch dir"]) / run_id[:7]
    cmd = f"{salishsea_cmd} run {run_desc} {results_dir} --debug"
    logger.debug(f"launching run on {host_name}: {cmd}")
    try:
        cmd_output = ssh_sftp.ssh_exec_command(ssh_client, cmd, host_name, logger)
    except ssh_sftp.SSHCommandError as exc:
        for stderr_line in exc.stderr.splitlines():
            logger.error(stderr_line)
        raise WorkerError
    output_lines = cmd_output.splitlines()
    run_dir = output_lines[-3].split()[-1]
    logger.debug(f"temporary run dir: {host_name}:{run_dir}")
    job_id = output_lines[-2].split()[-1]
    logger.info(f"job id for {run_id}: {job_id}")
    return run_dir, job_id
def _get_tmp_run_dir(ssh_client, host_name, scratch_dir, run_id):
    """Find the run's temporary run directory on the HPC host.

    The directory is the first one listed by ``ls -d`` for names in
    scratch_dir that start with run_id followed by an underscore.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param :py:class:`pathlib.Path` scratch_dir:
    :param str run_id:

    :return: Temporary run directory
    :rtype: :py:class:`pathlib.Path`
    """
    ls_cmd = f"ls -d {scratch_dir/run_id}_*"
    listing = ssh_sftp.ssh_exec_command(ssh_client, ls_cmd, host_name, logger)
    first_match = listing.splitlines()[0]
    tmp_run_dir = Path(first_match.strip())
    logger.debug(f"found tmp run dir: {host_name}:{tmp_run_dir}")
    return tmp_run_dir
def _is_running(ssh_client, host_name, job_id, run_id, tmp_run_dir, run_info):
    """Report whether the run's job is executing, and log its progress.

    The job state is read from the ``job_state`` line of the queue info
    for job_id; a job that the queue manager does not know about is treated
    as not running. While the job is in the ``R`` state, the time step number
    in the run's time.step file is used to log the model time reached and
    the fraction of the run that is complete.

    A time.step file that is missing, empty, or that does not yet contain
    an integer is not an error; the run is reported as still running so that
    the caller keeps watching it.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str job_id:
    :param str run_id:
    :param :py:class:`pathlib.Path` tmp_run_dir:
    :param :py:class:`types.SimpleNamespace` run_info: Run timing info;
        the ``it000``, ``itend``, ``rdt``, and ``date0`` attributes are used.

    :return: Flag indicating whether or not run is executing
    :rtype: boolean
    """
    state = "UNKNOWN"
    queue_info = _get_queue_info(
        ssh_client, host_name, job_id, ignore_unknown_job=True
    )
    for line in queue_info.splitlines():
        if line.strip().startswith("job_state"):
            # Line has the form "job_state = X"; the 3rd word is the state
            state = line.split()[2]
            break
    if state != "R":
        return False
    try:
        stdout = ssh_sftp.ssh_exec_command(
            ssh_client, f"cat {tmp_run_dir}/time.step", host_name, logger
        )
    except ssh_sftp.SSHCommandError:
        # time.step file not found; assume that run is young and it
        # hasn't been created yet, or has finished and it has been
        # moved to the results directory
        logger.info(
            f"{run_id} on {host_name}: time.step not found; continuing to watch..."
        )
        return True
    try:
        time_step = int(stdout.splitlines()[0].strip())
    except (IndexError, ValueError):
        # time.step exists but is empty or incomplete (e.g. it was read while
        # being rewritten); skip this progress report instead of crashing
        logger.info(
            f"{run_id} on {host_name}: time.step is empty or unreadable; "
            f"continuing to watch..."
        )
        return True
    steps_done = time_step - run_info.it000
    model_seconds = steps_done * run_info.rdt
    model_time = run_info.date0.shift(seconds=model_seconds).format(
        "YYYY-MM-DD HH:mm:ss UTC"
    )
    fraction_done = steps_done / (run_info.itend - run_info.it000)
    logger.info(
        f"{run_id} on {host_name}: timestep: "
        f"{time_step} = {model_time}, {fraction_done:.1%} complete"
    )
    return True
def _get_squeue_queue_info(ssh_client, host_name, queue_info_cmd, users):
    """Get the job id and name lines for users' jobs from the queue info cmd.

    The first line of the command's output is dropped, and the remaining
    lines are returned in the reverse of the order in which the command
    produced them.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str queue_info_cmd:
    :param str users:

    :raise: WorkerError if the command's output is exactly 1 line;
            i.e. there are no jobs on the queue

    :return: Lines from queue info cmd output showing job ids and run ids
             for users
    :rtype: list
    """
    squeue_cmd = f'{queue_info_cmd} --user {users} --Format "jobid,name" --sort=i'
    cmd_output = ssh_sftp.ssh_exec_command(ssh_client, squeue_cmd, host_name, logger)
    output_lines = cmd_output.splitlines()
    if len(output_lines) == 1:
        # Only the first line is present, so there are no job lines
        logger.error(f"no jobs found on {host_name} queue")
        raise WorkerError
    return list(reversed(output_lines[1:]))
def _get_qstat_queue_info(ssh_client, host_name, queue_info_cmd, users):
    """Get job id and run id pairs for users' jobs from the queue info cmd.

    The first 5 lines of the command's output are dropped. Each remaining
    line is reduced to its 1st field with its last 2 dot-separated parts
    removed, and its 4th field, separated by a space. The results are
    returned in the reverse of the order in which the command produced them.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str queue_info_cmd:
    :param str users:

    :raise: WorkerError if the command's output is exactly 5 lines;
            i.e. there are no jobs on the queue

    :return: Lines from queue info cmd output showing job ids and run ids
             for users
    :rtype: list
    """
    cmd_output = ssh_sftp.ssh_exec_command(
        ssh_client, f"{queue_info_cmd} -u {users}", host_name, logger
    )
    output_lines = cmd_output.splitlines()
    if len(output_lines) == 5:
        # Only the leading 5 lines are present, so there are no job lines
        logger.error(f"no jobs found on {host_name} queue")
        raise WorkerError
    queue_info_lines = []
    for job_line in reversed(output_lines[5:]):
        fields = job_line.split()
        job_id = fields[0].rsplit(".", 2)[0]
        queue_info_lines.append(f"{job_id} {fields[3]}")
    return queue_info_lines