Пример #1
0
    def is_running(self):
        """Report whether the hindcast run is executing, and monitor its health.

        Return :py:obj:`False` immediately unless the job state is "RUNNING".
        Otherwise, read the run's time.step file and report progress from it;
        if that file can't be read yet, keep watching.
        Then search the ocean.output file for "E R R O R" lines:

        * none found: the run is healthy
        * any found: the job is cancelled via scancel
        * more than 1 found: the run is considered to have failed
        * exactly 1 found: the run is considered to be "stuck"; the stuck job
          handler is called, we wait until the job is no longer queued,
          and the temporary run directory and run info are refreshed

        :return: Flag indicating whether or not run is in RUNNING state
        :rtype: boolean
        """
        job_state = self._get_job_state()
        if job_state != "RUNNING":
            return False
        # A missing time.step file is taken to mean that the run is still
        # starting up, so we keep watching
        cat_cmd = f"cat {self.tmp_run_dir}/time.step"
        try:
            time_step_contents = ssh_sftp.ssh_exec_command(
                self.ssh_client, cat_cmd, self.host_name, logger
            )
        except ssh_sftp.SSHCommandError:
            logger.info(
                f"{self.run_id} on {self.host_name}: time.step not found; continuing to watch..."
            )
            return True
        self._report_progress(time_step_contents)
        # Count the "E R R O R" lines in the ocean.output file
        grep_cmd = f"grep 'E R R O R' {self.tmp_run_dir}/ocean.output"
        try:
            grep_output = ssh_sftp.ssh_exec_command(
                self.ssh_client, grep_cmd, self.host_name, logger
            )
        except ssh_sftp.SSHCommandError:
            logger.error(
                f"{self.run_id} on {self.host_name}: ocean.output not found"
            )
            return False
        error_count = len(grep_output.splitlines())
        if error_count == 0:
            return True
        # Any "E R R O R" line at all means that the job has to be cancelled
        logger.error(
            f"{self.run_id} on {self.host_name}: "
            f"found {error_count} 'E R R O R' line(s) in ocean.output"
        )
        self._ssh_exec_command(
            f"/opt/software/slurm/bin/scancel {self.job_id}",
            f"{self.run_id} on {self.host_name}: cancelled {self.job_id}",
        )
        if error_count > 1:
            # Several "E R R O R" lines mean the run failed irrevocably
            return False
        # A lone "E R R O R" line means the run is "stuck" and can be re-queued
        self._handle_stuck_job()
        poll_interval = 5 * 60  # seconds
        while self.is_queued():
            time.sleep(poll_interval)
        self.get_tmp_run_dir()
        self.get_run_info()
        return True
Пример #2
0
def _launch_run(ssh_client, host_name, run_id, prev_job_id, config):
    """Submit the hindcast run to the scheduler on the HPC host.

    The command line is assembled from the host's "salishsea cmd"
    configuration: its executable, optionally preceded by ``export``
    statements for the configured environment variables, and followed by
    the configured run options.
    When a previous job id is provided, the run is made to wait on that job
    and the initial conditions check is disabled.

    If the command fails, stderr from the host is logged at the ERROR level.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str run_id:
    :param int or None prev_job_id:
    :param :py:class:`nemo_nowcast.Config` config:

    :raise: WorkerError
    """
    host_config = config["run"]["hindcast hosts"][host_name]
    cmd_config = host_config["salishsea cmd"]
    executable = cmd_config["executable"]
    options = cmd_config["run options"] or ""
    envvars = cmd_config["envvars"] or {}
    # Joining zero envvars yields an empty string, which leaves the
    # executable without a prefix
    exports = "; ".join(
        f"export {name}={value}" for name, value in envvars.items()
    )
    if exports:
        executable = f"{exports}; {executable}"
    run_desc = Path(host_config["run prep dir"]) / f"{run_id}.yaml"
    results_dir = Path(host_config["scratch dir"]) / run_id[:7]
    cmd = f"{executable} run {run_desc} {results_dir} {options}"
    if prev_job_id:
        cmd = f"{cmd} --waitjob {prev_job_id} --nocheck-initial-conditions"
    try:
        ssh_sftp.ssh_exec_command(ssh_client, cmd, host_name, logger)
    except ssh_sftp.SSHCommandError as exc:
        for err_line in exc.stderr.splitlines():
            logger.error(err_line)
        raise WorkerError
    logger.info(f"{run_id} run submitted to scheduler on {host_name}")
Пример #3
0
    def is_running(self):
        """Report whether the hindcast run is executing, and monitor its health.

        Return :py:obj:`False` immediately unless the job state is "R".
        Otherwise, read the run's time.step file and report progress from it;
        if that file can't be read yet, keep watching.
        Then search the ocean.output file for "E R R O R" lines;
        if any are found, cancel the job via qdel.

        :return: Flag indicating whether or not run is in R state
        :rtype: boolean
        """
        job_state = self._get_job_state()
        if job_state != "R":
            return False
        # A missing time.step file is taken to mean that the run is still
        # starting up, so we keep watching
        cat_cmd = f"cat {self.tmp_run_dir}/time.step"
        try:
            time_step_contents = ssh_sftp.ssh_exec_command(
                self.ssh_client, cat_cmd, self.host_name, logger
            )
        except ssh_sftp.SSHCommandError:
            logger.info(
                f"{self.run_id} on {self.host_name}: time.step not found; continuing to watch..."
            )
            return True
        self._report_progress(time_step_contents)
        # Count the "E R R O R" lines in the ocean.output file
        grep_cmd = f"grep 'E R R O R' {self.tmp_run_dir}/ocean.output"
        try:
            grep_output = ssh_sftp.ssh_exec_command(
                self.ssh_client, grep_cmd, self.host_name, logger
            )
        except ssh_sftp.SSHCommandError:
            logger.error(
                f"{self.run_id} on {self.host_name}: ocean.output not found"
            )
            return False
        error_count = len(grep_output.splitlines())
        if error_count == 0:
            return True
        # Any "E R R O R" line at all means that the job has to be cancelled
        logger.error(
            f"{self.run_id} on {self.host_name}: "
            f"found {error_count} 'E R R O R' line(s) in ocean.output"
        )
        self._ssh_exec_command(
            f"/usr/bin/qdel {self.job_id}",
            f"{self.run_id} on {self.host_name}: cancelled {self.job_id}",
        )
        return False
Пример #4
0
    def _ssh_exec_command(self, cmd, success_msg=""):
        """Run cmd on the HPC host and hand back what it wrote to stdout.

        On success, success_msg (when it is not empty) is logged at the
        INFO level.

        On failure, each line of stderr from the HPC host is logged at the
        ERROR level, and WorkerError is raised.

        :param str cmd:
        :param str success_msg:

        :raise: WorkerError

        :return: Standard output from the executed command.
        :rtype: str with newline separators
        """
        try:
            output = ssh_sftp.ssh_exec_command(
                self.ssh_client, cmd, self.host_name, logger
            )
        except ssh_sftp.SSHCommandError as exc:
            for err_line in exc.stderr.splitlines():
                logger.error(err_line)
            raise WorkerError
        if success_msg:
            logger.info(success_msg)
        return output
Пример #5
0
def _get_queue_info(ssh_client, host_name, job_id, ignore_unknown_job=False):
    """Run qstat on the HPC host to get the full description of a job.

    If qstat fails, its stderr is logged at the ERROR level and WorkerError
    is raised.
    The exception is when ignore_unknown_job is true and stderr is exactly
    the "Unknown Job Id" message for job_id;
    in that case a synthetic "job_state = UNKNOWN" line is returned instead.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str job_id:
    :param boolean ignore_unknown_job:

    :raise: WorkerError

    :return: Output from TORQUE/MOAB qstat command that describes the run's
             state
    :rtype: str
    """
    qstat_cmd = f"/global/system/torque/bin/qstat -f -1 {job_id}"
    try:
        return ssh_sftp.ssh_exec_command(
            ssh_client, qstat_cmd, host_name, logger
        )
    except ssh_sftp.SSHCommandError as exc:
        unknown_job_msg = f"qstat: Unknown Job Id {job_id}.orca2.ibb\n"
        if ignore_unknown_job and exc.stderr == unknown_job_msg:
            return "job_state = UNKNOWN\n"
        for err_line in exc.stderr.splitlines():
            logger.error(err_line)
        raise WorkerError
Пример #6
0
def _get_prev_run_namelist_info(ssh_client, sftp_client, host_name,
                                prev_run_date, config):
    """Read run timing values from the previous run's namelist_cfg file.

    The namelist is located on the HPC host by listing
    ``namelist_cfg`` files in scratch directories whose names start with
    the previous run date formatted as lower-cased ``DDMMMYY``.
    It is downloaded to a temporary file and parsed with f90nml.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param :py:class:`paramiko.sftp_client.SFTPClient` sftp_client:
    :param str host_name:
    :param :py:class:`arrow.Arrow` prev_run_date:
    :param :py:class:`nemo_nowcast.Config` config:

    :return: Namespace of run timing info:
               itend: last time step number
               rdt: time step in seconds
    :rtype: :py:class:`types.SimpleNamespace`
    """
    host_config = config["run"]["hindcast hosts"][host_name]
    scratch_dir = Path(host_config["scratch dir"])
    run_dir_prefix = scratch_dir / prev_run_date.format("DDMMMYY").lower()
    ls_cmd = f"ls -d {run_dir_prefix}*/namelist_cfg"
    listing = ssh_sftp.ssh_exec_command(ssh_client, ls_cmd, host_name, logger)
    remote_namelist = listing.strip()
    logger.info(
        f"found previous run namelist: {host_name}:{remote_namelist}")
    # The temporary file only has to live long enough to be downloaded into
    # and parsed
    with tempfile.NamedTemporaryFile("wt") as local_copy:
        sftp_client.get(remote_namelist, local_copy.name)
        namelist = f90nml.read(local_copy.name)
        return SimpleNamespace(
            itend=namelist["namrun"]["nn_itend"],
            rdt=namelist["namdom"]["rn_rdt"],
        )
Пример #7
0
def _launch_run(ssh_client, host_name, run_id, config):
    """Launch the run on the HPC host via the configured salishsea command.

    The command is executed with its ``--debug`` option.
    The temporary run directory and the job id are taken from the last
    whitespace-separated token of the 3rd-last and 2nd-last lines of the
    command's output, respectively.

    If the command fails, stderr from the host is logged at the ERROR level.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str run_id:
    :param :py:class:`nemo_nowcast.Config` config:

    :raise: WorkerError

    :returns: Temporary run directory, and job id from TORQUE/MOAB resource
              manager
    :rtype: 2-tuple of str
    """
    host_config = config["run"]["enabled hosts"][host_name]
    salishsea_cmd = host_config["salishsea cmd"]
    run_desc = Path(host_config["run prep dir"]) / f"{run_id}.yaml"
    results_dir = Path(host_config["scratch dir"]) / run_id[:7]
    cmd = f"{salishsea_cmd} run {run_desc} {results_dir} --debug"
    logger.debug(f"launching run on {host_name}: {cmd}")
    try:
        stdout = ssh_sftp.ssh_exec_command(ssh_client, cmd, host_name, logger)
    except ssh_sftp.SSHCommandError as exc:
        for err_line in exc.stderr.splitlines():
            logger.error(err_line)
        raise WorkerError
    output_lines = stdout.splitlines()
    run_dir = output_lines[-3].split()[-1]
    logger.debug(f"temporary run dir: {host_name}:{run_dir}")
    job_id = output_lines[-2].split()[-1]
    logger.info(f"job id for {run_id}: {job_id}")
    return run_dir, job_id
Пример #8
0
def _get_tmp_run_dir(ssh_client, host_name, scratch_dir, run_id):
    """Find the run's temporary run directory on the HPC host.

    Directories in scratch_dir whose names start with run_id followed by an
    underscore are listed on the host; the first one listed is returned.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param :py:class:`pathlib.Path` scratch_dir:
    :param str run_id:

    :return: Temporary run directory
    :rtype: :py:class:`pathlib.Path`
    """
    ls_cmd = f"ls -d {scratch_dir/run_id}_*"
    listing = ssh_sftp.ssh_exec_command(ssh_client, ls_cmd, host_name, logger)
    first_match = listing.splitlines()[0]
    tmp_run_dir = Path(first_match.strip())
    logger.debug(f"found tmp run dir: {host_name}:{tmp_run_dir}")
    return tmp_run_dir
Пример #9
0
def _is_running(ssh_client, host_name, job_id, run_id, tmp_run_dir, run_info):
    """Report whether the run is executing, and log its progress if it is.

    The job state is parsed from the ``job_state`` line of the queue info
    for job_id; any state other than "R" means that the run is not executing.
    A job that the queue manager does not know about is treated as being
    in the "UNKNOWN" state.

    While the run is executing, the time step number in its time.step file
    is used to log the model time reached and the fraction of the run
    that is complete.
    A time.step file that is missing, empty, or that does not start with an
    integer is not an error; the run is still reported as executing.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str job_id:
    :param str run_id:
    :param :py:class:`pathlib.Path` tmp_run_dir:
    :param :py:class:`types.SimpleNamespace` run_info:

    :return: Flag indicating whether or not run is executing
    :rtype: boolean
    """
    state = "UNKNOWN"
    queue_info = _get_queue_info(ssh_client,
                                 host_name,
                                 job_id,
                                 ignore_unknown_job=True)
    for line in queue_info.splitlines():
        if line.strip().startswith("job_state"):
            # Line looks like "job_state = R"
            state = line.split()[2]
            break
    if state != "R":
        return False
    try:
        stdout = ssh_sftp.ssh_exec_command(ssh_client,
                                           f"cat {tmp_run_dir}/time.step",
                                           host_name, logger)
        # An empty file raises IndexError here, and a partially written
        # one may raise ValueError
        time_step = int(stdout.splitlines()[0].strip())
    except (ssh_sftp.SSHCommandError, IndexError, ValueError):
        # time.step file not found, empty, or unreadable; assume that run is
        # young and it hasn't been created yet, is in the midst of being
        # written, or has finished and it has been moved to the results
        # directory
        logger.info(
            f"{run_id} on {host_name}: time.step not found; continuing to watch..."
        )
        return True
    steps_done = time_step - run_info.it000
    model_seconds = steps_done * run_info.rdt
    model_time = run_info.date0.shift(
        seconds=model_seconds).format("YYYY-MM-DD HH:mm:ss UTC")
    fraction_done = steps_done / (run_info.itend - run_info.it000)
    logger.info(f"{run_id} on {host_name}: timestep: "
                f"{time_step} = {model_time}, {fraction_done:.1%} complete")
    return True
Пример #10
0
def _get_squeue_queue_info(ssh_client, host_name, queue_info_cmd, users):
    """Get the job ids and run ids of the users' jobs from squeue-style output.

    The queue info command is asked for the ``jobid`` and ``name`` columns,
    sorted by job id.
    The first line of its output is dropped as the header, and the remaining
    lines are returned in reverse order.
    Output that consists of only 1 line means that there are no jobs.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str queue_info_cmd:
    :param str users:

    :raise: WorkerError

    :return: Lines from queue info cmd output showing job ids and run ids for users
    :rtype: list
    """
    squeue_cmd = (
        f'{queue_info_cmd} --user {users} --Format "jobid,name" --sort=i'
    )
    stdout = ssh_sftp.ssh_exec_command(
        ssh_client, squeue_cmd, host_name, logger
    )
    output_lines = stdout.splitlines()
    if len(output_lines) == 1:
        logger.error(f"no jobs found on {host_name} queue")
        raise WorkerError
    job_lines = output_lines[1:]
    return list(reversed(job_lines))
Пример #11
0
def _get_qstat_queue_info(ssh_client, host_name, queue_info_cmd, users):
    """Get the job ids and run ids of the users' jobs from qstat-style output.

    The first 5 lines of the queue info command's output are dropped as
    the header, and the remaining lines are processed in reverse order.
    Each is reduced to its 1st field with its last 2 dot-separated
    components removed, followed by its 4th field.
    Output that consists of only 5 lines means that there are no jobs.

    :param :py:class:`paramiko.client.SSHClient` ssh_client:
    :param str host_name:
    :param str queue_info_cmd:
    :param str users:

    :raise: WorkerError

    :return: Lines from queue info cmd output showing job ids and run ids for users
    :rtype: list
    """
    qstat_cmd = f"{queue_info_cmd} -u {users}"
    stdout = ssh_sftp.ssh_exec_command(
        ssh_client, qstat_cmd, host_name, logger
    )
    output_lines = stdout.splitlines()
    if len(output_lines) == 5:
        logger.error(f"no jobs found on {host_name} queue")
        raise WorkerError
    queue_info_lines = []
    for job_line in reversed(output_lines[5:]):
        fields = job_line.split()
        # Strip the trailing 2 dot-separated parts from the 1st field;
        # presumably a server suffix like ".orca2.ibb" -- TODO confirm
        job_id = fields[0].rsplit(".", 2)[0]
        queue_info_lines.append(f"{job_id} {fields[3]}")
    return queue_info_lines