Example 1
def test_get_host_from_platform_fails_bad_method():
    platform = TEST_PLATFORM.copy()
    platform['selection']['method'] = 'roulette'
    with pytest.raises(CylcError) as err:
        get_host_from_platform(platform, {'Elephant'})
    assert err.exconly() == (
        'cylc.flow.exceptions.CylcError: method "roulette" is not a '
        'supported host selection method.')
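These tests rely on a TEST_PLATFORM fixture that is not shown in this excerpt. A minimal sketch of what it might contain, with the platform name and hosts inferred from the assertions here and in Example 17 (the selection method and any other fields are assumptions):

TEST_PLATFORM = {
    'name': 'Elephant',
    'hosts': ['nellie', 'dumbo', 'jumbo'],
    'selection': {'method': 'definition order'},  # assumed default method
}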
Example 2
    def get_task_auth(self, suite_name: str,
                      task_name: str) -> Union[str, None]:
        """Get host for a remote task from a Cylc workflow definition.

        Returns: Hostname or None if:
          - task does not run remotely.
          - task has not been defined.
        """
        # n.b. Imports inside function to avoid dependency on Cylc and
        # Cylc-Rose if Rose is being used with a different workflow engine.
        from cylc.flow.platforms import get_host_from_platform
        from cylc.flow.hostuserutil import is_remote_platform
        from cylc.rose.platform_utils import get_platform_from_task_def

        # Check whether task has been defined.
        try:
            platform = get_platform_from_task_def(suite_name, task_name)
        except KeyError:
            return None
        else:
            # If task has been defined return host:
            if is_remote_platform(platform):
                return get_host_from_platform(platform)
            else:
                return None
Example 3
    def _setup_job_logs_retrieval(self, itask, event):
        """Set up remote job logs retrieval.

        For a task with a job completion event, i.e. succeeded, failed, or
        (execution) retry.
        """
        id_key = ((self.HANDLER_JOB_LOGS_RETRIEVE, event), str(itask.point),
                  itask.tdef.name, itask.submit_num)
        events = (self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)
        host = get_host_from_platform(itask.platform)
        if (event not in events or not is_remote_host(host)
                or not self.get_host_conf(itask, "retrieve job logs")
                or id_key in self.event_timers):
            return
        retry_delays = self.get_host_conf(itask,
                                          "retrieve job logs retry delays")
        if not retry_delays:
            retry_delays = [0]
        self.event_timers[id_key] = TaskActionTimer(
            TaskJobLogsRetrieveContext(
                self.HANDLER_JOB_LOGS_RETRIEVE,  # key
                self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
                itask.platform['name'],
                self.get_host_conf(itask, "retrieve job logs max size"),
            ),
            retry_delays)
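For reference, the id_key built above is a nested tuple keyed on handler, event, cycle point, task name and submit number; a sketch of its shape for a hypothetical succeeded task (the handler string and all values are illustrative only):

id_key = (
    ('job-logs-retrieve', 'succeeded'),  # (self.HANDLER_JOB_LOGS_RETRIEVE, event)
    '20200101T0000Z',                    # str(itask.point)
    'foo',                               # itask.tdef.name
    1,                                   # itask.submit_num
)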
Example 4
    def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
        """Process retrieval of task job logs from remote user@host."""
        platform = get_platform(ctx.platform_n)
        ssh_str = str(platform["ssh command"])
        rsync_str = str(platform["retrieve job logs command"])

        cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("-v")
        if ctx.max_size:
            cmd.append("--max-size=%s" % (ctx.max_size, ))
        # Includes and excludes
        includes = set()
        for _, point, name, submit_num in id_keys:
            # Include relevant directories, all levels needed
            includes.add("/%s" % (point))
            includes.add("/%s/%s" % (point, name))
            includes.add("/%s/%s/%02d" % (point, name, submit_num))
            includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
        cmd += ["--include=%s" % (include) for include in sorted(includes)]
        cmd.append("--exclude=/**")  # exclude everything else
        # Remote source
        cmd.append("%s:%s/" %
                   (get_host_from_platform(platform),
                    get_remote_suite_run_job_dir(platform, schd_ctx.suite)))
        # Local target
        cmd.append(get_suite_run_job_dir(schd_ctx.suite) + "/")
        self.proc_pool.put_command(
            SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
            self._job_logs_retrieval_callback, [schd_ctx])
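As a standalone sketch, this is the include/exclude filter list the loop above produces for a single job (the cycle point, task name and submit number are placeholders); rsync evaluates filter rules in order, which is why the catch-all exclude must come last:

point, name, submit_num = '20200101T0000Z', 'foo', 1
includes = sorted({
    f"/{point}",
    f"/{point}/{name}",
    f"/{point}/{name}/{submit_num:02d}",
    f"/{point}/{name}/{submit_num:02d}/**",
})
filters = [f"--include={i}" for i in includes] + ["--exclude=/**"]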
Example 5
    def get_suite_jobs_auths(
            self,
            suite_name: str,
            cycle_name_tuples: Tuple[Any] = None) -> List[str]:
        """Get hosts of jobs from a Cylc workflow database.

        Returns: list of hostname strings.
        """
        # n.b. Imports inside function to avoid dependency on Cylc and
        # Cylc-Rose if Rose is being used with a different workflow engine.
        from cylc.flow.platforms import get_host_from_platform
        from cylc.rose.platform_utils import get_platforms_from_task_jobs

        task_platforms = {}
        if cycle_name_tuples is not None:
            for cycle, name in cycle_name_tuples:
                new_platforms = get_platforms_from_task_jobs(suite_name, cycle)
                task_platforms[cycle] = new_platforms

        # For each platform get a list of hosts.
        hosts = []
        for cycle, tasks in task_platforms.items():
            for platform in tasks.values():
                hosts.append(get_host_from_platform(platform))
        hosts = list(set(hosts))
        return hosts
Example 6
def main(_, options: 'Values', *ids) -> None:
    workflow_id, _, flow_file = parse_id(
        *ids,
        src=True,
        constraint='workflows',
    )

    # extract task host platforms from the workflow_id
    config = WorkflowConfig(
        workflow_id, flow_file, options,
        load_template_vars(options.templatevars, options.templatevars_file))

    platforms = {
        config.get_config(['runtime', name, 'platform'])
        for name in config.get_namespace_list('all tasks')
    } - {None, 'localhost'}

    # When "workflow run hosts" are formalised as "flow platforms"
    # we can substitute `localhost` for this, in the mean time
    # we will have to assume that flow hosts are configured correctly.

    if not platforms:
        sys.exit(0)

    verbose = cylc.flow.flags.verbosity > 0

    # get the cylc version on each platform
    versions = {}
    for platform_name in sorted(platforms):
        platform = get_platform(platform_name)
        host = get_host_from_platform(platform, bad_hosts=None)
        cmd = construct_ssh_cmd(['version'], platform, host)
        if verbose:
            print(cmd)
        proc = procopen(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
        out, err = proc.communicate()
        out = out.decode()
        err = err.decode()
        if proc.wait() == 0:
            if verbose:
                print("   %s" % out)
            versions[platform_name] = out.strip()
        else:
            versions[platform_name] = f'ERROR: {err.strip()}'

    # report results
    max_len = max((len(platform_name) for platform_name in platforms))
    print(f'{"platform".rjust(max_len)}: cylc version')
    print('-' * (max_len + 14))
    for platform_name, result in versions.items():
        print(f'{platform_name.rjust(max_len)}: {result}')
    if all((version == CYLC_VERSION for version in versions.values())):
        ret_code = 0
    elif options.error:
        ret_code = 1
    else:
        ret_code = 0
    sys.exit(ret_code)
Example 7
 def construct_remote_tidy_ssh_cmd(install_target, platform):
     cmd = ['remote-tidy']
     if cylc.flow.flags.verbosity > 1:
         cmd.append('--debug')
     cmd.append(install_target)
     cmd.append(get_remote_workflow_run_dir(self.workflow))
     host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
     cmd = construct_ssh_cmd(cmd, platform, host, timeout='10s')
     return cmd, host
Example 8
 def insert_db_job(self, row_idx, row):
     """Load job element from DB post restart."""
     if row_idx == 0:
         LOG.info("LOADING job data")
     (point_string, name, status, submit_num, time_submit, time_run,
      time_run_exit, batch_sys_name, batch_sys_job_id, platform_name) = row
     if status not in JOB_STATUS_SET:
         return
     t_id = f'{self.workflow_id}{ID_DELIM}{point_string}{ID_DELIM}{name}'
     j_id = f'{t_id}{ID_DELIM}{submit_num}'
     try:
         tdef = self.schd.config.get_taskdef(name)
         j_owner = self.schd.owner
         if platform_name:
             j_host = get_host_from_platform(get_platform(platform_name))
         else:
             j_host = self.schd.host
         j_buf = PbJob(
             stamp=f'{j_id}@{time()}',
             id=j_id,
             submit_num=submit_num,
             state=status,
             task_proxy=t_id,
             submitted_time=time_submit,
             started_time=time_run,
             finished_time=time_run_exit,
             batch_sys_name=batch_sys_name,
             batch_sys_job_id=batch_sys_job_id,
             host=j_host,
             owner=j_owner,
             name=name,
             cycle_point=point_string,
         )
         # Add in log files.
         j_buf.job_log_dir = get_task_job_log(self.schd.suite, point_string,
                                              name, submit_num)
         overrides = self.schd.task_events_mgr.broadcast_mgr.get_broadcast(
             TaskID.get(name, point_string))
         if overrides:
             rtconfig = pdeepcopy(tdef.rtconfig)
             poverride(rtconfig, overrides, prepend=True)
         else:
             rtconfig = tdef.rtconfig
         j_buf.extra_logs.extend([
             os.path.expanduser(os.path.expandvars(log_file))
             for log_file in rtconfig['extra log files']
         ])
     except SuiteConfigError:
         LOG.exception(
             ('ignoring job %s from the suite run database\n'
              '(its task definition has probably been deleted).') % j_id)
     except Exception:
         LOG.exception('could not load job %s' % j_id)
     else:
         self.added[j_id] = j_buf
         self.task_jobs.setdefault(t_id, set()).add(j_id)
         self.updates_pending = True
Example 9
 def construct_remote_tidy_ssh_cmd(
         platform: Dict[str, Any]) -> Tuple[List[str], str]:
     cmd = ['remote-tidy']
     cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
     cmd.append(get_install_target_from_platform(platform))
     cmd.append(get_remote_workflow_run_dir(self.workflow))
     host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
     cmd = construct_ssh_cmd(cmd, platform, host, timeout='10s')
     return cmd, host
Example 10
    def remote_tidy(self):
        """Remove suite contact files and keys from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.
        """
        # Issue all SSH commands in parallel
        procs = {}
        for platform, init_with_contact in self.remote_init_map.items():
            platform = get_platform(platform)
            host = get_host_from_platform(platform)
            owner = platform['owner']
            self.install_target = get_install_target_from_platform(platform)
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['remote-tidy']
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            cmd.append(str(f'{self.install_target}'))
            cmd.append(get_remote_suite_run_dir(platform, self.suite))
            if is_remote_platform(platform):
                cmd = construct_platform_ssh_cmd(cmd, platform, timeout='10s')
            else:
                cmd = ['cylc'] + cmd
            procs[(host, owner)] = (
                cmd,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=DEVNULL))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = (f.decode() for f in proc.communicate())
                if proc.wait():
                    LOG.warning(TaskRemoteMgmtError(
                        TaskRemoteMgmtError.MSG_TIDY,
                        (host, owner), ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    (host, owner), ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
Example 11
def remote_cylc_cmd(cmd, platform, **kwargs):
    """Execute a Cylc command on a remote platform.

    Uses the platform configuration to construct the command.

    See _construct_ssh_cmd for argument documentation.
    """
    return _remote_cylc_cmd(cmd,
                            host=get_host_from_platform(platform),
                            ssh_cmd=platform['ssh command'],
                            remote_cylc_path=platform['cylc path'],
                            ssh_login_shell=platform['use login shell'],
                            **kwargs)
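A hedged usage sketch of this wrapper; the platform name and workflow ID are placeholders, and the import path for get_platform is assumed to match the one used elsewhere in these examples:

from cylc.flow.platforms import get_platform

# 'my_hpc' is a hypothetical platform name from the global configuration.
platform = get_platform('my_hpc')
# Runs "cylc ping my_workflow" on a host selected from the platform.
remote_cylc_cmd(['ping', 'my_workflow'], platform)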
Example 12
def construct_ssh_cmd(raw_cmd, platform, **kwargs):
    """Build an SSH command for execution on a remote platform.

    Constructs the SSH command according to the platform configuration.

    See _construct_ssh_cmd for argument documentation.
    """
    return _construct_ssh_cmd(raw_cmd,
                              host=get_host_from_platform(platform),
                              ssh_cmd=platform['ssh command'],
                              remote_cylc_path=platform['cylc path'],
                              ssh_login_shell=platform['use login shell'],
                              **kwargs)
Example 13
def construct_platform_ssh_cmd(raw_cmd, platform, **kwargs):
    """A wrapper around `construct_ssh_cmd` allowing us to pass a platform
    object rather than a user and host.

    Args:
        All as `construct_ssh_cmd` except for user and host.
    """
    ret = construct_ssh_cmd(raw_cmd,
                            host=get_host_from_platform(platform),
                            ssh_cmd=platform['ssh command'],
                            ssh_cylc=platform['cylc executable'],
                            ssh_login_shell=platform['use login shell'],
                            **kwargs)
    return ret
Example 14
def construct_rsync_over_ssh_cmd(src_path: str,
                                 dst_path: str,
                                 platform: Dict[str, Any],
                                 rsync_includes=None,
                                 bad_hosts=None) -> Tuple[List[str], str]:
    """Constructs the rsync command used for remote file installation.

    Includes as standard the directories: app, bin, etc, lib; and the server
    key, used for ZMQ authentication.

    Args:
        src_path: source path
        dst_path: path of target
        platform: contains info relating to platform
        rsync_includes: files and directories to be included in the rsync
        bad_hosts: hosts to exclude when selecting the destination host

    Developer Warning:
        The Cylc Subprocess Pool method ``rsync_255_fail`` relies on
        ``rsync_cmd[0] == 'rsync'``. Please check that changes to this function
        do not break ``rsync_255_fail``.
    """
    dst_host = get_host_from_platform(platform, bad_hosts=bad_hosts)
    ssh_cmd = platform['ssh command']
    command = platform['rsync command']
    rsync_cmd = shlex.split(command)
    rsync_options = [
        "--delete", "--rsh=" + ssh_cmd, "--include=/.service/",
        "--include=/.service/server.key"
    ] + DEFAULT_RSYNC_OPTS
    # Note to future devs - be wary of changing the order of the following
    # rsync options, rsync is very particular about order of in/ex-cludes.
    rsync_cmd.extend(rsync_options)
    for exclude in ['log', 'share', 'work']:
        rsync_cmd.append(f"--exclude={exclude}")
    default_includes = ['/app/***', '/bin/***', '/etc/***', '/lib/***']
    for include in default_includes:
        rsync_cmd.append(f"--include={include}")
    for include in get_includes_to_rsync(rsync_includes):
        rsync_cmd.append(f"--include={include}")
    # Exclude everything else (required in case anything not explicitly
    # included above is added to the source directory).
    rsync_cmd.append("--exclude=*")
    rsync_cmd.append(f"{src_path}/")
    rsync_cmd.append(f"{dst_host}:{dst_path}/")
    return rsync_cmd, dst_host
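A hedged usage sketch (source path, destination path and includes are placeholders; bad_hosts would normally be the scheduler's set of currently unreachable hosts). Per the developer warning above, the first element of the returned command is expected to remain the platform's rsync executable:

rsync_cmd, dst_host = construct_rsync_over_ssh_cmd(
    '/path/to/workflow/source',     # src_path (placeholder)
    '/path/to/remote/run/dir',      # dst_path (placeholder)
    platform,                       # platform dict as used above
    rsync_includes=['dir/***'],     # extra includes (placeholder)
    bad_hosts=None,
)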
Example 15
def construct_rsync_over_ssh_cmd(
        src_path, dst_path, platform, rsync_includes=None):
    """Constructs the rsync command used for remote file installation.

    Includes as standard the directories: app, bin, etc, lib; and the server
    key, used for ZMQ authentication.

    Args:
        src_path(string): source path
        dst_path(string): path of target
        platform(dict): contains info relating to platform
        rsync_includes(list): files and directories to be included in the rsync

    """
    dst_host = get_host_from_platform(platform)
    ssh_cmd = platform['ssh command']
    rsync_cmd = [
        "rsync",
        "--delete",
        "--rsh=" + ssh_cmd,
        "--include=/.service/",
        "--include=/.service/server.key"
    ] + DEFAULT_RSYNC_OPTS
    # Note to future devs - be wary of changing the order of the following
    # rsync options, rsync is very particular about order of in/ex-cludes.

    for exclude in ['log', 'share', 'work']:
        rsync_cmd.append(f"--exclude={exclude}")
    default_includes = [
        '/app/***',
        '/bin/***',
        '/etc/***',
        '/lib/***']
    for include in default_includes:
        rsync_cmd.append(f"--include={include}")
    for include in get_includes_to_rsync(rsync_includes):
        rsync_cmd.append(f"--include={include}")
    # Exclude everything else (required in case anything not explicitly
    # included above is added to the source directory).
    rsync_cmd.append("--exclude=*")
    rsync_cmd.append(f"{src_path}/")
    rsync_cmd.append(f"{dst_host}:{dst_path}/")
    return rsync_cmd
Example 16
    def get_task_auth(self, suite_name: str,
                      task_name: str) -> Union[str, None]:
        """Get host for a remote task from a Cylc workflow definition.

        Returns: Hostname, or None if:
          - task does not run remotely.
          - task has not been defined.
          - cylc-rose is not installed (*)

        (*) This function is only used by the fcm_make built-in app. Returning
        None is equivalent to there being no fcm_make2 task or no workflow
        file found, which is fine: two-stage fcm_make is only supported on the
        localhost install target (the workflow files aren't mirrored).

        """
        # n.b. Imports inside function to avoid dependency on Cylc and
        # Cylc-Rose if Rose is being used with a different workflow engine.
        from cylc.flow.exceptions import WorkflowFilesError
        from cylc.flow.hostuserutil import is_remote_platform
        from cylc.flow.platforms import get_host_from_platform
        try:
            from cylc.rose.platform_utils import get_platform_from_task_def
        except ModuleNotFoundError:
            # Allow single stage fcm_make app to work without requiring
            # cylc.rose
            return None

        try:
            platform = get_platform_from_task_def(suite_name, task_name)
        except KeyError:
            return None
        except WorkflowFilesError:
            raise WorkflowFileNotFoundError
        else:
            if platform is None:
                return 'localhost'
            # If task has been defined return host:
            if is_remote_platform(platform):
                return get_host_from_platform(platform)
            else:
                return None
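A hedged sketch of how a caller (such as the fcm_make built-in app mentioned above) might use this; the object and helper names are hypothetical:

host = suite_engine.get_task_auth('my_suite', 'fcm_make2_build')  # hypothetical names
if host is None:
    # Task is local, undefined, or cylc-rose is unavailable: build locally only.
    pass
else:
    # Mirror the make directory and run fcm_make2 on `host`.
    pass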
Example 17
def test_get_host_from_platform_fails_no_goodhosts():
    platform = TEST_PLATFORM
    with pytest.raises(NoHostsError) as err:
        get_host_from_platform(platform, {'nellie', 'dumbo', 'jumbo'})
    assert err.exconly() == ('cylc.flow.exceptions.NoHostsError: '
                             'Unable to find valid host for Elephant')
Example 18
    def submit_task_jobs(self,
                         suite,
                         itasks,
                         curve_auth,
                         client_pub_key_dir,
                         is_simulation=False):
        """Prepare for job submission and submit task jobs.

        Preparation (host selection, remote host init, and remote install)
        is done asynchronously. Newly released tasks may be sent here several
        times until these init subprocesses have returned. Failure during
        preparation is considered to be job submission failure.

        Once preparation has completed or failed, reset .waiting_on_job_prep in
        task instances so the scheduler knows to stop sending them back here.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.subshell_eval_reset()

        if not prepared_tasks:
            return bad_tasks
        auth_itasks = {}  # {platform: [itask, ...], ...}
        for itask in prepared_tasks:
            platform_name = itask.platform['name']
            auth_itasks.setdefault(platform_name, [])
            auth_itasks[platform_name].append(itask)
        # Submit task jobs for each platform
        done_tasks = bad_tasks

        for platform_name, itasks in sorted(auth_itasks.items()):
            platform = itasks[0].platform
            install_target = get_install_target_from_platform(platform)
            ri_map = self.task_remote_mgr.remote_init_map

            if (ri_map.get(install_target) != REMOTE_FILE_INSTALL_DONE):
                if install_target == get_localhost_install_target():
                    # Skip init and file install for localhost.
                    LOG.debug(f"REMOTE INIT NOT REQUIRED for {install_target}")
                    ri_map[install_target] = (REMOTE_FILE_INSTALL_DONE)

                elif install_target not in ri_map:
                    # Remote init not in progress for target, so start it.
                    self.task_remote_mgr.remote_init(platform, curve_auth,
                                                     client_pub_key_dir)
                    for itask in itasks:
                        itask.set_summary_message(self.REMOTE_INIT_MSG)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num),
                            self.REMOTE_INIT_MSG)
                    continue

                elif (ri_map[install_target] == REMOTE_INIT_DONE):
                    # Already done remote init so move on to file install
                    self.task_remote_mgr.file_install(platform)
                    continue

                elif (ri_map[install_target] in self.IN_PROGRESS.keys()):
                    # Remote init or file install in progress.
                    for itask in itasks:
                        msg = self.IN_PROGRESS[ri_map[install_target]]
                        itask.set_summary_message(msg)
                        self.data_store_mgr.delta_job_msg(
                            get_task_job_id(itask.point, itask.tdef.name,
                                            itask.submit_num), msg)
                    continue

            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            host = get_host_from_platform(platform)
            if (self.job_runner_mgr.is_job_local_to_host(
                    itask.summary['job_runner_name'])
                    and not is_remote_platform(platform)):
                host = get_host()

            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, host=%s', itask,
                         itask.submit_num, host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'platform_name': itask.platform['name'],
                        'job_runner_name': itask.summary['job_runner_name'],
                    })
                itask.is_manual_submit = False

            if (ri_map[install_target]
                    in [REMOTE_INIT_FAILED, REMOTE_FILE_INSTALL_FAILED]):
                # Remote init or install failed. Set submit-failed for all
                # affected tasks and remove target from remote init map
                # - this enables new tasks to re-initialise that target
                init_error = (ri_map[install_target])
                del ri_map[install_target]
                for itask in itasks:
                    itask.waiting_on_job_prep = False
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % host,
                                       err=init_error,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self._prep_submit_task_job_error(suite, itask,
                                                     '(remote init)', '')

                continue
            # Build the "cylc jobs-submit" command
            cmd = [self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            if is_remote_platform(itask.platform):
                remote_mode = True
                cmd.append('--remote-mode')
            else:
                remote_mode = False
            if itask.platform['clean job submission environment']:
                cmd.append('--clean-env')
            for var in itask.platform[
                    'job submission environment pass-through']:
                cmd.append(f"--env={var}")
            for path in itask.platform[
                    'job submission executable paths'] + SYSPATH:
                cmd.append(f"--path={path}")
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(platform, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = (len(itasks) // (
                (len(itasks) // platform['max batch submit size']) + 1) + 1)
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])

            if remote_mode:
                cmd = construct_ssh_cmd(cmd, platform)
            else:
                cmd = ['cylc'] + cmd

            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            os.path.expandvars(
                                get_task_job_job_log(suite, itask.point,
                                                     itask.tdef.name,
                                                     itask.submit_num)))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)

                    itask.waiting_on_job_prep = False
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                    ), self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
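The batching arithmetic above caps each "cylc jobs-submit" invocation at roughly the platform's 'max batch submit size'; a quick worked check with illustrative numbers:

n_tasks, max_batch = 250, 100                               # illustrative values
chunk_size = n_tasks // ((n_tasks // max_batch) + 1) + 1    # == 84
batches = [list(range(n_tasks))[i:i + chunk_size]
           for i in range(0, n_tasks, chunk_size)]
assert [len(b) for b in batches] == [84, 84, 82]            # three batches, all <= 100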
Example 19
def test_get_host_from_platform(badhosts, expect):
    platform = TEST_PLATFORM
    assert get_host_from_platform(platform, badhosts) == expect
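The parametrisation for this test is not included in the excerpt; a plausible set of cases, assuming the TEST_PLATFORM hosts sketched under Example 1 and definition-order host selection (illustrative only):

# @pytest.mark.parametrize(
#     'badhosts, expect',
#     [
#         (set(), 'nellie'),               # no bad hosts: first host selected
#         ({'nellie'}, 'dumbo'),           # bad hosts are skipped
#         ({'nellie', 'dumbo'}, 'jumbo'),
#     ],
# )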
Example 20
    def remote_init(self, platform: Dict[str, Any],
                    curve_auth: 'ThreadAuthenticator',
                    client_pub_key_dir: str) -> None:
        """Initialise a remote host if necessary.

        Call "cylc remote-init" to install workflow items to remote:
            ".service/contact": For TCP task communication
            "python/": if source exists

        Args:
            platform: A dict containing settings relating to platform used in
                this remote installation.
            curve_auth: The ZMQ authenticator.
            client_pub_key_dir: Client public key directory, used by the
                ZMQ authenticator.

        """
        install_target = platform['install target']
        if install_target == get_localhost_install_target():
            self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
            return

        # Set status of install target to in progress while waiting for remote
        # initialisation to finish
        self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

        # Determine what items to install
        comms_meth: CommsMeth = CommsMeth(platform['communication method'])
        items = self._remote_init_items(comms_meth)

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # Build the remote-init command to be run over ssh
        cmd = ['remote-init']
        cmd.extend(verbosity_to_opts(cylc.flow.flags.verbosity))
        cmd.append(str(install_target))
        cmd.append(get_remote_workflow_run_dir(self.workflow))
        dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
        for key, value in dirs_to_symlink.items():
            if value is not None:
                cmd.append(f"{key}={quote(value)} ")
        # Create the ssh command
        try:
            host = get_host_from_platform(platform, bad_hosts=self.bad_hosts)
        except NoHostsError as exc:
            LOG.error(
                PlatformError(
                    f'{PlatformError.MSG_INIT}\n{exc}',
                    platform['name'],
                ))
            self.remote_init_map[
                platform['install target']] = REMOTE_INIT_FAILED
            self.bad_hosts -= set(platform['hosts'])
            self.ready = True
        else:
            log_platform_event('remote init', platform, host)
            cmd = construct_ssh_cmd(cmd, platform, host)
            self.proc_pool.put_command(
                SubProcContext('remote-init',
                               cmd,
                               stdin_files=[tmphandle],
                               host=host),
                bad_hosts=self.bad_hosts,
                callback=self._remote_init_callback,
                callback_args=[
                    platform, tmphandle, curve_auth, client_pub_key_dir
                ],
                callback_255=self._remote_init_callback_255,
                callback_255_args=[platform])
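The symlink directives appended above become key=value arguments on the remote-init command line; a sketch for hypothetical settings (paths illustrative, None entries are skipped):

from shlex import quote

dirs_to_symlink = {'run': '$DATADIR/cylc-run', 'log': None}  # hypothetical
cmd = ['remote-init']
for key, value in dirs_to_symlink.items():
    if value is not None:
        cmd.append(f"{key}={quote(value)} ")  # note the trailing space, as above
# cmd == ['remote-init', "run='$DATADIR/cylc-run' "]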
Example 21
    def remote_init(self, platform, curve_auth,
                    client_pub_key_dir):
        """Initialise a remote [owner@]host if necessary.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            "python/": if source exists

        Args:
            curve_auth (ThreadAuthenticator):
                The ZMQ authenticator.
            client_pub_key_dir (str):
                Client public key directory, used by the ZMQ authenticator.
            platform (dict):
                A dictionary containing settings relating to platform used in
                this remote installation.

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        self.install_target = platform['install target']

        # If task is running locally or the install target is localhost
        # we can skip the rest of this function
        if (self.install_target == 'localhost' or
                self.single_task_mode or
                not is_remote_host(get_host_from_platform(platform))):
            LOG.debug(f"REMOTE INIT NOT REQUIRED for {self.install_target}")
            return REMOTE_INIT_NOT_REQUIRED

        # See if a previous failed attempt to initialize this platform has
        # occurred.
        try:
            status = self.remote_init_map[platform['install target']]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[platform['install target']]
            return status

        # Determine what items to install
        comm_meth = platform['communication method']

        # Get a list of files and folders to install;
        # if nothing needs install say so to remote_init_map and return.
        items = self._remote_init_items(comm_meth)

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # Build the remote-init command to be run over ssh
        cmd = ['remote-init']
        if cylc.flow.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.install_target))
        cmd.append(get_remote_suite_run_dir(platform, self.suite))
        # Create the ssh command
        cmd = construct_platform_ssh_cmd(cmd, platform)

        self.proc_pool.put_command(
            SubProcContext(
                'remote-init',
                cmd,
                stdin_files=[tmphandle]),
            self._remote_init_callback,
            [platform, tmphandle,
             curve_auth, client_pub_key_dir])
        # None status: Waiting for command to finish
        self.remote_init_map[platform['install target']] = None
        return self.remote_init_map[platform['install target']]
Example 22
def main(parser, options, *args, color=False):
    """Implement cylc cat-log CLI.

    Determine log path, user@host, batchview_cmd, and action (print, dir-list,
    cat, edit, or tail), and then if the log path is:
      a) local: perform action on log path, or
      b) remote: re-invoke cylc cat-log as a) on the remote account

    """
    if options.remote_args:
        # Invoked on job hosts for job logs only, as a wrapper to view_log().
        # Tail and batchview commands come from global config on the suite host.
        logpath, mode, tail_tmpl = options.remote_args[0:3]
        logpath = os.path.expandvars(logpath)
        tail_tmpl = os.path.expandvars(tail_tmpl)
        try:
            batchview_cmd = options.remote_args[3]
        except IndexError:
            batchview_cmd = None
        res = view_log(logpath,
                       mode,
                       tail_tmpl,
                       batchview_cmd,
                       remote=True,
                       color=color)
        if res == 1:
            sys.exit(res)
        return

    suite_name = args[0]
    # Get long-format mode.
    try:
        mode = MODES[options.mode]
    except KeyError:
        mode = options.mode

    if len(args) == 1:
        # Cat suite logs, local only.
        if options.filename is not None:
            raise UserInputError("The '-f' option is for job logs only.")

        logpath = get_suite_run_log_name(suite_name)
        if options.rotation_num:
            logs = glob('%s.*' % logpath)
            logs.sort(key=os.path.getmtime, reverse=True)
            try:
                logpath = logs[int(options.rotation_num)]
            except IndexError:
                raise UserInputError("max rotation %d" % (len(logs) - 1))
        tail_tmpl = os.path.expandvars(get_platform()["tail command template"])
        out = view_log(logpath, mode, tail_tmpl, color=color)
        if out == 1:
            sys.exit(1)
        if mode == 'edit':
            tmpfile_edit(out, options.geditor)
        return

    if len(args) == 2:
        # Cat task job logs, may be on suite or job host.
        if options.rotation_num is not None:
            raise UserInputError("only suite (not job) logs get rotated")
        task_id = args[1]
        try:
            task, point = TaskID.split(task_id)
        except ValueError:
            parser.error("Illegal task ID: %s" % task_id)
        if options.submit_num != NN:
            try:
                options.submit_num = "%02d" % int(options.submit_num)
            except ValueError:
                parser.error("Illegal submit number: %s" % options.submit_num)
        if options.filename is None:
            options.filename = JOB_LOG_OUT
        else:
            # Convert short filename args to long (e.g. 'o' to 'job.out').
            try:
                options.filename = JOB_LOG_OPTS[options.filename]
            except KeyError:
                # Is already long form (standard log, or custom).
                pass
        platform_name, batch_sys_name, live_job_id = get_task_job_attrs(
            suite_name, point, task, options.submit_num)
        platform = get_platform(platform_name)
        batchview_cmd = None
        if live_job_id is not None:
            # Job is currently running. Get special batch system log view
            # command (e.g. qcat) if one exists, and the log is out or err.
            conf_key = None
            if options.filename == JOB_LOG_OUT:
                if mode == 'cat':
                    conf_key = "out viewer"
                elif mode == 'tail':
                    conf_key = "out tailer"
            elif options.filename == JOB_LOG_ERR:
                if mode == 'cat':
                    conf_key = "err viewer"
                elif mode == 'tail':
                    conf_key = "err tailer"
            if conf_key is not None:
                batchview_cmd_tmpl = None
                try:
                    batchview_cmd_tmpl = platform[conf_key]
                except KeyError:
                    pass
                if batchview_cmd_tmpl is not None:
                    batchview_cmd = batchview_cmd_tmpl % {
                        "job_id": str(live_job_id)
                    }

        log_is_remote = (is_remote_platform(platform)
                         and (options.filename != JOB_LOG_ACTIVITY))
        log_is_retrieved = (platform['retrieve job logs']
                            and live_job_id is None)
        if log_is_remote and (not log_is_retrieved or options.force_remote):
            logpath = os.path.normpath(
                get_remote_suite_run_job_dir(platform, suite_name, point, task,
                                             options.submit_num,
                                             options.filename))
            tail_tmpl = platform["tail command template"]
            # Reinvoke the cat-log command on the remote account.
            cmd = ['cat-log']
            if cylc.flow.flags.debug:
                cmd.append('--debug')
            for item in [logpath, mode, tail_tmpl]:
                cmd.append('--remote-arg=%s' % quote(item))
            if batchview_cmd:
                cmd.append('--remote-arg=%s' % quote(batchview_cmd))
            cmd.append(suite_name)
            is_edit_mode = (mode == 'edit')
            try:
                host = get_host_from_platform(platform)
                proc = remote_cylc_cmd(cmd,
                                       host,
                                       capture_process=is_edit_mode,
                                       manage=(mode == 'tail'))
            except KeyboardInterrupt:
                # Ctrl-C while tailing.
                pass
            else:
                if is_edit_mode:
                    # Write remote stdout to a temp file for viewing in editor.
                    # Only BUFSIZE bytes at a time in case huge stdout volume.
                    out = NamedTemporaryFile()
                    data = proc.stdout.read(BUFSIZE)
                    while data:
                        out.write(data)
                        data = proc.stdout.read(BUFSIZE)
                    os.chmod(out.name, S_IRUSR)
                    out.seek(0, 0)
        else:
            # Local task job or local job log.
            logpath = os.path.normpath(
                get_suite_run_job_dir(suite_name, point, task,
                                      options.submit_num, options.filename))
            tail_tmpl = os.path.expandvars(platform["tail command template"])
            out = view_log(logpath,
                           mode,
                           tail_tmpl,
                           batchview_cmd,
                           color=color)
            if mode != 'edit':
                sys.exit(out)
        if mode == 'edit':
            tmpfile_edit(out, options.geditor)
Example 23
    def submit_task_jobs(self,
                         suite,
                         itasks,
                         curve_auth,
                         client_pub_key_dir,
                         is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing the job
        file, or failed remote initialisation marks the task as bad, leading
        to submission failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.subshell_eval_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (install target)
        auth_itasks = {}  # {install target: [itask, ...], ...}

        for itask in prepared_tasks:
            install_target = get_install_target_from_platform(itask.platform)
            auth_itasks.setdefault(install_target, [])
            auth_itasks[install_target].append(itask)
        # Submit task jobs for each platform
        done_tasks = bad_tasks
        for install_target, itasks in sorted(auth_itasks.items()):
            # Re-fetch a copy of platform
            platform = itasks[0].platform
            is_init = self.task_remote_mgr.remote_init(platform, curve_auth,
                                                       client_pub_key_dir)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                    self.job_pool.add_job_msg(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num),
                        self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            host = get_host_from_platform(platform)
            if (self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name'])
                    and not is_remote_platform(platform)):
                host = get_host()

            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info('[%s] -submit-num=%02d, owner@host=%s', itask,
                         itask.submit_num, host)
                self.suite_db_mgr.put_insert_task_jobs(
                    itask, {
                        'is_manual_submit': itask.is_manual_submit,
                        'try_num': itask.get_try_num(),
                        'time_submit': now_str,
                        'platform_name': platform['name'],
                        'batch_sys_name': itask.summary['batch_sys_name'],
                    })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(self.JOBS_SUBMIT,
                                       '(init %s)' % host,
                                       err=REMOTE_INIT_FAILED,
                                       ret_code=1), suite, itask.point,
                        itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = [self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            if is_remote_platform(itask.platform):
                remote_mode = True
                cmd.append('--remote-mode')
            else:
                remote_mode = False
            cmd.append('--')
            cmd.append(get_remote_suite_run_job_dir(platform, suite))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)
            ]
            LOG.debug('%s ... # will invoke in batches, sizes=%s', cmd,
                      [len(b) for b in itasks_batches])

            if remote_mode:
                cmd = construct_platform_ssh_cmd(cmd, platform)
            else:
                cmd = ['cylc'] + cmd

            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            os.path.expandvars(
                                get_task_job_job_log(suite, itask.point,
                                                     itask.tdef.name,
                                                     itask.submit_num)))
                    job_log_dirs.append(
                        get_task_job_id(itask.point, itask.tdef.name,
                                        itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)

                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                    ), self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks