Example #1
    def report(self, request, server_obj):
        """Log client requests with identifying information.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info request from each client.

        """
        if threading.current_thread().__class__.__name__ == '_MainThread':
            # Server methods may be called internally as well as by clients.
            return
        auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
        name = server_obj.__class__.__name__
        log_me = (
            cylc.flags.debug or
            name in ["SuiteCommandServer",
                     "ExtTriggerServer",
                     "BroadcastServer"] or
            (name not in ["SuiteIdServer", "TaskMessageServer"] and
             uuid not in self.clients))
        if log_me:
            LOG.debug(
                self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                    user, host, prog_name, priv_level, uuid)
            )
            LOG.info(
                self.__class__.LOG_COMMAND_TMPL % (
                    request, user, host, prog_name, uuid))
        if name == "SuiteIdServer":
            self._num_id_requests += 1
            self.report_id_requests()
        self.clients[uuid] = datetime.datetime.utcnow()
        self._housekeep()
Example #2
 def _set_state(self, status):
     """Set, log and record task status (normal change, not forced - don't
     update task_events table)."""
     if self.status == self.hold_swap:
         self.hold_swap = None
     if status == self.status and self.hold_swap is None:
         return
     o_status, o_hold_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = self.status
     elif status in TASK_STATUSES_ACTIVE:
         if self.status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (self.hold_swap == TASK_STATUS_HELD
           and status not in TASK_STATUSES_FINAL):
         self.hold_swap = status
         status = TASK_STATUS_HELD
     elif self.hold_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     flags.iflag = True
     # Log
     message = str(o_status)
     if o_hold_swap:
         message += " (%s)" % o_hold_swap
     message += " => %s" % self.status
     if self.hold_swap:
         message += " (%s)" % self.hold_swap
     LOG.debug(message, itask=self.identity)
Example #3
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. And don't poll
        succeeded tasks by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as help/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = set([
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, to_poll_tasks,
                self._poll_task_jobs_callback)
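The filtering step above can be read on its own: build a set of pollable statuses, keep matching tasks, log the rest. A minimal sketch of the same idea, using a plain logger and hypothetical task dicts instead of the real cylc task proxies:

import logging

LOG = logging.getLogger("sketch")
POLLABLE = ("submitted", "running", "failed")

def select_pollable(tasks, poll_succ=True):
    """Return the tasks whose status allows polling."""
    statuses = set(POLLABLE)
    if poll_succ:
        statuses.add("succeeded")
    wanted = []
    for task in tasks:
        if task["status"] in statuses:
            wanted.append(task)
        else:
            LOG.debug("skipping %s: not pollable", task["id"])
    return wanted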
Example #4
    def report(self, request, server_obj):
        """Log client requests with identifying information.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info request from each client.

        """
        if threading.current_thread().__class__.__name__ == '_MainThread':
            # Server methods may be called internally as well as by clients.
            return
        auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
        name = server_obj.__class__.__name__
        log_me = (
            cylc.flags.debug or
            name in ["SuiteCommandServer",
                     "ExtTriggerServer",
                     "BroadcastServer"] or
            (name not in ["SuiteIdServer", "TaskMessageServer"] and
             uuid not in self.clients))
        if log_me:
            LOG.debug(
                self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                    user, host, prog_name, priv_level, uuid)
            )
            LOG.info(
                self.__class__.LOG_COMMAND_TMPL % (
                    request, user, host, prog_name, uuid))
        if name == "SuiteIdServer":
            self._num_id_requests += 1
            self.report_id_requests()
        self.clients[uuid] = datetime.datetime.utcnow()
        self._housekeep()
Example #5
    def _check_access_priv_and_report(self,
                                      required_privilege_level,
                                      log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL %
                  (user, host, prog_name, priv_level, uuid))
        if cylc.flags.debug or uuid not in self.clients and log_info:
            LOG.info(self.__class__.LOG_COMMAND_TMPL %
                     (command, user, host, prog_name, uuid))
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
Example #6
    def _remote_init_items(self, host, owner):
        """Return list of items that should be installed on task remote.

        Each item is (path, name),
        where name is relative path under suite run directory.
        """
        items = []
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        LOG.debug('comm_meth=%s' % comm_meth)
        if comm_meth in ['ssh', 'http', 'https']:
            # Contact file
            items.append(
                (self.suite_srv_files_mgr.get_contact_file(self.suite),
                 os.path.join(self.suite_srv_files_mgr.DIR_BASE_SRV,
                              self.suite_srv_files_mgr.FILE_BASE_CONTACT)))
            if comm_meth in ['http', 'https']:
                # Passphrase file
                items.append(
                    (self.suite_srv_files_mgr.get_auth_item(
                        self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE,
                        self.suite),
                     os.path.join(
                         self.suite_srv_files_mgr.DIR_BASE_SRV,
                         self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE)))
            if comm_meth in ['https']:
                # SSL cert file
                items.append(
                    (self.suite_srv_files_mgr.get_auth_item(
                        self.suite_srv_files_mgr.FILE_BASE_SSL_CERT,
                        self.suite),
                     os.path.join(
                         self.suite_srv_files_mgr.DIR_BASE_SRV,
                         self.suite_srv_files_mgr.FILE_BASE_SSL_CERT)))
        return items
Example #7
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. And don't poll
        succeeded tasks by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as help/callback methods.

        _poll_task_job_callback() executes one specific job.
        """
        poll_me = []
        pollable = [
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED
        ]
        for itask in itasks:
            if itask.state.status in pollable or (
                    itask.state.status == TASK_STATUS_SUCCEEDED and poll_succ):
                poll_me.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if poll_me:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(self.JOBS_POLL, suite, poll_me,
                              self._poll_task_jobs_callback)
Example #8
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(SuiteProcContext(self.JOBS_SUBMIT,
                                            action,
                                            err=exc,
                                            ret_code=1),
                           suite,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(
             itask, {
                 'is_manual_submit': itask.is_manual_submit,
                 'try_num': itask.get_try_num(),
                 'time_submit': get_current_time_string(),
                 'batch_sys_name': itask.summary.get('batch_sys_name'),
             })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #9
File: task_state.py Project: m214089/cylc
 def _set_state(self, status):
     """Set, log and record task status (normal change, not forced - don't
     update task_events table)."""
     if self.status == self.hold_swap:
         self.hold_swap = None
     if status == self.status and self.hold_swap is None:
         return
     o_status, o_hold_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = self.status
     elif (self.hold_swap == TASK_STATUS_HELD and
             status not in TASK_STATUSES_FINAL):
         self.hold_swap = status
         status = TASK_STATUS_HELD
     elif self.hold_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     flags.iflag = True
     # Log
     message = str(o_status)
     if o_hold_swap:
         message += " (%s)" % o_hold_swap
     message += " => %s" % self.status
     if self.hold_swap:
         message += " (%s)" % self.hold_swap
     LOG.debug(message, itask=self.identity)
Example #10
 def __init__(self, pool_size=None):
     self.pool_size = (pool_size or GLOBAL_CFG.get(["process pool size"])
                       or multiprocessing.cpu_count())
     # (The Pool class defaults to cpu_count anyway, but does not
     # expose the result via its public interface).
     LOG.debug("Initializing process pool, size %d" % self.pool_size)
     self.pool = multiprocessing.Pool(processes=self.pool_size)
     self.results = {}
Example #11
 def _housekeep(self):
     """Forget inactive clients."""
     for uuid, client_info in self.clients.copy().items():
         if time() - client_info['time'] > self.CLIENT_FORGET_SEC:
             try:
                 del self.clients[uuid]
             except KeyError:
                 pass
             LOG.debug(self.LOG_FORGET_TMPL % uuid)
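The housekeeping pattern above reduces to pruning dictionary entries whose timestamp is older than a cutoff. A self-contained sketch (CLIENT_FORGET_SEC and the clients dict layout are assumptions, not the cylc values):

import time

CLIENT_FORGET_SEC = 60.0

def forget_stale(clients, now=None):
    """Drop clients not seen within CLIENT_FORGET_SEC; return the forgotten UUIDs."""
    now = time.time() if now is None else now
    stale = [uuid for uuid, info in list(clients.items())
             if now - info['time'] > CLIENT_FORGET_SEC]
    for uuid in stale:
        clients.pop(uuid, None)  # pop() tolerates a concurrent removal
    return stale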
Example #12
 def join(self):
     """Join after workers have exited. Close or terminate first."""
     LOG.debug("Joining process pool")
     try:
         self.pool.join()
     except AssertionError:
         # multiprocessing.Pool.join may raise this error. We want to ignore
         # this so suite shutdown can continue.
         pass
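The pool __init__, close/terminate and join examples in this listing are all pieces of one multiprocessing.Pool lifecycle. A minimal, standalone sketch of that lifecycle (the worker function and sizes are made up):

import multiprocessing

def _square(value):
    return value * value

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=2)
    results = [pool.apply_async(_square, (n,)) for n in range(4)]
    pool.close()   # accept no new commands
    pool.join()    # wait for the workers to exit
    print([result.get() for result in results])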
Example #13
File: httpserver.py Project: arjclark/cylc
 def _housekeep(self):
     """Forget inactive clients."""
     for uuid, client_info in self.clients.copy().items():
         if time() - client_info['time'] > self.CLIENT_FORGET_SEC:
             try:
                 del self.clients[uuid]
             except KeyError:
                 pass
             LOG.debug(self.LOG_FORGET_TMPL % uuid)
Example #14
    def _prep_submit_task_job(self, suite, itask, dry_run):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['submit_num'] = itask.submit_num
            self._prep_submit_task_job_error(suite, itask, dry_run,
                                             '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.summary['latest_message'] = self.REMOTE_SELECT_MSG
                return
            itask.task_host = task_host

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = self.task_events_mgr.get_task_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num,
                self.JOB_FILE_BASE)
            self.job_file_writer.write(local_job_file_path, job_conf)
        except StandardError as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(suite, itask, dry_run,
                                             '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            # This will be shown next to submit num in gcylc:
            itask.summary['latest_message'] = 'job file written (edit/dry-run)'
            LOG.debug(itask.summary['latest_message'], itask=itask)

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
Example #15
    def _housekeep(self):
        """Forget inactive clients."""

        for uuid in self.clients.keys():
            dtime = self.clients[uuid]
            if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                    self.__class__.CLIENT_FORGET_SEC):
                del self.clients[uuid]
                LOG.debug(
                    self.__class__.LOG_FORGET_TMPL % uuid)
Example #16
File: httpserver.py Project: arjclark/cylc
 def signout(self):
     """Forget client, where possible."""
     uuid = _get_client_info()[4]
     try:
         del self.clients[uuid]
     except KeyError:
         return False
     else:
         LOG.debug(self.LOG_FORGET_TMPL % uuid)
         return True
Example #17
    def _housekeep(self):
        """Forget inactive clients."""

        for uuid in self.clients.keys():
            dtime = self.clients[uuid]
            if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                    self.__class__.CLIENT_FORGET_SEC):
                del self.clients[uuid]
                LOG.debug(
                    self.__class__.LOG_FORGET_TMPL % uuid)
Example #18
 def signout(self):
     """Forget client, where possible."""
     uuid = _get_client_info()[4]
     try:
         del self.clients[uuid]
     except KeyError:
         return False
     else:
         LOG.debug(self.LOG_FORGET_TMPL % uuid)
         return True
Example #19
File: mp_pool.py Project: m214089/cylc
 def __init__(self, pool_size=None):
     self.pool_size = (
         pool_size or
         GLOBAL_CFG.get(["process pool size"]) or
         multiprocessing.cpu_count())
     # (The Pool class defaults to cpu_count anyway, but does not
     # expose the result via its public interface).
     LOG.debug(
         "Initializing process pool, size %d" % self.pool_size)
     self.pool = multiprocessing.Pool(processes=self.pool_size)
     self.results = {}
Example #20
    def _housekeep(self):
        """Forget inactive clients."""

        for uuid, dtime in self.clients.copy().items():
            if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                    self.__class__.CLIENT_FORGET_SEC):
                try:
                    del self.clients[uuid]
                except KeyError:
                    pass
                LOG.debug(
                    self.__class__.LOG_FORGET_TMPL % uuid)
Example #21
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits"""
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
Example #22
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits"""
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
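A sketch of the same callback shape: on success keep the first line of stdout, otherwise store an error object keyed by the command string (the RuntimeError here stands in for TaskRemoteMgmtError, and the argument names are illustrative):

def record_host_select(results, cmd_str, ret_code, out, err):
    """Store either the selected host or an error, keyed by the command string."""
    if ret_code == 0 and out:
        results[cmd_str] = out.splitlines()[0]
    else:
        results[cmd_str] = RuntimeError(
            "host select failed: %s (exit %s): %s" % (cmd_str, ret_code, err))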
Example #23
def _run_command(ctx):
    """Execute a shell command and capture its output and exit status."""

    LOG.debug(ctx)

    if (SuiteProcPool.STOP_JOB_SUBMISSION.value
            and ctx.cmd_key == SuiteProcPool.JOBS_SUBMIT):
        ctx.err = "job submission skipped (suite stopping)"
        ctx.ret_code = SuiteProcPool.JOB_SKIPPED_FLAG
        ctx.timestamp = get_current_time_string()
        return ctx

    try:
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                stdin_file = TemporaryFile()
                for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                    stdin_file.write(open(file_path, 'rb').read())
                stdin_file.seek(0)
            else:
                stdin_file = open(ctx.cmd_kwargs['stdin_file_paths'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = PIPE
        else:
            stdin_file = open(os.devnull)
        proc = Popen(ctx.cmd,
                     stdin=stdin_file,
                     stdout=PIPE,
                     stderr=PIPE,
                     env=ctx.cmd_kwargs.get('env'),
                     shell=ctx.cmd_kwargs.get('shell'))
    except IOError as exc:
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    except OSError as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    else:
        ctx.out, ctx.err = proc.communicate(ctx.cmd_kwargs.get('stdin_str'))
        ctx.ret_code = proc.wait()

    ctx.timestamp = get_current_time_string()
    return ctx
Example #24
 def _manip_task_jobs_callback(ctx,
                               suite,
                               itasks,
                               summary_callback,
                               more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
      # report back to the suite before this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (KeyError, ValueError):
                     if cylc.flags.debug:
                         LOG.warning('Unhandled %s output: %s' %
                                     (ctx.cmd_key, line))
                         LOG.warning(traceback.format_exc())
Example #25
    def callback(self, ctx):
        """Callback for asynchronous xtrigger functions.

        Record satisfaction status and function results dict.

        """
        LOG.debug(ctx)
        sig = ctx.get_signature()
        self.active.remove(sig)
        try:
            satisfied, results = json.loads(ctx.out)
        except ValueError:
            return
        LOG.debug('%s: returned %s' % (sig, results))
        if satisfied:
            self.pflag = True
            self.sat_xtrig[sig] = results
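A sketch of the result-parsing step only, assuming (as the example implies) that the xtrigger function's output is a JSON pair of [satisfied, results]:

import json

def parse_xtrigger_output(out):
    """Return (satisfied, results), or (False, {}) if the output is not valid JSON."""
    try:
        satisfied, results = json.loads(out)
    except (ValueError, TypeError):
        return False, {}
    return bool(satisfied), results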
Example #26
 def _manip_task_jobs_callback(
         ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
      # report back to the suite before this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (KeyError, ValueError):
                     if cylc.flags.debug:
                         LOG.warning('Unhandled %s output: %s' % (
                             ctx.cmd_key, line))
                         LOG.warning(traceback.format_exc())
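The line-routing part of this callback is a generic pattern: try each registered prefix against each output line and hand the matching handler the rest of the line. A reduced sketch (the real callbacks also receive the suite, task and context objects):

def dispatch_lines(out, handlers):
    """handlers: list of (prefix, callback) pairs; callback gets the line body."""
    for line in out.splitlines():
        for prefix, callback in handlers:
            if line.startswith(prefix):
                callback(line[len(prefix):].strip())
                break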
Example #27
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     tmphandle.close()
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(TaskRemoteMgmtError(
         TaskRemoteMgmtError.MSG_INIT,
         (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
         proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Example #28
 def log_task_job_activity(self, ctx, suite, point, name, submit_num=NN):
     """Log an activity for a task job."""
     ctx_str = str(ctx)
     if not ctx_str:
         return
     if isinstance(ctx.cmd_key, tuple):  # An event handler
         submit_num = ctx.cmd_key[-1]
     job_activity_log = self.get_task_job_activity_log(
         suite, point, name, submit_num)
     try:
         with open(job_activity_log, "ab") as handle:
             handle.write(ctx_str + '\n')
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
     if ctx.cmd and ctx.ret_code:
         LOG.error(ctx_str)
     elif ctx.cmd:
         LOG.debug(ctx_str)
Example #29
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     tmphandle.close()
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(
         TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_INIT, (host, owner),
                             ' '.join(quote(item) for item in proc_ctx.cmd),
                             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Example #30
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(
         SuiteProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
         suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(itask, {
             'is_manual_submit': itask.is_manual_submit,
             'try_num': itask.get_try_num(),
             'time_submit': get_current_time_string(),
             'batch_sys_name': itask.summary.get('batch_sys_name'),
         })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #31
File: mp_pool.py Project: m214089/cylc
def _run_command(ctx):
    """Execute a shell command and capture its output and exit status."""

    LOG.debug(ctx)

    if (SuiteProcPool.STOP_JOB_SUBMISSION.value and
            ctx.cmd_key == SuiteProcPool.JOBS_SUBMIT):
        ctx.err = "job submission skipped (suite stopping)"
        ctx.ret_code = SuiteProcPool.JOB_SKIPPED_FLAG
        ctx.timestamp = get_current_time_string()
        return ctx

    try:
        stdin_file = None
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            stdin_file = TemporaryFile()
            for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                for line in open(file_path):
                    stdin_file.write(line)
            stdin_file.seek(0)
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = PIPE
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            env=ctx.cmd_kwargs.get('env'), shell=ctx.cmd_kwargs.get('shell'))
    except IOError as exc:
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    except OSError as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    else:
        ctx.out, ctx.err = proc.communicate(ctx.cmd_kwargs.get('stdin_str'))
        ctx.ret_code = proc.wait()

    ctx.timestamp = get_current_time_string()
    return ctx
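Stripped of the cylc context object, the command-running core above is a standard Popen capture. A minimal sketch, returning a plain tuple in place of SuiteProcContext:

import os
from subprocess import PIPE, Popen

def run(cmd):
    """Run cmd (a list of strings); return (ret_code, out, err)."""
    try:
        with open(os.devnull) as devnull:
            proc = Popen(cmd, stdin=devnull, stdout=PIPE, stderr=PIPE)
    except OSError as exc:
        return 1, '', str(exc)
    out, err = proc.communicate()
    return proc.wait(), out, err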
Example #32
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes a submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #33
File: mp_pool.py Project: wenlien/cylc
 def _run_command_init(cls, ctx, callback=None, callback_args=None):
     """Prepare and launch shell command in ctx."""
     try:
         if ctx.cmd_kwargs.get('stdin_file_paths'):
             if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                 stdin_file = TemporaryFile()
                 for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                     stdin_file.write(open(file_path, 'rb').read())
                 stdin_file.seek(0)
             else:
                 stdin_file = open(ctx.cmd_kwargs['stdin_file_paths'][0],
                                   'rb')
         elif ctx.cmd_kwargs.get('stdin_str'):
             stdin_file = TemporaryFile()
             stdin_file.write(ctx.cmd_kwargs.get('stdin_str'))
             stdin_file.seek(0)
         else:
             stdin_file = open(os.devnull)
         proc = Popen(
             ctx.cmd,
             stdin=stdin_file,
             stdout=PIPE,
             stderr=PIPE,
             # Execute command as a process group leader,
             # so we can use "os.killpg" to kill the whole group.
             preexec_fn=os.setpgrp,
             env=ctx.cmd_kwargs.get('env'),
             shell=ctx.cmd_kwargs.get('shell'))
     except (IOError, OSError) as exc:
         if exc.filename is None:
             exc.filename = ctx.cmd[0]
         LOG.exception(exc)
         ctx.ret_code = 1
         ctx.err = str(exc)
         cls._run_command_exit(ctx, callback, callback_args)
         return None
     else:
         LOG.debug(ctx.cmd)
         return proc
Example #34
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes a submission failure, there will be no job
        # directory. In this case, just send the information to the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
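The important detail in both versions of log_task_job_activity is the fallback when the per-job directory does not exist yet. A sketch of just that append-with-fallback step (the path and logger are placeholders, not the cylc helpers):

import logging

LOG = logging.getLogger("sketch")

def append_activity(activity_log_path, ctx_str):
    """Append one activity line; fall back to the main log if the file can't be opened."""
    try:
        with open(activity_log_path, "ab") as handle:
            handle.write((ctx_str + "\n").encode("utf-8"))
    except IOError as exc:
        LOG.debug(exc)
        LOG.info(ctx_str)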
Example #35
File: httpserver.py Project: arjclark/cylc
    def _check_access_priv_and_report(
            self, required_privilege_level, log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
            user, host, prog_name, priv_level, uuid))
        if cylc.flags.debug or uuid not in self.clients and log_info:
            LOG.info(self.__class__.LOG_COMMAND_TMPL % (
                command, user, host, prog_name, uuid))
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
Example #36
File: mp_pool.py Project: arjclark/cylc
 def _run_command_init(cls, ctx, callback=None, callback_args=None):
     """Prepare and launch shell command in ctx."""
     try:
         if ctx.cmd_kwargs.get('stdin_file_paths'):
             if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                 stdin_file = TemporaryFile()
                 for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                     stdin_file.write(open(file_path, 'rb').read())
                 stdin_file.seek(0)
             else:
                 stdin_file = open(
                     ctx.cmd_kwargs['stdin_file_paths'][0], 'rb')
         elif ctx.cmd_kwargs.get('stdin_str'):
             stdin_file = TemporaryFile()
             stdin_file.write(ctx.cmd_kwargs.get('stdin_str'))
             stdin_file.seek(0)
         else:
             stdin_file = open(os.devnull)
         proc = Popen(
             ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
             # Execute command as a process group leader,
             # so we can use "os.killpg" to kill the whole group.
             preexec_fn=os.setpgrp,
             env=ctx.cmd_kwargs.get('env'),
             shell=ctx.cmd_kwargs.get('shell'))
     except (IOError, OSError) as exc:
         if exc.filename is None:
             exc.filename = ctx.cmd[0]
         LOG.exception(exc)
         ctx.ret_code = 1
         ctx.err = str(exc)
         cls._run_command_exit(ctx, callback, callback_args)
         return None
     else:
         LOG.debug(ctx.cmd)
         return proc
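The preexec_fn=os.setpgrp detail above is what makes a later group-wide kill possible. A POSIX-only sketch of that pairing (function names are illustrative):

import os
import signal
from subprocess import Popen

def start_group_leader(cmd):
    """Start cmd as a process group leader so the whole group can be signalled."""
    return Popen(cmd, preexec_fn=os.setpgrp)

def kill_group(proc, sig=signal.SIGTERM):
    """Signal every process in the child's group (its pgid equals its pid here)."""
    os.killpg(proc.pid, sig)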
Example #37
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
            ".service/passphrase": HTTP(S) task comm
            ".service/ssl.cert": HTTPS task comm
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item(
            'task communication method', host, owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create "stdin_file_paths" file, with "items" in it.
        tmphandle = NamedTemporaryFile()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(self.uuid_str)
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(self.uuid_str)
        cmd.append(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner))
        self.proc_pool.put_command(
            SuiteProcContext(
                'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
            self._remote_init_callback,
            [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
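The "stdin_file_paths" trick in remote_init packs every item into one tar stream that the remote-init command reads from stdin. A sketch of just the packing step, under that assumption:

import tarfile
from tempfile import NamedTemporaryFile

def make_stdin_tar(items):
    """items: iterable of (path, arcname); return a rewound temporary tar file."""
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    return tmphandle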
Example #38
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job."""
        overrides = BroadcastServer.get_inst().get(itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides)
        else:
            rtconfig = itask.tdef.rtconfig

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        })

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = get_task_host(rtconfig['remote']['host'])
        if not itask.task_host:
            itask.task_host = 'localhost'
        elif itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        else:
            user_at_host = itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[rtconfig['job']['batch system']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
            # minutes after time limit exceeded
            itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
                TaskActionTimer(delays=batch_sys_conf.get(
                    'execution time limit polling intervals', [60, 120, 420])))
        for label, key in [
                ('submission polling intervals', TASK_STATUS_SUBMITTED),
                ('execution polling intervals', TASK_STATUS_RUNNING)]:
            if key in itask.poll_timers:
                itask.poll_timers[key].reset()
            else:
                values = self.task_events_mgr.get_host_conf(
                    itask, label, skey='job')
                if values:
                    itask.poll_timers[key] = TaskActionTimer(delays=values)

        self.init_host(suite, itask.task_host, itask.task_owner)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        })
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, self.JOB_FILE_BASE)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': rtconfig['job']['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
Example #39
File: mp_pool.py Project: m214089/cylc
 def close(self):
     """Close the pool to new commands."""
     if not (self.is_dead() or self.is_closed()):
         LOG.debug("Closing process pool")
         self.pool.close()
Example #40
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" %
                                (point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (point, name, submit_num, key1,
                                      timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                    # Avoid flooding user's mail box with mail notification.
                    # Group together as many notifications as possible within a
                    # given interval.
                    timer.ctx.ctx_type == self.HANDLER_MAIL
                    and not schd_ctx.stop_mode
                    and self.next_mail_time is not None
                    and self.next_mail_time > now):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SuiteProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd,
                        env=os.environ,
                        shell=True,
                    ), self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
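The mail-batching logic above comes down to grouping event ids under a shared, hashable context so one handler invocation can cover many events. A sketch of the grouping step alone (the timer objects here are anything with a .ctx attribute):

def group_by_ctx(event_timers):
    """Map each timer's ctx to the list of id_keys that share it."""
    ctx_groups = {}
    for id_key, timer in event_timers.items():
        ctx_groups.setdefault(timer.ctx, []).append(id_key)
    return ctx_groups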
Example #41
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and
                event in self._get_events_conf(itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask,
            'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
            id_key = (
                key1, str(itask.point), itask.tdef.name, itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name": quote(
                        str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id": quote(
                        str(itask.summary['submit_method_id'])),
                    "submit_time": quote(
                        str(itask.summary['submitted_time_string'])),
                    "start_time": quote(
                        str(itask.summary['started_time_string'])),
                    "finish_time": quote(
                        str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host)
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (
                    handler, event, self.suite, itask.identity, message)
            LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
            self.event_timers[id_key] = (
                TaskActionTimer(
                    CustomTaskEventHandlerContext(
                        key1,
                        self.HANDLER_CUSTOM,
                        cmd,
                    ),
                    retry_delays))
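The handler command construction above supports both %(name)s templates and the classic positional interface. A reduced sketch of that choice (argument names are illustrative, not the cylc signature):

def build_handler_cmd(handler, handler_data, event, suite, task_id, message):
    """Substitute a template; if nothing was substituted, use the classic interface."""
    try:
        cmd = handler % handler_data
    except KeyError:
        return None  # bad template key
    if cmd == handler:
        # Nothing substituted: pass the classic four arguments instead.
        cmd = "%s '%s' '%s' '%s' '%s'" % (handler, event, suite, task_id, message)
    return cmd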
Example #42
    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['submit_num'] = itask.submit_num
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.summary['latest_message'] = self.REMOTE_SELECT_MSG
                return
            itask.task_host = task_host
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['submit_num'] = itask.submit_num
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = get_task_job_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num)
            self.job_file_writer.write(local_job_file_path, job_conf,
                                       check_syntax=check_syntax)
        except StandardError as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            # This will be shown next to submit num in gcylc:
            itask.summary['latest_message'] = 'job file written (edit/dry-run)'
            LOG.debug(itask.summary['latest_message'], itask=itask)

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
Example #43
File: mp_pool.py Project: m214089/cylc
 def terminate(self):
     """Kill all worker processes immediately."""
     if not self.is_dead():
         LOG.debug("Terminating process pool")
         self.pool.terminate()
Example #44
    def process_message(self, itask, severity, message, poll_func,
                        poll_event_time=None, is_incoming=False):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        """

        is_polled = poll_event_time is not None
        # Log incoming messages with '>' to distinguish non-message log entries
        message_flag = ""
        if is_incoming:
            message_flag = self.INCOMING_FLAG
        log_message = '(current:%s)%s %s' % (
            itask.state.status, message_flag, message)
        if poll_event_time is not None:
            log_message += ' %s' % self.POLLED_INDICATOR
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)

        # Strip the "at TIME" suffix.
        event_time = poll_event_time
        if not event_time:
            match = self.RE_MESSAGE_TIME.match(message)
            if match:
                message, event_time = match.groups()
        if not event_time:
            event_time = get_current_time_string()

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if is_polled:
            itask.summary['latest_message'] += " %s" % self.POLLED_INDICATOR
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
                return
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
                return
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if self._poll_to_confirm(itask,
                                     TASK_STATUS_SUBMIT_FAILED, poll_func):
                return
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
                return
            self._process_message_submitted(itask, event_time)
        elif message.startswith(TaskMessage.FAIL_MESSAGE_PREFIX):
            # Task received signal.
            signal = message[len(TaskMessage.FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(TaskMessage.ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            aborted_with = message[len(TaskMessage.ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(TaskMessage.VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_event_time('started')  # reset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            try:
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                    itask.summary['submitted_time'] +
                    float(self._get_events_conf(itask, 'submission timeout')))
            except (TypeError, ValueError):
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
            # Believe this and change state without polling (could poll?).
            cylc.flags.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            cylc.flags.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '(current: %s) unhandled: %s' % (itask.state.status, message),
                itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        if severity == "CUSTOM":
            self.setup_event_handlers(itask, "custom", message)
        elif severity in [TaskMessage.WARNING, TaskMessage.CRITICAL]:
            self.setup_event_handlers(itask, severity.lower(), message)
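# A minimal stand-alone sketch (not from the cylc source) of the
# "poll to confirm" rule described in the docstring above: if an incoming
# message would move the task state backward, poll the job instead of
# changing state, then believe the next message. The status names and their
# ordering here are illustrative assumptions only.

STATUS_ORDER = ['submitted', 'submit-failed', 'running', 'failed', 'succeeded']

def needs_confirmation_poll(current_status, incoming_status):
    """Return True if the incoming status would move the task backward."""
    return (STATUS_ORDER.index(incoming_status) <
            STATUS_ORDER.index(current_status))

# E.g. a late "started" message arriving after the task already succeeded
# triggers a confirmation poll; a normal forward transition does not.
assert needs_confirmation_poll('succeeded', 'running') is True
assert needs_confirmation_poll('submitted', 'running') is False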
Example #45
    def close(self):
        """Close the pool to new commands."""
        if not (self.is_dead() or self.is_closed()):
            LOG.debug("Closing process pool")
            self.pool.close()
Example #46
    def process_message(
            self, itask, severity, message, event_time=None, flag='',
            submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        log_message = '(current:%s)%s %s at %s' % (
            itask.state.status, flag, message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' % (
                    itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if flag == self.POLLED_FLAG:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '(current:%s) unhandled: %s' % (itask.state.status, message),
                itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
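# A small sketch (not from the cylc source) of the submit-number check used
# above: an incoming message that refers to a submit number other than the
# task's current one is flagged as ignored, because it comes from a
# superseded job. The flag strings here are illustrative, not the real
# class constants.

INCOMING_FLAG = ' >'
IGNORED_INCOMING_FLAG = ' !'

def classify_incoming(message_submit_num, current_submit_num):
    """Return the flag to log an incoming message with."""
    if message_submit_num is None or message_submit_num == current_submit_num:
        return INCOMING_FLAG
    return IGNORED_INCOMING_FLAG

assert classify_incoming(2, 2) == INCOMING_FLAG
assert classify_incoming(1, 2) == IGNORED_INCOMING_FLAG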
Example #47
    def terminate(self):
        """Kill all worker processes immediately."""
        if not self.is_dead():
            LOG.debug("Terminating process pool")
            self.pool.terminate()
Example #48
File: mp_pool.py Project: m214089/cylc
    def join(self):
        """Join after workers have exited. Close or terminate first."""
        LOG.debug("Joining process pool")
        self.pool.join()
Example #49
    def process_message(self, itask, severity, message, poll_func,
                        poll_event_time=None, incoming_event_time=None,
                        submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        """

        # Log incoming messages with '>' to distinguish them from
        # non-message log entries.
        if incoming_event_time:
            if submit_num is None or submit_num == itask.submit_num:
                message_flag = self.INCOMING_FLAG
            else:
                message_flag = self.IGNORED_INCOMING_FLAG
            event_time = incoming_event_time
        elif poll_event_time:
            message_flag = self.POLLED_FLAG
            event_time = poll_event_time
        else:
            message_flag = ''
            event_time = get_current_time_string()
        log_message = '(current:%s)%s %s at %s' % (
            itask.state.status, message_flag, message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if message_flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' % (
                    itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if poll_event_time is not None:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
                return
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
                return
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if self._poll_to_confirm(itask,
                                     TASK_STATUS_SUBMIT_FAILED, poll_func):
                return
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
                return
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            try:
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                    itask.summary['submitted_time'] +
                    float(self._get_events_conf(itask, 'submission timeout')))
            except (TypeError, ValueError):
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '(current:%s) unhandled: %s' % (itask.state.status, message),
                itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
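# A sketch (not from the cylc source) of the severity handling above: a task
# message severity may arrive either as a logging level name ("WARNING",
# "CUSTOM", ...) or as a numeric logging level, and both forms are needed -
# the numeric level for LOG.log() and the name for the event database entry.
# The LEVELS mapping here is an illustrative stand-in for the class attribute.
from logging import CRITICAL, DEBUG, ERROR, INFO, WARNING, getLevelName

LEVELS = {
    "DEBUG": DEBUG, "INFO": INFO, "WARNING": WARNING,
    "ERROR": ERROR, "CRITICAL": CRITICAL,
}

def severity_to_level_and_name(severity):
    """Return (numeric logging level, level name) for a message severity."""
    if severity in (CRITICAL, ERROR, WARNING, INFO, DEBUG):
        return severity, getLevelName(severity)
    return LEVELS.get(severity, INFO), str(severity)

assert severity_to_level_and_name("WARNING") == (WARNING, "WARNING")
assert severity_to_level_and_name(DEBUG) == (DEBUG, "DEBUG")
assert severity_to_level_and_name("CUSTOM") == (INFO, "CUSTOM")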
Example #50
class TaskJobManager(object):
    """Manage task job submit, poll and kill.

    This class provides logic to:
    * Submit task jobs.
    * Poll task jobs.
    * Kill task jobs.
    * Set up the directory structure on job hosts.
    * Install suite communicate client files on job hosts.
    * Remove suite contact files on job hosts.
    """

    JOB_FILE_BASE = BatchSysManager.JOB_FILE_BASE
    JOBS_KILL = "jobs-kill"
    JOBS_POLL = "jobs-poll"
    JOBS_SUBMIT = SuiteProcPool.JOBS_SUBMIT
    KEY_EXECUTE_TIME_LIMIT = "execution_time_limit"

    def __init__(self, suite, proc_pool, suite_db_mgr, suite_srv_files_mgr):
        self.suite = suite
        self.proc_pool = proc_pool
        self.suite_db_mgr = suite_db_mgr
        self.task_events_mgr = TaskEventsManager(
            suite, proc_pool, suite_db_mgr)
        self.task_events_mgr.job_poll = self.poll_task_jobs
        self.job_file_writer = JobFileWriter()
        self.suite_srv_files_mgr = suite_srv_files_mgr
        self.init_host_map = {}  # {(host, owner): should_unlink, ...}
        self.single_task_mode = False

    def check_task_jobs(self, suite, task_pool):
        """Check submission and execution timeout and polling timers.

        Poll tasks that have timed out and/or have reached next polling time.
        """
        now = time()
        poll_tasks = set()
        for itask in task_pool.get_tasks():
            if (self._check_timeout(itask, now) or
                    self.task_events_mgr.set_poll_time(itask, now)):
                poll_tasks.add(itask)
        if poll_tasks:
            self.poll_task_jobs(suite, poll_tasks)

    def init_host(self, reg, host, owner):
        """Initialise suite run dir on a user@host.

        Create SUITE_RUN_DIR/log/job/ if necessary.
        Install suite contact environment file.
        Install suite python modules.

        Raise RemoteJobHostInitError if initialisation cannot complete.

        """
        if host is None:
            host = 'localhost'
        if (self.single_task_mode or
                (host, owner) in self.init_host_map or
                not is_remote(host, owner)):
            return
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host

        r_suite_run_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite run directory', host, owner)
        r_log_job_dir = GLOBAL_CFG.get_derived_host_item(
            reg, 'suite job log directory', host, owner)
        r_suite_srv_dir = os.path.join(
            r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV)

        # Create a UUID file in the service directory.
        # If remote host has the file in its service directory, we can assume
        # that the remote host has a shared file system with the suite host.
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        uuid_str = str(uuid4())
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str)
        try:
            open(uuid_fname, 'wb').close()
            proc = Popen(
                shlex.split(ssh_tmpl) + [
                    '-n', user_at_host,
                    'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)],
                stdout=PIPE, stderr=PIPE)
            if proc.wait() == 0:
                # Initialised, but no need to tidy up
                self.init_host_map[(host, owner)] = False
                return
        finally:
            try:
                os.unlink(uuid_fname)
            except OSError:
                pass

        cmds = []
        # Command to create suite directory structure on remote host.
        cmds.append(shlex.split(ssh_tmpl) + [
            '-n', user_at_host,
            'mkdir', '-p',
            r_suite_run_dir, r_log_job_dir, r_suite_srv_dir])
        # Command to copy contact and authentication files to remote host.
        # Note: no need to do this if task communication method is "poll".
        should_unlink = GLOBAL_CFG.get_host_item(
            'task communication method', host, owner) != "poll"
        scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner)
        if should_unlink:
            # Handle not having SSL certs installed.
            try:
                ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
            except (SuiteServiceFileError, ValueError):
                ssl_cert = None
            cmds.append(shlex.split(scp_tmpl) + [
                '-p',
                self.suite_srv_files_mgr.get_contact_file(reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                user_at_host + ':' + r_suite_srv_dir + '/'])
            if ssl_cert is not None:
                cmds[-1].insert(-1, ssl_cert)
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr',
                suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
        # Run commands in sequence.
        for cmd in cmds:
            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            if proc.wait():
                raise RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_INIT,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err)
        self.init_host_map[(host, owner)] = should_unlink
        LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))
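# A reduced, stand-alone sketch (not from the cylc source) of the
# shared-filesystem probe performed by init_host() above: write a UUID file
# into the local service directory and test for it over SSH. For simplicity
# this sketch assumes the same directory path is visible on both sides,
# whereas init_host() tests the remote service directory path.
import os
import shlex
from subprocess import PIPE, Popen
from uuid import uuid4

def have_shared_filesystem(srv_dir, user_at_host, ssh_cmd='ssh'):
    """Return True if user_at_host can see files written under srv_dir."""
    probe = os.path.join(srv_dir, str(uuid4()))
    open(probe, 'wb').close()
    try:
        proc = Popen(
            shlex.split(ssh_cmd) + ['-n', user_at_host, 'test', '-e', probe],
            stdout=PIPE, stderr=PIPE)
        proc.communicate()
        return proc.returncode == 0
    finally:
        try:
            os.unlink(probe)
        except OSError:
            pass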

    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        Tasks not in an active state are skipped with a warning.

        """
        active_itasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                active_itasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(
            self.JOBS_KILL, suite, active_itasks,
            self._kill_task_jobs_callback)

    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. And don't poll
        succeeded tasks by default.

        """
        poll_me = []
        pollable = [TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING,
                    TASK_STATUS_FAILED]
        for itask in itasks:
            if itask.state.status in pollable or (
                    itask.state.status == TASK_STATUS_SUCCEEDED and poll_succ):
                poll_me.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if poll_me:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, poll_me, self._poll_task_jobs_callback)

    def prep_submit_task_jobs(self, suite, itasks, dry_run=False):
        """Prepare task jobs for submit."""
        if not itasks:
            return
        prepared_tasks = []
        for itask in itasks:
            if self._prep_submit_task_job(suite, itask, dry_run) is not None:
                prepared_tasks.append(itask)
        return prepared_tasks

    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs."""
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks = self.prep_submit_task_jobs(suite, itasks)
        if not prepared_tasks:
            return

        # Submit task jobs
        auth_itasks = {}
        for itask in prepared_tasks:
            # The job file is now (about to be) used: reset the file write flag
            # so that subsequent manual retrigger will generate a new job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for auth, itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", self.JOBS_SUBMIT]
            if cylc.flags.debug:
                cmd.append("--debug")
            host, owner = auth
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append("--")
            cmd.append(GLOBAL_CFG.get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            stdin_file_paths = []
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                if remote_mode:
                    stdin_file_paths.append(
                        self.task_events_mgr.get_task_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num, self.JOB_FILE_BASE))
                job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(
                    self.JOBS_SUBMIT,
                    cmd,
                    stdin_file_paths=stdin_file_paths,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks])

    def unlink_hosts_contacts(self, reg):
        """Remove suite contact files from initialised hosts.

        This is called on shutdown, so we don't want anything to hang.
        Terminate any incomplete SSH commands after 10 seconds.
        """
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), should_unlink in self.init_host_map.items():
            if not should_unlink:
                continue
            user_at_host = host
            if owner:
                user_at_host = owner + '@' + host
            ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
            r_suite_contact_file = os.path.join(
                GLOBAL_CFG.get_derived_host_item(
                    reg, 'suite run directory', host, owner),
                self.suite_srv_files_mgr.DIR_BASE_SRV,
                self.suite_srv_files_mgr.FILE_BASE_CONTACT)
            cmd = shlex.split(ssh_tmpl) + [
                '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
            procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for user_at_host, (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[user_at_host]
                out, err = proc.communicate()
                if proc.wait():
                    LOG.warning(RemoteJobHostInitError(
                        RemoteJobHostInitError.MSG_TIDY,
                        user_at_host, ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        # Terminate any remaining commands
        for user_at_host, (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
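# A sketch (not from the cylc source) of the pattern used by
# unlink_hosts_contacts() above: start every command in parallel, reap them
# for up to `timeout` seconds, then terminate whatever is still running.
from subprocess import PIPE, Popen
from time import sleep, time

def run_all_with_deadline(cmds, timeout=10.0):
    """Run commands in parallel; return {command tuple: return code or None}."""
    procs = dict((tuple(cmd), Popen(cmd, stdout=PIPE, stderr=PIPE))
                 for cmd in cmds)
    results = {}
    deadline = time() + timeout
    while procs and time() < deadline:
        for key, proc in list(procs.items()):
            if proc.poll() is None:
                continue  # still running
            del procs[key]
            results[key] = proc.returncode
        sleep(0.1)
    for key, proc in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        results[key] = None  # did not finish before the deadline
    return results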

    def _check_timeout(self, itask, now):
        """Check/handle submission/execution timeouts."""
        if itask.state.status == TASK_STATUS_RUNNING:
            timer = itask.poll_timers.get(self.KEY_EXECUTE_TIME_LIMIT)
            if timer is not None:
                if not timer.is_timeout_set():
                    timer.next()
                if not timer.is_delay_done():
                    # Don't poll
                    return False
                if timer.next() is not None:
                    # Poll now, and more retries lined up
                    return True
                # No more retry lined up, can issue execution timeout event
        if itask.state.status in itask.timeout_timers:
            timeout = itask.timeout_timers[itask.state.status]
            if timeout is None or now <= timeout:
                return False
            itask.timeout_timers[itask.state.status] = None
            if (itask.state.status == TASK_STATUS_RUNNING and
                    itask.summary['started_time'] is not None):
                msg = 'job started %s ago, but has not finished' % (
                    get_seconds_as_interval_string(
                        timeout - itask.summary['started_time']))
                event = 'execution timeout'
            elif (itask.state.status == TASK_STATUS_SUBMITTED and
                    itask.summary['submitted_time'] is not None):
                msg = 'job submitted %s ago, but has not started' % (
                    get_seconds_as_interval_string(
                        timeout - itask.summary['submitted_time']))
                event = 'submission timeout'
            else:
                return False
            LOG.warning(msg, itask=itask)
            self.task_events_mgr.setup_event_handlers(itask, event, msg)
            return True
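# A sketch (not from the cylc source) of the timeout arithmetic used by
# _check_timeout() above: the value stored per status is an absolute
# deadline (start time plus the configured interval), so subtracting the
# start time recovers the configured timeout for the warning message.
def check_timeout(now, start_time, deadline):
    """Return a warning string if the deadline has passed, else None."""
    if deadline is None or now <= deadline:
        return None
    interval = deadline - start_time  # the configured timeout, in seconds
    return 'job started %d seconds ago, but has not finished' % interval

assert check_timeout(now=100.0, start_time=0.0, deadline=200.0) is None
assert check_timeout(now=400.0, start_time=0.0, deadline=300.0) == (
    'job started 300 seconds ago, but has not finished')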

    def _create_job_log_path(self, suite, itask):
        """Create job log directory for a task job, etc.

        Create local job directory, and NN symbolic link.
        If NN => 01, remove numbered directories with submit numbers greater
        than 01.

        """
        job_file_dir = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        task_log_dir = os.path.dirname(job_file_dir)
        if itask.submit_num == 1:
            try:
                names = os.listdir(task_log_dir)
            except OSError:
                pass
            else:
                for name in names:
                    if name not in ["01", self.task_events_mgr.NN]:
                        rmtree(
                            os.path.join(task_log_dir, name),
                            ignore_errors=True)
        else:
            rmtree(job_file_dir, ignore_errors=True)

        mkdir_p(job_file_dir)
        target = os.path.join(task_log_dir, self.task_events_mgr.NN)
        source = os.path.basename(job_file_dir)
        try:
            prev_source = os.readlink(target)
        except OSError:
            prev_source = None
        if prev_source == source:
            return
        try:
            if prev_source:
                os.unlink(target)
            os.symlink(source, target)
        except OSError as exc:
            if not exc.filename:
                exc.filename = target
            raise exc
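# A stand-alone sketch (not from the cylc source) of the symlink maintenance
# performed by _create_job_log_path() above: keep task_log_dir/NN pointing at
# the numbered directory of the latest submit. The "NN" name and the "%02d"
# directory naming mirror the conventions visible in the method above.
import os

def update_nn_symlink(task_log_dir, submit_num):
    """Create the numbered submit directory and repoint the NN symlink at it."""
    source = '%02d' % submit_num                 # e.g. "03"
    target = os.path.join(task_log_dir, 'NN')
    job_dir = os.path.join(task_log_dir, source)
    if not os.path.isdir(job_dir):
        os.makedirs(job_dir)
    try:
        prev_source = os.readlink(target)
    except OSError:
        prev_source = None                       # no symlink yet
    if prev_source == source:
        return
    if prev_source is not None:
        os.unlink(target)
    os.symlink(source, target)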

    @staticmethod
    def _get_job_scripts(itask, rtconfig):
        """Return pre-script, script, post-script for a job."""
        script = rtconfig['script']
        pre_script = rtconfig['pre-script']
        post_script = rtconfig['post-script']
        if itask.tdef.suite_polling_cfg:
            # Automatic suite state polling script
            comstr = "cylc suite-state " + \
                     " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                     " --point=" + str(itask.point) + \
                     " --status=" + itask.tdef.suite_polling_cfg['status']
            if cylc.flags.debug:
                comstr += ' --debug'
            for key, fmt in [
                    ('user', ' --%s=%s'),
                    ('host', ' --%s=%s'),
                    ('interval', ' --%s=%d'),
                    ('max-polls', ' --%s=%s'),
                    ('run-dir', ' --%s=%s'),
                    ('template', ' --%s=%s')]:
                if rtconfig['suite state polling'][key]:
                    comstr += fmt % (key, rtconfig['suite state polling'][key])
            comstr += " " + itask.tdef.suite_polling_cfg['suite']
            script = "echo " + comstr + "\n" + comstr
        return pre_script, script, post_script

    def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
        """Callback on job command STDOUT/STDERR."""
        if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
            user_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("host"):
            user_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("user"):
            user_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
        else:
            user_at_host = ""
        try:
            timestamp, _, content = line.split("|")
        except ValueError:
            pass
        else:
            line = "%s %s" % (timestamp, content)
        job_activity_log = self.task_events_mgr.get_task_job_activity_log(
            suite, itask.point, itask.tdef.name)
        try:
            with open(job_activity_log, "ab") as handle:
                if not line.endswith("\n"):
                    line += "\n"
                handle.write(user_at_host + line)
        except IOError as exc:
            LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
            LOG.warning(user_at_host + line, itask=itask)

    def _kill_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when kill tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._kill_task_job_callback,
            {BatchSysManager.OUT_PREFIX_COMMAND: self._job_cmd_out_callback})

    def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _kill_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_KILL, None)
        ctx.out = line
        try:
            ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)
        log_lvl = INFO
        log_msg = 'killed'
        if ctx.ret_code:  # non-zero exit status
            log_lvl = WARNING
            log_msg = 'kill failed'
            itask.state.kill_failed = True
        elif itask.state.status == TASK_STATUS_SUBMITTED:
            self.task_events_mgr.process_message(
                itask, CRITICAL, "%s at %s" % (
                    self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp),
                self.poll_task_jobs)
            cylc.flags.iflag = True
        elif itask.state.status == TASK_STATUS_RUNNING:
            self.task_events_mgr.process_message(
                itask, CRITICAL, TASK_OUTPUT_FAILED, self.poll_task_jobs)
            cylc.flags.iflag = True
        else:
            log_lvl = WARNING
            log_msg = (
                'ignoring job kill result, unexpected task state: %s' %
                itask.state.status)
        itask.summary['latest_message'] = log_msg
        LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
            itask.identity, itask.submit_num, log_msg))

    @staticmethod
    def _manip_task_jobs_callback(
            ctx, suite, itasks, summary_callback, more_callbacks=None):
        """Callback when submit/poll/kill tasks command exits."""
        if ctx.ret_code:
            LOG.error(ctx)
        else:
            LOG.debug(ctx)
        tasks = {}
        # Note for "kill": It is possible for a job to trigger its trap and
        # report back to the suite before this logic is called. If so, the task
        # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
        # its output line will be ignored here.
        for itask in itasks:
            if itask.point is not None and itask.submit_num:
                submit_num = "%02d" % (itask.submit_num)
                tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
        handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
        if more_callbacks:
            for prefix, callback in more_callbacks.items():
                handlers.append((prefix, callback))
        out = ctx.out
        if not out:
            out = ""
            # Something is very wrong here
            # Fallback to use "job_log_dirs" list to report the problem
            job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
            for job_log_dir in job_log_dirs:
                point, name, submit_num = job_log_dir.split(os.sep, 2)
                itask = tasks[(point, name, submit_num)]
                out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                        "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
        for line in out.splitlines(True):
            for prefix, callback in handlers:
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    try:
                        path = line.split("|", 2)[1]  # timestamp, path, status
                        point, name, submit_num = path.split(os.sep, 2)
                        itask = tasks[(point, name, submit_num)]
                        callback(suite, itask, ctx, line)
                    except (KeyError, ValueError):
                        if cylc.flags.debug:
                            LOG.warning('Unhandled %s output: %s' % (
                                ctx.cmd_key, line))
                            LOG.warning(traceback.format_exc())
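# A minimal sketch (not from the cylc source) of the prefix dispatch used by
# _manip_task_jobs_callback() above: each line of the batch command output is
# routed to the handler registered for its prefix, with the prefix stripped.
# The prefix strings in the usage example below are illustrative only.
def dispatch_output_lines(out, handlers):
    """Route each line of `out` to the first (prefix, callback) that matches."""
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                callback(line[len(prefix):].strip())
                break

summaries = []
dispatch_output_lines(
    "[SUMMARY]:ts|1/foo/01|0\n[COMMAND]:ts|some command output\n",
    [("[SUMMARY]:", summaries.append),
     ("[COMMAND]:", lambda line: None)])
assert summaries == ["ts|1/foo/01|0"]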

    def _poll_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when poll tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._poll_task_job_callback,
            {BatchSysManager.OUT_PREFIX_MESSAGE:
             self._poll_task_job_message_callback})

    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        items = line.split("|")
        # See cylc.batch_sys_manager.JobPollContext
        try:
            (
                batch_sys_exit_polled, run_status, run_signal,
                time_submit_exit, time_run, time_run_exit
            ) = items[4:10]
        except IndexError:
            itask.summary['latest_message'] = 'poll failed'
            cylc.flags.iflag = True
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        finally:
            self.task_events_mgr.log_task_job_activity(
                ctx, suite, itask.point, itask.tdef.name)
        if run_status == "1" and run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
                time_run_exit)
        elif run_status == "1" and batch_sys_exit_polled == "1":
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
                time_run_exit)
            self.task_events_mgr.process_message(
                itask, INFO, TaskMessage.FAIL_MESSAGE_PREFIX + run_signal,
                self.poll_task_jobs, time_run_exit)
        elif run_status == "1":
            # The job has terminated, but is still managed by batch system.
            # Some batch systems may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
                time_run)
        elif run_status == "0":
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, self.poll_task_jobs,
                time_run_exit)
        elif time_run and batch_sys_exit_polled == "1":
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs, "")
        elif time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
                time_run)
        elif batch_sys_exit_polled == "1":
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                self.poll_task_jobs, time_submit_exit)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, self.poll_task_jobs,
                time_submit_exit)

    def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on message of one task job."""
        ctx = SuiteProcContext(self.JOBS_POLL, None)
        ctx.out = line
        try:
            event_time, priority, message = line.split("|")[2:5]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = 0
            self.task_events_mgr.process_message(
                itask, priority, message, self.poll_task_jobs, event_time)
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)

    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if cylc.flags.debug:
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(cmd_key, cmd), callback, [suite, itasks])
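# A sketch (not from the cylc source) of the per-job-host batching done by
# _run_job_cmd() above (and by submit_task_jobs): group task proxies by
# (task_host, task_owner) so that a single "cylc jobs-*" command is issued
# per user@host. The Task namedtuple below is a stand-in for TaskProxy.
from collections import namedtuple

def group_by_auth(itasks):
    """Return {(host, owner): [itask, ...]}."""
    auth_itasks = {}
    for itask in itasks:
        auth_itasks.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)
    return auth_itasks

Task = namedtuple('Task', 'identity task_host task_owner')
groups = group_by_auth([Task('a.1', 'hpc1', None),
                        Task('b.1', 'hpc1', None),
                        Task('c.1', 'localhost', None)])
assert sorted(len(itasks) for itasks in groups.values()) == [1, 2]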

    @staticmethod
    def _set_retry_timers(itask, rtconfig=None):
        """Set try number and retry delays."""
        if rtconfig is None:
            rtconfig = itask.tdef.rtconfig
        try:
            no_retry = (
                rtconfig[itask.tdef.run_mode + ' mode']['disable retries'])
        except KeyError:
            no_retry = False
        if not no_retry:
            for key, cfg_key in [
                    (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
                    (TASK_STATUS_RETRYING, 'execution retry delays')]:
                delays = rtconfig['job'][cfg_key]
                try:
                    itask.try_timers[key].set_delays(delays)
                except KeyError:
                    itask.try_timers[key] = TaskActionTimer(delays=delays)
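# A toy stand-in (not the real cylc TaskActionTimer) for the retry-delay
# handling configured by _set_retry_timers() above: a list of delays taken
# from "submission retry delays" or "execution retry delays" is consumed one
# attempt at a time until retries are exhausted.
class RetryDelays(object):
    def __init__(self, delays):
        self.delays = list(delays)
        self.num = 0                     # attempts made so far

    def next_delay(self):
        """Return the next delay in seconds, or None when exhausted."""
        if self.num >= len(self.delays):
            return None
        delay = self.delays[self.num]
        self.num += 1
        return delay

timer = RetryDelays([30, 60, 120])       # e.g. execution retry delays
assert [timer.next_delay() for _ in range(4)] == [30, 60, 120, None]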

    def _simulation_submit_task_jobs(self, itasks):
        """Simulation mode task jobs submission."""
        for itask in itasks:
            self._set_retry_timers(itask)
            itask.task_host = 'SIMULATION'
            itask.task_owner = 'SIMULATION'
            itask.summary['batch_sys_name'] = 'SIMULATION'
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = (
                itask.tdef.rtconfig['job']['simulated run length'])
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, self.poll_task_jobs)

    def _submit_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when submit task jobs command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._submit_task_job_callback,
            {BatchSysManager.OUT_PREFIX_COMMAND: self._job_cmd_out_callback})

    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SuiteProcPool.JOB_SKIPPED_FLAG:
            return

        try:
            itask.summary['submit_method_id'] = items[3]
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, '%s at %s' % (
                    TASK_OUTPUT_SUBMITTED, ctx.timestamp),
                self.poll_task_jobs)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, '%s at %s' % (
                    self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp),
                self.poll_task_jobs)

    def _prep_submit_task_job(self, suite, itask, dry_run):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask)
            local_job_file_path = self.task_events_mgr.get_task_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num,
                self.JOB_FILE_BASE)
            self.job_file_writer.write(local_job_file_path, job_conf)
        except Exception as exc:
            # Could be a bad command template.
            LOG.error(traceback.format_exc())
            self.task_events_mgr.log_task_job_activity(
                SuiteProcContext(
                    self.JOBS_SUBMIT,
                    '(prepare job file)', err=exc, ret_code=1),
                suite, itask.point, itask.tdef.name)
            if not dry_run:
                self.task_events_mgr.process_message(
                    itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    self.poll_task_jobs)
            return
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            # This will be shown next to submit num in gcylc:
            itask.summary['latest_message'] = 'job file written (edit/dry-run)'
            LOG.debug("[%s] -%s" % (
                itask.identity, itask.summary['latest_message']))

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
Example #51
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job."""
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides)
        else:
            rtconfig = itask.tdef.rtconfig

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        })

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = get_task_host(rtconfig['remote']['host'])
        if not itask.task_host:
            itask.task_host = 'localhost'
        elif itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        else:
            user_at_host = itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[rtconfig['job']['batch system']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
            # minutes after time limit exceeded
            itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
                TaskActionTimer(delays=batch_sys_conf.get(
                    'execution time limit polling intervals', [60, 120, 420])))
        for label, key in [
                ('submission polling intervals', TASK_STATUS_SUBMITTED),
                ('execution polling intervals', TASK_STATUS_RUNNING)]:
            if key in itask.poll_timers:
                itask.poll_timers[key].reset()
            else:
                values = self.task_events_mgr.get_host_conf(
                    itask, label, skey='job')
                if values:
                    itask.poll_timers[key] = TaskActionTimer(delays=values)

        self.init_host(suite, itask.task_host, itask.task_owner)
        if itask.state.outputs.has_custom_triggers():
            self.suite_db_mgr.put_update_task_outputs(itask)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        })
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, self.JOB_FILE_BASE)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': rtconfig['job']['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
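# A minimal, hedged sketch (not part of the module above) of how the
# execution-time-limit poll delays are resolved: any configured
# 'execution time limit polling intervals' win, otherwise the defaults of
# roughly 1, 2 and 7 minutes apply. 'batch_sys_conf' here is a hypothetical
# stand-in for the host/batch-system configuration.
batch_sys_conf = {}  # no site override configured
delays = batch_sys_conf.get(
    'execution time limit polling intervals', [60, 120, 420])
print(delays)  # -> [60, 120, 420] seconds, i.e. ~1, 2 and 7 minutes

batch_sys_conf = {'execution time limit polling intervals': [30, 30, 30]}
delays = batch_sys_conf.get(
    'execution time limit polling intervals', [60, 120, 420])
print(delays)  # -> [30, 30, 30]: the configured intervals take precedence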
예제 #52
0
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    """Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is first consumed from the beginning, then from the previously
    committed offset. If the message is not found by the end of the topic,
    commit the offset and return (the trigger will try again later). If
    found, return the result.

    Kafka commits offsets per "consumer group" so the group_id argument
    must be unique per distinct trigger in the suite - this allows each trigger
    to separately consume the topic from the beginning, looking for its own
    messages (otherwise, with shared offsets, one trigger could move the offset
    beyond the messages of another trigger). This applies to successive
    instances of an externally-triggered cycling task too, because
    out-of-order triggering may sometimes be required. So this argument
    should typically be, e.g.:

        group_id=x%(id)s  # where %(id)s is the ID of the dependent task

    where "x" is an arbitrary string you can use to change the group name if
    you need to re-run the suite, and the messages, from the start again,
    without re-running the producer suite. Note this also serves to make the
    function signature cycle-point-specific for Cylc even if the message does
    not contain the cycle point (although it probably should).

    The "message" argument is a stringified dict, e.g.:
        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}
    should be represented as:
        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs Kafka if all message dict items match, and the result returned
    is the sub-dict of the actual values of items containing
    angle-bracket-delineated regex patterns. E.g. above {'data': 'nwp-2025.nc'}.

    """

    consumer = KafkaConsumer(kafka_topic,
                             bootstrap_servers=[kafka_server],
                             value_deserializer=json.loads,
                             consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
                             auto_offset_reset='earliest',
                             group_id=group_id)

    # Construct a dict from the message argument "key1:val1 key2:val2 ...".
    cylc_msg = dict(m.split(':') for m in message.split())

    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        m = _match_msg(cylc_msg, kafka_msg)
        if m:
            result = (True, m)
            break
        # (else consume and compare next message)
    consumer.commit()
    # Unsubscribe before exit, otherwise next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()
    if debug:
        if result[0]:
            res = "\n  MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug('Kafka: "%s" (consumed %d) ... %s' % (message, n_cons, res))
    return result
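# A hedged, self-contained sketch of the matching convention described in the
# docstring above. The real module's _match_msg() is not shown here; the
# match_msg_sketch() helper below is an assumed, simplified reimplementation
# that only demonstrates the angle-bracket regex rule.
import re


def match_msg_sketch(trigger_msg, kafka_msg):
    """Return the sub-dict of actual values for pattern items, else None.

    Items whose values are wrapped in angle brackets are treated as regex
    patterns; all other items must match exactly (assumed logic).
    """
    result = {}
    for key, val in trigger_msg.items():
        actual = kafka_msg.get(key)
        if actual is None:
            return None
        if val.startswith('<') and val.endswith('>'):
            if not re.match(val[1:-1] + '$', actual):
                return None
            result[key] = actual
        elif actual != val:
            return None
    return result


# Parse a stringified trigger message, then compare it with a consumed one.
trigger = dict(
    m.split(':')
    for m in "system:prod point:2025 data:<nwp.*\\.nc>".split())
print(match_msg_sketch(
    trigger, {'system': 'prod', 'point': '2025', 'data': 'nwp-2025.nc'}))
# -> {'data': 'nwp-2025.nc'}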
예제 #53
0
    def process_message(self,
                        itask,
                        severity,
                        message,
                        event_time=None,
                        flag='',
                        submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        log_message = '(current:%s)%s %s at %s' % (itask.state.status, flag,
                                                   message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' %
                (itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if flag == self.POLLED_FLAG:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(itask,
                                                   {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug('(current:%s) unhandled: %s' %
                      (itask.state.status, message),
                      itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(itask,
                                   ("message %s" % str(severity).lower()),
                                   message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
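# A hedged sketch of the "poll to confirm" rule described in the docstring
# above: an incoming message that would move the task state backward is not
# believed immediately; a confirming poll is requested instead. The status
# ordering below is hypothetical and only stands in for the real
# itask.state.is_gt() comparison.
_STATUS_ORDER = ['submitted', 'submit-failed', 'running', 'failed',
                 'succeeded']  # assumed ranking, for illustration only


def poll_required(current_status, incoming_status, is_incoming):
    """Return True if a confirming poll should replace the state change."""
    return (is_incoming and
            _STATUS_ORDER.index(current_status) >
            _STATUS_ORDER.index(incoming_status))


print(poll_required('succeeded', 'running', True))   # True: confirm by poll
print(poll_required('running', 'succeeded', True))   # False: move forward
print(poll_required('succeeded', 'running', False))  # False: not incoming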
예제 #54
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
            ".service/passphrase": HTTP(S) task comm
            ".service/ssl.cert": HTTPS task comm
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create "stdin_file_paths" file, with "items" in it.
        tmphandle = NamedTemporaryFile()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(self.uuid_str)
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(self.uuid_str)
        cmd.append(glbl_cfg().get_derived_host_item(self.suite,
                                                    'suite run directory',
                                                    host, owner))
        self.proc_pool.put_command(
            SuiteProcContext('remote-init',
                             cmd,
                             stdin_file_paths=[tmphandle.name]),
            self._remote_init_callback, [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
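# A hedged, self-contained sketch of the tar-over-stdin technique used by
# remote_init() above: items are packed into a temporary tar archive, the
# handle is rewound, and the file is then fed to the remote command's stdin
# (via stdin_file_paths). The file created below is a throwaway placeholder,
# not a real suite service item.
import os
import tarfile
from tempfile import NamedTemporaryFile

src = NamedTemporaryFile(prefix='contact-', delete=False)
src.write(b'demo')  # stand-in content for a ".service/contact" style file
src.close()

tmphandle = NamedTemporaryFile()
tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
tarhandle.add(src.name, arcname='contact')
tarhandle.close()
tmphandle.seek(0)  # rewind so a subprocess can read the tarball from stdin

print(tarfile.open(fileobj=tmphandle).getnames())  # -> ['contact']
os.unlink(src.name)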
예제 #55
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and event in self._get_events_conf(
                itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask, 'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
            id_key = (key1, str(itask.point), itask.tdef.name,
                      itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name": quote(
                        str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id": quote(
                        str(itask.summary['submit_method_id'])),
                    "submit_time": quote(
                        str(itask.summary['submitted_time_string'])),
                    "start_time": quote(
                        str(itask.summary['started_time_string'])),
                    "finish_time": quote(
                        str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host),
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (handler, event, self.suite,
                                                  itask.identity, message)
            LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
            self.event_timers[id_key] = TaskActionTimer(
                CustomTaskEventHandlerContext(key1, self.HANDLER_CUSTOM, cmd),
                retry_delays)
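# A hedged sketch of the template-or-classic-interface fallback above: if
# %-interpolation leaves the handler command unchanged (nothing substituted),
# the handler is assumed to take the four classic positional arguments
# instead. All values below are hypothetical.
handler_data = {'event': 'failed', 'id': 'foo.20250101T0000Z'}

for handler in ['my-handler %(event)s %(id)s', 'my-handler']:
    cmd = handler % handler_data
    if cmd == handler:
        # Nothing was substituted: fall back to the classic 4-argument form.
        cmd = "%s '%s' '%s' '%s' '%s'" % (
            handler, 'failed', 'my.suite', 'foo.20250101T0000Z', 'job failed')
    print(cmd)
# -> my-handler failed foo.20250101T0000Z
# -> my-handler 'failed' 'my.suite' 'foo.20250101T0000Z' 'job failed'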
예제 #56
0
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" % (
                        point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (
                        point, name, submit_num, key1,
                        timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within a
                # given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now
            ):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SuiteProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd, env=os.environ, shell=True,
                    ),
                    self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
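# A hedged sketch of the mail-throttling condition above: a mail handler that
# is otherwise ready to run is deferred while the current time is still
# before next_mail_time, so notifications accumulate into one email per
# interval. This is a restatement for illustration, not the original code.
import time


def mail_deferred(next_mail_time, now):
    """True if a ready mail handler should wait for the next batch window."""
    return next_mail_time is not None and next_mail_time > now


now = time.time()
mail_interval = 300  # e.g. batch mail notifications into 5-minute windows
next_mail_time = now + mail_interval
print(mail_deferred(next_mail_time, now + 60))   # True: still in the window
print(mail_deferred(next_mail_time, now + 600))  # False: window has passed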