def report(self, request, server_obj):
    """Log client requests with identifying information.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info request from each client.

    """
    if threading.current_thread().__class__.__name__ == '_MainThread':
        # Server methods may be called internally as well as by clients.
        return
    auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
    name = server_obj.__class__.__name__
    log_me = (
        cylc.flags.debug or
        name in ["SuiteCommandServer",
                 "ExtTriggerServer",
                 "BroadcastServer"] or
        (name not in ["SuiteIdServer", "TaskMessageServer"] and
         uuid not in self.clients))
    if log_me:
        LOG.debug(
            self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                user, host, prog_name, priv_level, uuid)
        )
        LOG.info(
            self.__class__.LOG_COMMAND_TMPL % (
                request, user, host, prog_name, uuid))
    if name == "SuiteIdServer":
        self._num_id_requests += 1
        self.report_id_requests()
    self.clients[uuid] = datetime.datetime.utcnow()
    self._housekeep()
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced - don't
    update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    o_status, o_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif status in TASK_STATUSES_ACTIVE:
        if self.status == TASK_STATUS_HELD:
            self.hold_swap = TASK_STATUS_HELD
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    flags.iflag = True
    # Log
    message = str(o_status)
    if o_hold_swap:
        message += " (%s)" % o_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug(message, itask=self.identity)
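# A hedged trace of the hold-swap logic in _set_state above; the status
# constants stand for their usual string values ("waiting", "held",
# "running", "retrying") and the transitions shown are worked out from the
# branches of the method, not taken from suite logs:
#
#     _set_state(TASK_STATUS_HELD)      # waiting => held (waiting)
#     _set_state(TASK_STATUS_RUNNING)   # held (waiting) => running (held)
#     _set_state(TASK_STATUS_RETRYING)  # running (held) => held (retrying)
#
# i.e. holding remembers the pre-hold status in hold_swap, an active status
# arriving while held re-flags the task to be re-held, and a later
# non-final change is swapped back to "held" with the target status kept.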
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. Succeeded
    tasks are only polled if poll_succ is True.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as helper/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    to_poll_tasks = []
    pollable_statuses = set([
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    for itask in itasks:
        if itask.state.status in pollable_statuses:
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if to_poll_tasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, to_poll_tasks,
            self._poll_task_jobs_callback)
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session

    """
    self._check_access_priv(required_privilege_level)
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
        user, host, prog_name, priv_level, uuid))
    if cylc.flags.debug or uuid not in self.clients and log_info:
        LOG.info(self.__class__.LOG_COMMAND_TMPL % (
            command, user, host, prog_name, uuid))
    self.clients.setdefault(uuid, {})
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def _remote_init_items(self, host, owner):
    """Return list of items that should be installed on task remote.

    Each item is (path, name), where name is relative path under suite
    run directory.
    """
    items = []
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    LOG.debug('comm_meth=%s' % comm_meth)
    if comm_meth in ['ssh', 'http', 'https']:
        # Contact file
        items.append(
            (self.suite_srv_files_mgr.get_contact_file(self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_CONTACT)))
    if comm_meth in ['http', 'https']:
        # Passphrase file
        items.append(
            (self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE,
                self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE)))
    if comm_meth in ['https']:
        # SSL certificate file
        items.append(
            (self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT,
                self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_SSL_CERT)))
    return items
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. Succeeded
    tasks are only polled if poll_succ is True.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as helper/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    poll_me = []
    pollable = [
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED]
    for itask in itasks:
        if itask.state.status in pollable or (
                itask.state.status == TASK_STATUS_SUCCEEDED and poll_succ):
            poll_me.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if poll_me:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, poll_me,
            self._poll_task_jobs_callback)
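# A hedged usage note for poll_task_jobs above (the manager and argument
# names are placeholders for whatever the caller holds): poll_succ=False
# skips tasks already in the succeeded state.
#
#     task_job_mgr.poll_task_jobs(suite, itasks, poll_succ=False)
#     # Or poll everything pollable, recording a reason in the suite log:
#     task_job_mgr.poll_task_jobs(suite, itasks, msg="polling all tasks")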
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SuiteProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            'is_manual_submit': itask.is_manual_submit,
            'try_num': itask.get_try_num(),
            'time_submit': get_current_time_string(),
            'batch_sys_name': itask.summary.get('batch_sys_name'),
        })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced - don't
    update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    o_status, o_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    flags.iflag = True
    # Log
    message = str(o_status)
    if o_hold_swap:
        message += " (%s)" % o_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug(message, itask=self.identity)
def __init__(self, pool_size=None):
    self.pool_size = (
        pool_size or
        GLOBAL_CFG.get(["process pool size"]) or
        multiprocessing.cpu_count())
    # (The Pool class defaults to cpu_count anyway, but does not
    # expose the result via its public interface).
    LOG.debug(
        "Initializing process pool, size %d" % self.pool_size)
    self.pool = multiprocessing.Pool(processes=self.pool_size)
    self.results = {}
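# A minimal lifecycle sketch for the process pool wrapper initialised
# above, assuming the enclosing class is SuiteProcPool (the class name is
# referenced elsewhere in these snippets but not shown here); close() and
# join() are the methods reproduced further below.
#
#     pool = SuiteProcPool(pool_size=4)   # assumed class name
#     ...                                 # put_command(...) calls go here
#     pool.close()                        # stop accepting new commands
#     pool.join()                         # wait for workers to exit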
def _housekeep(self):
    """Forget inactive clients."""
    for uuid, client_info in self.clients.copy().items():
        if time() - client_info['time'] > self.CLIENT_FORGET_SEC:
            try:
                del self.clients[uuid]
            except KeyError:
                pass
            LOG.debug(self.LOG_FORGET_TMPL % uuid)
def join(self):
    """Join after workers have exited. Close or terminate first."""
    LOG.debug("Joining process pool")
    try:
        self.pool.join()
    except AssertionError:
        # multiprocessing.Pool.join may raise this error. We want to
        # ignore this so suite shutdown can continue.
        pass
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.

    """
    if itask.local_job_file_path and not dry_run:
        return itask

    # Handle broadcasts
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides, prepend=True)
    else:
        rtconfig = itask.tdef.rtconfig

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    try:
        task_host = self.task_remote_mgr.remote_host_select(
            rtconfig['remote']['host'])
    except TaskRemoteMgmtError as exc:
        # Submit number not yet incremented
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(remote host select)', exc)
        return False
    else:
        if task_host is None:  # host select not ready
            itask.summary['latest_message'] = self.REMOTE_SELECT_MSG
            return
        itask.task_host = task_host

    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except StandardError as exc:
        # Could be a bad command template, IOError, etc
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(prepare job file)', exc)
        return False
    itask.local_job_file_path = local_job_file_path

    if dry_run:
        # This will be shown next to submit num in gcylc:
        itask.summary['latest_message'] = 'job file written (edit/dry-run)'
        LOG.debug(itask.summary['latest_message'], itask=itask)

    # Return value used by "cylc submit" and "cylc jobscript":
    return itask
def _housekeep(self):
    """Forget inactive clients."""
    for uuid in self.clients.keys():
        dtime = self.clients[uuid]
        if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                self.__class__.CLIENT_FORGET_SEC):
            del self.clients[uuid]
            LOG.debug(
                self.__class__.LOG_FORGET_TMPL % uuid)
def signout(self):
    """Forget client, where possible."""
    uuid = _get_client_info()[4]
    try:
        del self.clients[uuid]
    except KeyError:
        return False
    else:
        LOG.debug(self.LOG_FORGET_TMPL % uuid)
        return True
def _housekeep(self):
    """Forget inactive clients."""
    for uuid, dtime in self.clients.copy().items():
        if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                self.__class__.CLIENT_FORGET_SEC):
            try:
                del self.clients[uuid]
            except KeyError:
                pass
            LOG.debug(
                self.__class__.LOG_FORGET_TMPL % uuid)
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status
        LOG.debug(proc_ctx)
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
            proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
def _run_command(ctx):
    """Execute a shell command and capture its output and exit status."""
    LOG.debug(ctx)
    if (SuiteProcPool.STOP_JOB_SUBMISSION.value and
            ctx.cmd_key == SuiteProcPool.JOBS_SUBMIT):
        ctx.err = "job submission skipped (suite stopping)"
        ctx.ret_code = SuiteProcPool.JOB_SKIPPED_FLAG
        ctx.timestamp = get_current_time_string()
        return ctx
    try:
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                stdin_file = TemporaryFile()
                for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                    stdin_file.write(open(file_path, 'rb').read())
                stdin_file.seek(0)
            else:
                stdin_file = open(
                    ctx.cmd_kwargs['stdin_file_paths'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = PIPE
        else:
            stdin_file = open(os.devnull)
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            env=ctx.cmd_kwargs.get('env'),
            shell=ctx.cmd_kwargs.get('shell'))
    except IOError as exc:
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    except OSError as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    else:
        ctx.out, ctx.err = proc.communicate(ctx.cmd_kwargs.get('stdin_str'))
        ctx.ret_code = proc.wait()
    ctx.timestamp = get_current_time_string()
    return ctx
def _manip_task_jobs_callback(
        ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits."""
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    tasks = {}
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the
    # task will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING,
    # and its output line will be ignored here.
    for itask in itasks:
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
        # Something is very wrong here
        # Fallback to use "job_log_dirs" list to report the problem
        job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
        for job_log_dir in job_log_dirs:
            point, name, submit_num = job_log_dir.split(os.sep, 2)
            itask = tasks[(point, name, submit_num)]
            out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                    "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (KeyError, ValueError):
                    if cylc.flags.debug:
                        LOG.warning('Unhandled %s output: %s' % (
                            ctx.cmd_key, line))
                        LOG.warning(traceback.format_exc())
def callback(self, ctx):
    """Callback for asynchronous xtrigger functions.

    Record satisfaction status and function results dict.

    """
    LOG.debug(ctx)
    sig = ctx.get_signature()
    self.active.remove(sig)
    try:
        satisfied, results = json.loads(ctx.out)
    except ValueError:
        return
    LOG.debug('%s: returned %s' % (sig, results))
    if satisfied:
        self.pflag = True
        self.sat_xtrig[sig] = results
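# The callback above relies on the xtrigger function printing a JSON list
# of [satisfied, results] on stdout. A hedged example of that contract
# (the keys inside "results" depend entirely on the xtrigger function):
#
#     >>> import json
#     >>> satisfied, results = json.loads('[true, {"file": "/tmp/data"}]')
#     >>> satisfied, results
#     (True, {u'file': u'/tmp/data'})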
def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
    """Callback when "cylc remote-init" exits"""
    self.ready = True
    tmphandle.close()
    if proc_ctx.ret_code == 0:
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[(host, owner)] = status
                return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
def log_task_job_activity(self, ctx, suite, point, name, submit_num=NN):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = self.get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
def _run_command(ctx):
    """Execute a shell command and capture its output and exit status."""
    LOG.debug(ctx)
    if (SuiteProcPool.STOP_JOB_SUBMISSION.value and
            ctx.cmd_key == SuiteProcPool.JOBS_SUBMIT):
        ctx.err = "job submission skipped (suite stopping)"
        ctx.ret_code = SuiteProcPool.JOB_SKIPPED_FLAG
        ctx.timestamp = get_current_time_string()
        return ctx
    try:
        stdin_file = None
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            stdin_file = TemporaryFile()
            for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                for line in open(file_path):
                    stdin_file.write(line)
            stdin_file.seek(0)
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = PIPE
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            env=ctx.cmd_kwargs.get('env'),
            shell=ctx.cmd_kwargs.get('shell'))
    except IOError as exc:
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    except OSError as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        if cylc.flags.debug:
            traceback.print_exc()
        ctx.ret_code = 1
        ctx.err = str(exc)
    else:
        ctx.out, ctx.err = proc.communicate(ctx.cmd_kwargs.get('stdin_str'))
        ctx.ret_code = proc.wait()
    ctx.timestamp = get_current_time_string()
    return ctx
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        # This happens when there is no job directory, e.g. if the job
        # host selection command causes a submission failure, there will
        # be no job directory. In this case, just send the information
        # to the suite log.
        LOG.debug(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
def _run_command_init(cls, ctx, callback=None, callback_args=None):
    """Prepare and launch shell command in ctx."""
    try:
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                stdin_file = TemporaryFile()
                for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                    stdin_file.write(open(file_path, 'rb').read())
                stdin_file.seek(0)
            else:
                stdin_file = open(
                    ctx.cmd_kwargs['stdin_file_paths'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            stdin_file = TemporaryFile()
            stdin_file.write(ctx.cmd_kwargs.get('stdin_str'))
            stdin_file.seek(0)
        else:
            stdin_file = open(os.devnull)
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            # Execute command as a process group leader,
            # so we can use "os.killpg" to kill the whole group.
            preexec_fn=os.setpgrp,
            env=ctx.cmd_kwargs.get('env'),
            shell=ctx.cmd_kwargs.get('shell'))
    except (IOError, OSError) as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        LOG.exception(exc)
        ctx.ret_code = 1
        ctx.err = str(exc)
        cls._run_command_exit(ctx, callback, callback_args)
        return None
    else:
        LOG.debug(ctx.cmd)
        return proc
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to
    identify shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create "stdin_file_paths" file, with "items" in it.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite
    # host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        open(uuid_fname, 'wb').write(self.uuid_str)
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(self.uuid_str)
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
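# A hedged sketch of driving remote_init above from a main loop: the
# method is non-blocking and returns None while "cylc remote-init" is
# still running, so callers re-invoke it each pass until a terminal status
# appears (the surrounding loop and names here are illustrative only).
#
#     status = task_remote_mgr.remote_init(host, owner)
#     if status is None:
#         pass  # still initialising - check again on the next pass
#     elif status == REMOTE_INIT_FAILED:
#         pass  # map entry was reset above, so a later call can retry
#     else:
#         pass  # REMOTE_INIT_DONE / REMOTE_INIT_NOT_REQUIRED: proceed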
def _prep_submit_task_job_impl(self, suite, itask):
    """Helper for self._prep_submit_task_job."""
    overrides = BroadcastServer.get_inst().get(itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides)
    else:
        rtconfig = itask.tdef.rtconfig

    # Retry delays, needed for the try_num
    self._set_retry_timers(itask, rtconfig)

    # Submit number and try number
    LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
    itask.submit_num += 1
    itask.summary['submit_num'] = itask.submit_num
    itask.local_job_file_path = None
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        "is_manual_submit": itask.is_manual_submit,
        "try_num": itask.get_try_num(),
        "time_submit": get_current_time_string(),
    })

    itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
    for name in rtconfig['extra log files']:
        itask.summary['logfiles'].append(expandvars(name))

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    # host may be None (= run task on suite host)
    itask.task_host = get_task_host(rtconfig['remote']['host'])
    if not itask.task_host:
        itask.task_host = 'localhost'
    elif itask.task_host != "localhost":
        LOG.info("[%s] -Task host: %s" % (
            itask.identity, itask.task_host))

    itask.task_owner = rtconfig['remote']['owner']

    if itask.task_owner:
        user_at_host = itask.task_owner + "@" + itask.task_host
    else:
        user_at_host = itask.task_host
    itask.summary['host'] = user_at_host
    itask.summary['job_hosts'][itask.submit_num] = user_at_host
    try:
        batch_sys_conf = self.task_events_mgr.get_host_conf(
            itask, 'batch systems')[rtconfig['job']['batch system']]
    except (TypeError, KeyError):
        batch_sys_conf = {}
    try:
        itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
            rtconfig['job']['execution time limit'])
    except TypeError:
        pass
    if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
        # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
        # minutes after time limit exceeded
        itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
            TaskActionTimer(delays=batch_sys_conf.get(
                'execution time limit polling intervals',
                [60, 120, 420])))
    for label, key in [
            ('submission polling intervals', TASK_STATUS_SUBMITTED),
            ('execution polling intervals', TASK_STATUS_RUNNING)]:
        if key in itask.poll_timers:
            itask.poll_timers[key].reset()
        else:
            values = self.task_events_mgr.get_host_conf(
                itask, label, skey='job')
            if values:
                itask.poll_timers[key] = TaskActionTimer(delays=values)

    self.init_host(suite, itask.task_host, itask.task_owner)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "user_at_host": user_at_host,
        "batch_sys_name": itask.summary['batch_sys_name'],
    })
    itask.is_manual_submit = False

    scripts = self._get_job_scripts(itask, rtconfig)

    # Location of job file, etc
    self._create_job_log_path(suite, itask)
    job_d = self.task_events_mgr.get_task_job_id(
        itask.point, itask.tdef.name, itask.submit_num)
    job_file_path = os.path.join(
        GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner),
        job_d, self.JOB_FILE_BASE)
    return {
        'batch_system_name': rtconfig['job']['batch system'],
        'batch_submit_command_template': (
            rtconfig['job']['batch submit command template']),
        'batch_system_conf': batch_sys_conf,
        'directives': rtconfig['directives'],
        'environment': rtconfig['environment'],
        'execution_time_limit': itask.summary[self.KEY_EXECUTE_TIME_LIMIT],
        'env-script': rtconfig['env-script'],
        'err-script': rtconfig['err-script'],
        'host': itask.task_host,
        'init-script': rtconfig['init-script'],
        'job_file_path': job_file_path,
        'job_d': job_d,
        'namespace_hierarchy': itask.tdef.namespace_hierarchy,
        'owner': itask.task_owner,
        'param_var': itask.tdef.param_var,
        'post-script': scripts[2],
        'pre-script': scripts[0],
        'remote_suite_d': rtconfig['remote']['suite definition directory'],
        'script': scripts[1],
        'shell': rtconfig['job']['shell'],
        'submit_num': itask.submit_num,
        'suite_name': suite,
        'task_id': itask.identity,
        'try_num': itask.get_try_num(),
        'work_d': rtconfig['work sub-directory'],
    }
def close(self):
    """Close the pool to new commands."""
    if not (self.is_dead() or self.is_closed()):
        LOG.debug("Closing process pool")
        self.pool.close()
def process_events(self, schd_ctx):
    """Process task events that were created by "setup_event_handlers".

    schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
    """
    ctx_groups = {}
    now = time()
    for id_key, timer in self.event_timers.copy().items():
        key1, point, name, submit_num = id_key
        if timer.is_waiting:
            continue
        # Set timer if timeout is None.
        if not timer.is_timeout_set():
            if timer.next() is None:
                LOG.warning("%s/%s/%02d %s failed" % (
                    point, name, submit_num, key1))
                del self.event_timers[id_key]
                continue
            # Report retries and delayed 1st try
            tmpl = None
            if timer.num > 1:
                tmpl = "%s/%s/%02d %s failed, retrying in %s"
            elif timer.delay:
                tmpl = "%s/%s/%02d %s will run after %s"
            if tmpl:
                LOG.debug(tmpl % (
                    point, name, submit_num, key1,
                    timer.delay_timeout_as_str()))
        # Ready to run?
        if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within
                # a given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now):
            continue

        timer.set_waiting()
        if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
            # Run custom event handlers on their own
            self.proc_pool.put_command(
                SuiteProcContext(
                    (key1, submit_num),
                    timer.ctx.cmd, env=os.environ, shell=True,
                ),
                self._custom_handler_callback, [schd_ctx, id_key])
        else:
            # Group together built-in event handlers, where possible
            if timer.ctx not in ctx_groups:
                ctx_groups[timer.ctx] = []
            ctx_groups[timer.ctx].append(id_key)

    next_mail_time = now + self.mail_interval
    for ctx, id_keys in ctx_groups.items():
        if ctx.ctx_type == self.HANDLER_MAIL:
            # Set next_mail_time if any mail sent
            self.next_mail_time = next_mail_time
            self._process_event_email(schd_ctx, ctx, id_keys)
        elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
            self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers."""
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask, 'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string
        # to prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue

        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
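# A hedged illustration of the two handler interfaces accepted above (the
# command names are made up for this sketch). A template string is filled
# from handler_data; a plain command is left unchanged by the %
# substitution, so the classic 4-argument form is appended instead:
#
#     # Template interface - named fields substituted from handler_data:
#     handler = "notify.sh %(event)s %(suite)s %(id)s %(message)s"
#     # Classic interface - nothing substitutes, so the command is called
#     # with event, suite, task ID and message as positional arguments:
#     handler = "notify.sh"
#     # -> notify.sh '<event>' '<suite>' '<task id>' '<message>'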
def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
    """Prepare a task job submission.

    Return itask on a good preparation.

    """
    if itask.local_job_file_path and not dry_run:
        return itask

    # Handle broadcasts
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides, prepend=True)
    else:
        rtconfig = itask.tdef.rtconfig

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    try:
        task_host = self.task_remote_mgr.remote_host_select(
            rtconfig['remote']['host'])
    except TaskRemoteMgmtError as exc:
        # Submit number not yet incremented
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.summary['job_hosts'][itask.submit_num] = ''
        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(remote host select)', exc)
        return False
    else:
        if task_host is None:  # host select not ready
            itask.summary['latest_message'] = self.REMOTE_SELECT_MSG
            return
        itask.task_host = task_host
        # Submit number not yet incremented
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
        local_job_file_path = get_task_job_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        self.job_file_writer.write(local_job_file_path, job_conf,
                                   check_syntax=check_syntax)
    except StandardError as exc:
        # Could be a bad command template, IOError, etc
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(prepare job file)', exc)
        return False
    itask.local_job_file_path = local_job_file_path

    if dry_run:
        # This will be shown next to submit num in gcylc:
        itask.summary['latest_message'] = 'job file written (edit/dry-run)'
        LOG.debug(itask.summary['latest_message'], itask=itask)

    # Return value used by "cylc submit" and "cylc jobscript":
    return itask
def terminate(self):
    """Kill all worker processes immediately."""
    if not self.is_dead():
        LOG.debug("Terminating process pool")
        self.pool.terminate()
def process_message(self, itask, severity, message, poll_func,
                    poll_event_time=None, is_incoming=False):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    """
    is_polled = poll_event_time is not None
    # Log incoming messages with '>' to distinguish non-message log
    # entries
    message_flag = ""
    if is_incoming:
        message_flag = self.INCOMING_FLAG
    log_message = '(current:%s)%s %s' % (
        itask.state.status, message_flag, message)
    if poll_event_time is not None:
        log_message += ' %s' % self.POLLED_INDICATOR
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)

    # Strip the "at TIME" suffix.
    event_time = poll_event_time
    if not event_time:
        match = self.RE_MESSAGE_TIME.match(message)
        if match:
            message, event_time = match.groups()
    if not event_time:
        event_time = get_current_time_string()

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if is_polled:
        itask.summary['latest_message'] += " %s" % self.POLLED_INDICATOR
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
            return
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
            return
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if self._poll_to_confirm(
                itask, TASK_STATUS_SUBMIT_FAILED, poll_func):
            return
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
            return
        self._process_message_submitted(itask, event_time)
    elif message.startswith(TaskMessage.FAIL_MESSAGE_PREFIX):
        # Task received signal.
        signal = message[len(TaskMessage.FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(TaskMessage.ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        aborted_with = message[len(TaskMessage.ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(TaskMessage.VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_event_time('started')  # reset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        try:
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        # Believe this and change state without polling (could poll?).
        cylc.flags.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        cylc.flags.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current: %s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    if severity == "CUSTOM":
        self.setup_event_handlers(itask, "custom", message)
    elif severity in [TaskMessage.WARNING, TaskMessage.CRITICAL]:
        self.setup_event_handlers(itask, severity.lower(), message)
def process_message(
        self, itask, severity, message, event_time=None, flag='',
        submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be:
                INCOMING_FLAG: to indicate an incoming message.
                POLLED_FLAG: to indicate a message resulted from a poll.
            Otherwise, the message is assumed to be generated by the
            logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)
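# A hedged sketch of the poll-to-confirm contract documented above: this
# version of process_message returns True when an incoming message would
# move the task state backward, and the caller is expected to poll and
# then believe the poll result (the caller names are illustrative only).
#
#     if task_events_mgr.process_message(
#             itask, severity, message, event_time,
#             flag=task_events_mgr.INCOMING_FLAG, submit_num=submit_num):
#         # Suspected state reversal: confirm by polling the job.
#         task_job_mgr.poll_task_jobs(suite, [itask])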
def join(self):
    """Join after workers have exited. Close or terminate first."""
    LOG.debug("Joining process pool")
    self.pool.join()
def process_message(self, itask, severity, message, poll_func,
                    poll_event_time=None, incoming_event_time=None,
                    submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    """
    # Log incoming messages with '>' to distinguish non-message log
    # entries
    if incoming_event_time:
        if submit_num is None or submit_num == itask.submit_num:
            message_flag = self.INCOMING_FLAG
        else:
            message_flag = self.IGNORED_INCOMING_FLAG
        event_time = incoming_event_time
    elif poll_event_time:
        message_flag = self.POLLED_FLAG
        event_time = poll_event_time
    else:
        message_flag = ''
        event_time = get_current_time_string()
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, message_flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if message_flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if poll_event_time is not None:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
            return
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
            return
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if self._poll_to_confirm(
                itask, TASK_STATUS_SUBMIT_FAILED, poll_func):
            return
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
            return
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        try:
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)
class TaskJobManager(object): """Manage task job submit, poll and kill. This class provides logic to: * Submit task jobs. * Poll task jobs. * Kill task jobs. * Set up the directory structure on job hosts. * Install suite communicate client files on job hosts. * Remove suite contact files on job hosts. """ JOB_FILE_BASE = BatchSysManager.JOB_FILE_BASE JOBS_KILL = "jobs-kill" JOBS_POLL = "jobs-poll" JOBS_SUBMIT = SuiteProcPool.JOBS_SUBMIT KEY_EXECUTE_TIME_LIMIT = "execution_time_limit" def __init__(self, suite, proc_pool, suite_db_mgr, suite_srv_files_mgr): self.suite = suite self.proc_pool = proc_pool self.suite_db_mgr = suite_db_mgr self.task_events_mgr = TaskEventsManager( suite, proc_pool, suite_db_mgr) self.task_events_mgr.job_poll = self.poll_task_jobs self.job_file_writer = JobFileWriter() self.suite_srv_files_mgr = suite_srv_files_mgr self.init_host_map = {} # {(user, host): should_unlink, ...} self.single_task_mode = False def check_task_jobs(self, suite, task_pool): """Check submission and execution timeout and polling timers. Poll tasks that have timed out and/or have reached next polling time. """ now = time() poll_tasks = set() for itask in task_pool.get_tasks(): if (self._check_timeout(itask, now) or self.task_events_mgr.set_poll_time(itask, now)): poll_tasks.add(itask) if poll_tasks: self.poll_task_jobs(suite, poll_tasks) def init_host(self, reg, host, owner): """Initialise suite run dir on a user@host. Create SUITE_RUN_DIR/log/job/ if necessary. Install suite contact environment file. Install suite python modules. Raise RemoteJobHostInitError if initialisation cannot complete. """ if host is None: host = 'localhost' if (self.single_task_mode or (host, owner) in self.init_host_map or not is_remote(host, owner)): return user_at_host = host if owner: user_at_host = owner + '@' + host r_suite_run_dir = GLOBAL_CFG.get_derived_host_item( reg, 'suite run directory', host, owner) r_log_job_dir = GLOBAL_CFG.get_derived_host_item( reg, 'suite job log directory', host, owner) r_suite_srv_dir = os.path.join( r_suite_run_dir, self.suite_srv_files_mgr.DIR_BASE_SRV) # Create a UUID file in the service directory. # If remote host has the file in its service directory, we can assume # that the remote host has a shared file system with the suite host. ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner) uuid_str = str(uuid4()) uuid_fname = os.path.join( self.suite_srv_files_mgr.get_suite_srv_dir(reg), uuid_str) try: open(uuid_fname, 'wb').close() proc = Popen( shlex.split(ssh_tmpl) + [ '-n', user_at_host, 'test', '-e', os.path.join(r_suite_srv_dir, uuid_str)], stdout=PIPE, stderr=PIPE) if proc.wait() == 0: # Initialised, but no need to tidy up self.init_host_map[(host, owner)] = False return finally: try: os.unlink(uuid_fname) except OSError: pass cmds = [] # Command to create suite directory structure on remote host. cmds.append(shlex.split(ssh_tmpl) + [ '-n', user_at_host, 'mkdir', '-p', r_suite_run_dir, r_log_job_dir, r_suite_srv_dir]) # Command to copy contact and authentication files to remote host. # Note: no need to do this if task communication method is "poll". should_unlink = GLOBAL_CFG.get_host_item( 'task communication method', host, owner) != "poll" if should_unlink: scp_tmpl = GLOBAL_CFG.get_host_item('scp command', host, owner) # Handle not having SSL certs installed. 
            try:
                ssl_cert = self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_SSL_CERT, reg)
            except (SuiteServiceFileError, ValueError):
                ssl_cert = None
            cmds.append(shlex.split(scp_tmpl) + [
                '-p',
                self.suite_srv_files_mgr.get_contact_file(reg),
                self.suite_srv_files_mgr.get_auth_item(
                    self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE, reg),
                user_at_host + ':' + r_suite_srv_dir + '/'])
            if ssl_cert is not None:
                cmds[-1].insert(-1, ssl_cert)
        # Command to copy python library to remote host.
        suite_run_py = os.path.join(
            GLOBAL_CFG.get_derived_host_item(reg, 'suite run directory'),
            'python')
        if os.path.isdir(suite_run_py):
            cmds.append(shlex.split(scp_tmpl) + [
                '-pr',
                suite_run_py, user_at_host + ':' + r_suite_run_dir + '/'])
        # Run commands in sequence.
        for cmd in cmds:
            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            out, err = proc.communicate()
            if proc.wait():
                raise RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_INIT,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err)
        self.init_host_map[(host, owner)] = should_unlink
        LOG.info('Initialised %s:%s' % (user_at_host, r_suite_run_dir))

    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks in itasks, and hold the tasks.

        Tasks that are not in an active state are skipped with a warning.

        """
        active_itasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                active_itasks.append(itask)
            else:
                LOG.warning(
                    'skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(
            self.JOBS_KILL, suite, active_itasks,
            self._kill_task_jobs_callback)

    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except for
        retrying tasks - which would poll (correctly) as failed. And don't
        poll succeeded tasks by default.

        """
        poll_me = []
        pollable = [
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED]
        for itask in itasks:
            if itask.state.status in pollable or (
                    itask.state.status == TASK_STATUS_SUCCEEDED and
                    poll_succ):
                poll_me.append(itask)
            else:
                LOG.debug(
                    "skipping %s: not pollable, "
                    "or skipping 'succeeded' tasks" % itask.identity)
        if poll_me:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, poll_me,
                self._poll_task_jobs_callback)

    def prep_submit_task_jobs(self, suite, itasks, dry_run=False):
        """Prepare task jobs for submit."""
        if not itasks:
            return
        prepared_tasks = []
        for itask in itasks:
            if self._prep_submit_task_job(suite, itask, dry_run) is not None:
                prepared_tasks.append(itask)
        return prepared_tasks

    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs."""
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks = self.prep_submit_task_jobs(suite, itasks)
        if not prepared_tasks:
            return

        # Submit task jobs
        auth_itasks = {}
        for itask in prepared_tasks:
            # The job file is now (about to be) used: reset the file write
            # flag so that subsequent manual retrigger will generate a new
            # job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for auth, itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", self.JOBS_SUBMIT]
            if cylc.flags.debug:
                cmd.append("--debug")
            host, owner = auth
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append("--")
            cmd.append(GLOBAL_CFG.get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            stdin_file_paths = []
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                if remote_mode:
                    stdin_file_paths.append(
                        self.task_events_mgr.get_task_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num, self.JOB_FILE_BASE))
                job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(
                    self.JOBS_SUBMIT,
                    cmd,
                    stdin_file_paths=stdin_file_paths,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks])

    def unlink_hosts_contacts(self, reg):
        """Remove suite contact files from initialised hosts.

        This is called on shutdown, so we don't want anything to hang.
        Terminate any incomplete SSH commands after 10 seconds.
        """
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), should_unlink in self.init_host_map.items():
            if not should_unlink:
                continue
            user_at_host = host
            if owner:
                user_at_host = owner + '@' + host
            ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
            r_suite_contact_file = os.path.join(
                GLOBAL_CFG.get_derived_host_item(
                    reg, 'suite run directory', host, owner),
                self.suite_srv_files_mgr.DIR_BASE_SRV,
                self.suite_srv_files_mgr.FILE_BASE_CONTACT)
            cmd = shlex.split(ssh_tmpl) + [
                '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
            procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for user_at_host, (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[user_at_host]
                out, err = proc.communicate()
                if proc.wait():
                    LOG.warning(RemoteJobHostInitError(
                        RemoteJobHostInitError.MSG_TIDY,
                        user_at_host,
                        ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        # Terminate any remaining commands
        for user_at_host, (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))

    def _check_timeout(self, itask, now):
        """Check/handle submission/execution timeouts."""
        if itask.state.status == TASK_STATUS_RUNNING:
            timer = itask.poll_timers.get(self.KEY_EXECUTE_TIME_LIMIT)
            if timer is not None:
                if not timer.is_timeout_set():
                    timer.next()
                if not timer.is_delay_done():
                    # Don't poll
                    return False
                if timer.next() is not None:
                    # Poll now, with more retries lined up
                    return True
                # No more retries lined up; can issue an execution timeout
                # event.
        if itask.state.status in itask.timeout_timers:
            timeout = itask.timeout_timers[itask.state.status]
            if timeout is None or now <= timeout:
                return False
            itask.timeout_timers[itask.state.status] = None
            if (itask.state.status == TASK_STATUS_RUNNING and
                    itask.summary['started_time'] is not None):
                msg = 'job started %s ago, but has not finished' % (
                    get_seconds_as_interval_string(
                        timeout - itask.summary['started_time']))
                event = 'execution timeout'
            elif (itask.state.status == TASK_STATUS_SUBMITTED and
                    itask.summary['submitted_time'] is not None):
                msg = 'job submitted %s ago, but has not started' % (
                    get_seconds_as_interval_string(
                        timeout - itask.summary['submitted_time']))
                event = 'submission timeout'
            else:
                return False
            LOG.warning(msg, itask=itask)
            self.task_events_mgr.setup_event_handlers(itask, event, msg)
            return True

    def _create_job_log_path(self, suite, itask):
        """Create job log directory for a task job, etc.

        Create local job directory, and NN symbolic link.
        If NN => 01, remove numbered directories with submit numbers greater
        than 01.
        Return a string in the form "POINT/NAME/SUBMIT_NUM".

        """
        job_file_dir = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        task_log_dir = os.path.dirname(job_file_dir)
        if itask.submit_num == 1:
            try:
                names = os.listdir(task_log_dir)
            except OSError:
                pass
            else:
                for name in names:
                    if name not in ["01", self.task_events_mgr.NN]:
                        rmtree(
                            os.path.join(task_log_dir, name),
                            ignore_errors=True)
        else:
            rmtree(job_file_dir, ignore_errors=True)

        mkdir_p(job_file_dir)
        target = os.path.join(task_log_dir, self.task_events_mgr.NN)
        source = os.path.basename(job_file_dir)
        try:
            prev_source = os.readlink(target)
        except OSError:
            prev_source = None
        if prev_source == source:
            return
        try:
            if prev_source:
                os.unlink(target)
            os.symlink(source, target)
        except OSError as exc:
            if not exc.filename:
                exc.filename = target
            raise exc

    @staticmethod
    def _get_job_scripts(itask, rtconfig):
        """Return pre-script, script, post-script for a job."""
        script = rtconfig['script']
        pre_script = rtconfig['pre-script']
        post_script = rtconfig['post-script']
        if itask.tdef.suite_polling_cfg:
            # Automatic suite state polling script
            comstr = "cylc suite-state " + \
                     " --task=" + itask.tdef.suite_polling_cfg['task'] + \
                     " --point=" + str(itask.point) + \
                     " --status=" + itask.tdef.suite_polling_cfg['status']
            if cylc.flags.debug:
                comstr += ' --debug'
            for key, fmt in [
                    ('user', ' --%s=%s'),
                    ('host', ' --%s=%s'),
                    ('interval', ' --%s=%d'),
                    ('max-polls', ' --%s=%s'),
                    ('run-dir', ' --%s=%s'),
                    ('template', ' --%s=%s')]:
                if rtconfig['suite state polling'][key]:
                    comstr += fmt % (
                        key, rtconfig['suite state polling'][key])
            comstr += " " + itask.tdef.suite_polling_cfg['suite']
            script = "echo " + comstr + "\n" + comstr
        return pre_script, script, post_script

    def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
        """Callback on job command STDOUT/STDERR."""
        if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
            user_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("host"):
            user_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
        elif cmd_ctx.cmd_kwargs.get("user"):
            user_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
        else:
            user_at_host = ""
        try:
            timestamp, _, content = line.split("|")
        except ValueError:
            pass
        else:
            line = "%s %s" % (timestamp, content)
        job_activity_log = self.task_events_mgr.get_task_job_activity_log(
            suite, itask.point, itask.tdef.name)
        try:
            with open(job_activity_log, "ab") as handle:
                if not line.endswith("\n"):
                    line += "\n"
                handle.write(user_at_host + line)
        except IOError as exc:
            LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
            LOG.warning(user_at_host + line, itask=itask)

    def _kill_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when kill tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._kill_task_job_callback,
            {BatchSysManager.OUT_PREFIX_COMMAND: self._job_cmd_out_callback})

    def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _kill_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_KILL, None)
        ctx.out = line
        try:
            ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)
        log_lvl = INFO
        log_msg = 'killed'
        if ctx.ret_code:  # non-zero exit status
            log_lvl = WARNING
            log_msg = 'kill failed'
            itask.state.kill_failed = True
        elif itask.state.status == TASK_STATUS_SUBMITTED:
            self.task_events_mgr.process_message(
                itask, CRITICAL, "%s at %s" % (
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    ctx.timestamp),
                self.poll_task_jobs)
            cylc.flags.iflag = True
        elif itask.state.status == TASK_STATUS_RUNNING:
            self.task_events_mgr.process_message(
                itask, CRITICAL, TASK_OUTPUT_FAILED, self.poll_task_jobs)
            cylc.flags.iflag = True
        else:
            log_lvl = WARNING
            log_msg = (
                'ignoring job kill result, unexpected task state: %s' %
                itask.state.status)
        itask.summary['latest_message'] = log_msg
        LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
            itask.identity, itask.submit_num, log_msg))

    @staticmethod
    def _manip_task_jobs_callback(
            ctx, suite, itasks, summary_callback, more_callbacks=None):
        """Callback when submit/poll/kill tasks command exits."""
        if ctx.ret_code:
            LOG.error(ctx)
        else:
            LOG.debug(ctx)
        tasks = {}
        # Note for "kill": It is possible for a job to trigger its trap and
        # report back to the suite before this logic is called. If so, the
        # task will no longer be TASK_STATUS_SUBMITTED or
        # TASK_STATUS_RUNNING, and its output line will be ignored here.
        for itask in itasks:
            if itask.point is not None and itask.submit_num:
                submit_num = "%02d" % (itask.submit_num)
                tasks[(str(itask.point), itask.tdef.name, submit_num)] = (
                    itask)
        handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
        if more_callbacks:
            for prefix, callback in more_callbacks.items():
                handlers.append((prefix, callback))
        out = ctx.out
        if not out:
            out = ""
            # Something is very wrong here
            # Fallback to use "job_log_dirs" list to report the problem
            job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
            for job_log_dir in job_log_dirs:
                point, name, submit_num = job_log_dir.split(os.sep, 2)
                itask = tasks[(point, name, submit_num)]
                out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                        "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
        for line in out.splitlines(True):
            for prefix, callback in handlers:
                if line.startswith(prefix):
                    line = line[len(prefix):].strip()
                    try:
                        path = line.split("|", 2)[1]  # timestamp, path, status
                        point, name, submit_num = path.split(os.sep, 2)
                        itask = tasks[(point, name, submit_num)]
                        callback(suite, itask, ctx, line)
                    except (KeyError, ValueError):
                        if cylc.flags.debug:
                            LOG.warning('Unhandled %s output: %s' % (
                                ctx.cmd_key, line))
                            LOG.warning(traceback.format_exc())

    def _poll_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when poll tasks command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._poll_task_job_callback,
            {BatchSysManager.OUT_PREFIX_MESSAGE:
             self._poll_task_job_message_callback})

    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        items = line.split("|")
        # See cylc.batch_sys_manager.JobPollContext
        try:
            (
                batch_sys_exit_polled, run_status, run_signal,
                time_submit_exit, time_run, time_run_exit
            ) = items[4:10]
        except (IndexError, ValueError):
            itask.summary['latest_message'] = 'poll failed'
            cylc.flags.iflag = True
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        finally:
            self.task_events_mgr.log_task_job_activity(
                ctx, suite, itask.point, itask.tdef.name)
        if run_status == "1" and run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
                time_run_exit)
        elif run_status == "1" and batch_sys_exit_polled == "1":
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
                time_run_exit)
            self.task_events_mgr.process_message(
                itask, INFO, TaskMessage.FAIL_MESSAGE_PREFIX + run_signal,
                self.poll_task_jobs, time_run_exit)
        elif run_status == "1":
            # The job has terminated, but is still managed by batch system.
            # Some batch systems may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
                time_run)
        elif run_status == "0":
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, self.poll_task_jobs,
                time_run_exit)
        elif time_run and batch_sys_exit_polled == "1":
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs, "")
        elif time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
                time_run)
        elif batch_sys_exit_polled == "1":
            # The job never ran, and is no longer in the batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                self.poll_task_jobs, time_submit_exit)
        else:
            # The job never ran, and is in the batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, self.poll_task_jobs,
                time_submit_exit)

    def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on message of one task job."""
        ctx = SuiteProcContext(self.JOBS_POLL, None)
        ctx.out = line
        try:
            event_time, priority, message = line.split("|")[2:5]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = 0
            self.task_events_mgr.process_message(
                itask, priority, message, self.poll_task_jobs, event_time)
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)

    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if cylc.flags.debug:
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SuiteProcContext(cmd_key, cmd), callback, [suite, itasks])

    @staticmethod
    def _set_retry_timers(itask, rtconfig=None):
        """Set try number and retry delays."""
        if rtconfig is None:
            rtconfig = itask.tdef.rtconfig
        try:
            no_retry = (
                rtconfig[itask.tdef.run_mode + ' mode']['disable retries'])
        except KeyError:
            no_retry = False
        if not no_retry:
            for key, cfg_key in [
                    (TASK_STATUS_SUBMIT_RETRYING,
                     'submission retry delays'),
                    (TASK_STATUS_RETRYING, 'execution retry delays')]:
                delays = rtconfig['job'][cfg_key]
                try:
                    itask.try_timers[key].set_delays(delays)
                except KeyError:
                    itask.try_timers[key] = TaskActionTimer(delays=delays)

    def _simulation_submit_task_jobs(self, itasks):
        """Simulation mode task jobs submission."""
        for itask in itasks:
            self._set_retry_timers(itask)
            itask.task_host = 'SIMULATION'
            itask.task_owner = 'SIMULATION'
            itask.summary['batch_sys_name'] = 'SIMULATION'
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = (
                itask.tdef.rtconfig['job']['simulated run length'])
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, self.poll_task_jobs)

    def _submit_task_jobs_callback(self, ctx, suite, itasks):
        """Callback when submit task jobs command exits."""
        self._manip_task_jobs_callback(
            ctx,
            suite,
            itasks,
            self._submit_task_job_callback,
            {BatchSysManager.OUT_PREFIX_COMMAND: self._job_cmd_out_callback})

    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SuiteProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SuiteProcPool.JOB_SKIPPED_FLAG:
            return

        try:
            itask.summary['submit_method_id'] = items[3]
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, '%s at %s' % (
                    TASK_OUTPUT_SUBMITTED, ctx.timestamp),
                self.poll_task_jobs)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, '%s at %s' % (
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    ctx.timestamp),
                self.poll_task_jobs)

    def _prep_submit_task_job(self, suite, itask, dry_run):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask
        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask)
            local_job_file_path = self.task_events_mgr.get_task_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num,
                self.JOB_FILE_BASE)
            self.job_file_writer.write(local_job_file_path, job_conf)
        except Exception as exc:
            # Could be a bad command template.
            LOG.error(traceback.format_exc())
            self.task_events_mgr.log_task_job_activity(
                SuiteProcContext(
                    self.JOBS_SUBMIT,
                    '(prepare job file)', err=exc, ret_code=1),
                suite, itask.point, itask.tdef.name)
            if not dry_run:
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    self.poll_task_jobs)
            return
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            # This will be shown next to submit num in gcylc:
            itask.summary['latest_message'] = (
                'job file written (edit/dry-run)')
            LOG.debug("[%s] -%s" % (
                itask.identity, itask.summary['latest_message']))

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
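
    # A minimal caller sketch of the dry-run path above; the "manager" name
    # and loose calling context are assumptions for illustration. A
    # "cylc jobscript"-style use writes the job file without submitting,
    # then reads its location back from the task proxy.
    #
    #     prepared = manager.prep_submit_task_jobs(
    #         suite, [itask], dry_run=True)
    #     if prepared:
    #         print(prepared[0].local_job_file_path)  # job file for editing
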
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job."""
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides)
        else:
            rtconfig = itask.tdef.rtconfig

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        })

        itask.summary['batch_sys_name'] = rtconfig['job']['batch system']
        for name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = get_task_host(rtconfig['remote']['host'])
        if not itask.task_host:
            itask.task_host = 'localhost'
        elif itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        else:
            user_at_host = itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[rtconfig['job']['batch system']]
        except (TypeError, KeyError):
            batch_sys_conf = {}
        try:
            itask.summary[self.KEY_EXECUTE_TIME_LIMIT] = float(
                rtconfig['job']['execution time limit'])
        except TypeError:
            pass

        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Default = 1, 2 and 7 minute intervals, roughly 1, 3 and 10
            # minutes after the time limit is exceeded
            itask.poll_timers[self.KEY_EXECUTE_TIME_LIMIT] = (
                TaskActionTimer(delays=batch_sys_conf.get(
                    'execution time limit polling intervals',
                    [60, 120, 420])))
        for label, key in [
                ('submission polling intervals', TASK_STATUS_SUBMITTED),
                ('execution polling intervals', TASK_STATUS_RUNNING)]:
            if key in itask.poll_timers:
                itask.poll_timers[key].reset()
            else:
                values = self.task_events_mgr.get_host_conf(
                    itask, label, skey='job')
                if values:
                    itask.poll_timers[key] = TaskActionTimer(delays=values)

        self.init_host(suite, itask.task_host, itask.task_owner)
        if itask.state.outputs.has_custom_triggers():
            self.suite_db_mgr.put_update_task_outputs(itask)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        })
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_file_path = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                suite, "suite job log directory",
                itask.task_host, itask.task_owner),
            job_d, self.JOB_FILE_BASE)
        return {
            'batch_system_name': rtconfig['job']['batch system'],
            'batch_submit_command_template': (
                rtconfig['job']['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': (
                itask.summary[self.KEY_EXECUTE_TIME_LIMIT]),
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': (
                rtconfig['remote']['suite definition directory']),
            'script': scripts[1],
            'shell': rtconfig['job']['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
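
# Illustrative flow between the two methods above (the key values shown are
# assumptions for the example): the impl helper returns a plain dict, which
# JobFileWriter renders into the job script at the local job log path.
#
#     job_conf = self._prep_submit_task_job_impl(suite, itask)
#     # e.g. job_conf['batch_system_name'] == 'background'
#     #      job_conf['job_file_path'] ends with '.../POINT/NAME/NN/job'
#     self.job_file_writer.write(local_job_file_path, job_conf)
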
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    """Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see
       below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is first consumed from the beginning, then from the previous
    committed offset. If the message is not found by the end of the topic,
    commit the offset and return (we will try again later). If found, return
    the result.

    Kafka commits offsets per "consumer group" so the group_id argument must
    be unique per distinct trigger in the suite - this allows each trigger
    to separately consume the topic from the beginning, looking for its own
    messages (otherwise, with shared offsets, one trigger could move the
    offset beyond the messages of another trigger). This goes for successive
    instances of an external-triggered cycling task too, because
    out-of-order triggering could be required sometimes. So this argument
    should typically be, e.g.:

        group_id=x%(id)s  # id: the ID of the dependent task

    where "x" is an arbitrary string you can use to change the group name
    if you need to re-run the suite, and the messages, from the start again,
    without re-running the producer suite. Note this also serves to make the
    function signature cycle-point-specific for Cylc even if the message
    does not contain the cycle point (although it probably should).

    The "message" argument is a stringified dict, e.g.:

        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}

    should be represented as:

        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs if all message dict items match, and the result returned
    is the sub-dict of the actual values of items containing
    angle-bracket-delimited regex patterns, e.g. for the above:
    {'data': 'nwp-2025.nc'}.

    """
    consumer = KafkaConsumer(
        kafka_topic, bootstrap_servers=[kafka_server],
        value_deserializer=json.loads,
        consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
        auto_offset_reset='earliest',
        group_id=group_id)

    # Construct a dict from the message argument "key1:val1 key2:val2 ...".
    cylc_msg = dict(m.split(':') for m in message.split())

    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        m = _match_msg(cylc_msg, kafka_msg)
        if m:
            result = (True, m)
            break
        # (else consume and compare next message)
    consumer.commit()
    # Unsubscribe before exit, otherwise the next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()

    if debug:
        if result[0]:
            res = "\n MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug(
            'Kafka: "%s" (consumed %d) ... %s' % (message, n_cons, res))
    return result
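
# Hedged sketch of the matching semantics described in the docstring above.
# "_match_msg" is defined elsewhere; this only mimics its assumed behaviour:
# plain items must compare equal, angle-bracketed items are regex patterns,
# and the matched values of pattern items form the returned sub-dict.
#
#     import re
#
#     def _match_msg_sketch(cylc_msg, kafka_msg_value):
#         result = {}
#         for key, val in cylc_msg.items():
#             actual = str(kafka_msg_value.get(key, ''))
#             if val.startswith('<') and val.endswith('>'):
#                 if not re.match(val[1:-1] + '$', actual):
#                     return None
#                 result[key] = actual
#             elif actual != val:
#                 return None
#         return result
#
#     # _match_msg_sketch({'data': '<nwp.*\.nc>'}, {'data': 'nwp-2025.nc'})
#     # -> {'data': 'nwp-2025.nc'}
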
def process_message(self, itask, severity, message, event_time=None,
                    flag='', submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an incoming
    message (whether normal or polled) e.g. due to a late poll result, or a
    network outage, or manual state reset. To handle this, if a message
    would take the task state backward, issue a poll to confirm instead of
    changing state - then always believe the next message. Note that the
    next message might not be the result of this confirmation poll, in the
    unlikely event that a job emits a succession of messages very quickly,
    but this is the best we can do without somehow uniquely associating
    each poll with its result message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by the
            logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG and
                itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)
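
# Sketch of the "would this message move the task backward?" guard used in
# process_message above. The status ordering here is an assumption for
# illustration only; the real ordering lives with the task state
# definitions.
#
#     _ORDER = ['ready', 'submitted', 'running', 'succeeded']
#
#     def is_gt_sketch(current, other):
#         # True if "current" is later in the lifecycle than "other"
#         return _ORDER.index(current) > _ORDER.index(other)
#
#     # An incoming "started" message while the task is already "succeeded":
#     # is_gt_sketch('succeeded', 'running') -> True, so return True and
#     # poll to confirm instead of regressing the state.
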
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE:
            If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None:
            If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            # Reset to allow retry.
            del self.remote_init_map[(host, owner)]
        return status

    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]

    # Create "stdin_file_paths" file, with "items" in it.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        open(uuid_fname, 'wb').write(self.uuid_str)
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(self.uuid_str)
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback, [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
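
# Hypothetical caller sketch for remote_init above (constant names are from
# the method; the manager name and loop context are assumptions): callers
# poll the tri-state result until the background command completes.
#
#     status = task_remote_mgr.remote_init(host, owner)
#     if status is None:
#         pass  # "cylc remote-init" still running; check again next loop
#     elif status == REMOTE_INIT_FAILED:
#         pass  # map entry was reset, so the next call will retry
#     elif status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
#         pass  # safe to submit task jobs to this host
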
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers."""
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue

        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
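
# Illustrative expansion of a custom handler template as performed above;
# the handler string and data values are hypothetical, and real values are
# shell-quoted as in the method:
#
#     handler = 'notify-oncall %(event)s %(id)s %(message)s'
#     handler_data = {
#         'event': 'failed', 'id': 'foo.20250101T0000Z',
#         'message': "'job failed'",
#     }
#     cmd = handler % handler_data
#     # -> "notify-oncall failed foo.20250101T0000Z 'job failed'"
#
# If nothing substitutes (cmd == handler), the classic 4-argument interface
# is used instead: <handler> '<event>' '<suite>' '<task-id>' '<message>'.
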
def process_events(self, schd_ctx):
    """Process task events that were created by "setup_event_handlers".

    schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
    """
    ctx_groups = {}
    now = time()
    for id_key, timer in self.event_timers.copy().items():
        key1, point, name, submit_num = id_key
        if timer.is_waiting:
            continue
        # Set timer if timeout is None.
        if not timer.is_timeout_set():
            if timer.next() is None:
                LOG.warning("%s/%s/%02d %s failed" % (
                    point, name, submit_num, key1))
                del self.event_timers[id_key]
                continue
            # Report retries and delayed 1st try
            tmpl = None
            if timer.num > 1:
                tmpl = "%s/%s/%02d %s failed, retrying in %s"
            elif timer.delay:
                tmpl = "%s/%s/%02d %s will run after %s"
            if tmpl:
                LOG.debug(tmpl % (
                    point, name, submit_num, key1,
                    timer.delay_timeout_as_str()))
        # Ready to run?
        if not timer.is_delay_done() or (
            # Avoid flooding the user's mailbox with mail notifications.
            # Group together as many notifications as possible within a
            # given interval.
            timer.ctx.ctx_type == self.HANDLER_MAIL and
            not schd_ctx.stop_mode and
            self.next_mail_time is not None and
            self.next_mail_time > now
        ):
            continue

        timer.set_waiting()
        if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
            # Run custom event handlers on their own
            self.proc_pool.put_command(
                SuiteProcContext(
                    (key1, submit_num),
                    timer.ctx.cmd, env=os.environ, shell=True,
                ),
                self._custom_handler_callback, [schd_ctx, id_key])
        else:
            # Group together built-in event handlers, where possible
            if timer.ctx not in ctx_groups:
                ctx_groups[timer.ctx] = []
            ctx_groups[timer.ctx].append(id_key)

    next_mail_time = now + self.mail_interval
    for ctx, id_keys in ctx_groups.items():
        if ctx.ctx_type == self.HANDLER_MAIL:
            # Set next_mail_time if any mail sent
            self.next_mail_time = next_mail_time
            self._process_event_email(schd_ctx, ctx, id_keys)
        elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
            self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
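
# Sketch of the mail-grouping throttle in process_events above; attribute
# names are from the method, the interval value is hypothetical:
#
#     self.mail_interval = 300.0
#     # The first mail handler to run at time T sets:
#     #     self.next_mail_time = T + self.mail_interval
#     # Until then, further mail handlers fail the "ready to run?" test and
#     # stay queued, so one email can report many events at once.
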