def _setup_event_mail(self, itask, event):
    """Register an e-mail notification timer for a task event.

    Nothing is registered when a timer with the same identity is already
    present in ``self.event_timers``, or when ``event`` is not listed in
    the "mail events" setting obtained via ``self._get_events_conf``.

    Args:
        itask: Task the event belongs to.
        event: Name of the event to notify about.
    """
    # For events listed in NON_UNIQUE_EVENTS the label is suffixed with the
    # count held in itask.non_unique_events (1 when the event is absent).
    if event in self.NON_UNIQUE_EVENTS:
        event_label = '%s-%d' % (
            event, itask.non_unique_events.get(event, 1))
    else:
        event_label = event
    timer_id = (
        (self.HANDLER_MAIL, event_label),
        str(itask.point),
        itask.tdef.name,
        itask.submit_num,
    )

    if timer_id in self.event_timers:
        return
    if event not in self._get_events_conf(itask, "mail events", []):
        return

    # Use the single delay value 0 when no retry delays are configured.
    delays = self._get_events_conf(itask, "mail retry delays") or [0]

    sender = self._get_events_conf(
        itask, "mail from", "notifications@" + get_host())
    recipient = self._get_events_conf(itask, "mail to", get_user())
    smtp_server = self._get_events_conf(itask, "mail smtp")

    mail_ctx = TaskEventMailContext(
        self.HANDLER_MAIL,  # key
        self.HANDLER_MAIL,  # ctx_type
        sender,  # mail_from
        recipient,  # mail_to
        smtp_server,  # mail_smtp
    )
    self.event_timers[timer_id] = TaskActionTimer(mail_ctx, delays)
def _setup_job_logs_retrieval(self, itask, event):
    """Register a timer that retrieves the logs of a remote job.

    Only acts on the failed, retry and succeeded events. Nothing is
    registered when the job ran as ``localhost`` (with or without the
    current user prefix), when the "retrieve job logs" host setting is
    falsy, or when a timer with the same identity already exists.

    Args:
        itask: Task whose job logs are to be retrieved.
        event: Name of the event that triggered the call.
    """
    timer_id = (
        (self.HANDLER_JOB_LOGS_RETRIEVE, event),
        str(itask.point),
        itask.tdef.name,
        itask.submit_num,
    )
    # Prefix the host with the task owner only when an owner is set.
    user_at_host = (
        itask.task_owner + "@" + itask.task_host
        if itask.task_owner
        else itask.task_host
    )
    completion_events = (
        self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)

    if event not in completion_events:
        return
    if user_at_host in (get_user() + '@localhost', 'localhost'):
        return
    if not self.get_host_conf(itask, "retrieve job logs"):
        return
    if timer_id in self.event_timers:
        return

    # Use the single delay value 0 when no retry delays are configured.
    delays = (
        self.get_host_conf(itask, "retrieve job logs retry delays") or [0])

    retrieve_ctx = TaskJobLogsRetrieveContext(
        self.HANDLER_JOB_LOGS_RETRIEVE,  # key
        self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
        user_at_host,
        self.get_host_conf(itask, "retrieve job logs max size"),
    )
    self.event_timers[timer_id] = TaskActionTimer(retrieve_ctx, delays)
def _set_retry_timers(itask, rtconfig=None, retry=True):
    """Configure the submission and execution retry timers of a task.

    Retries are skipped when ``retry`` is falsy, or when the
    "<run mode> mode" section of the configuration contains a
    "disable retries" entry (only the presence of the key is checked).

    Args:
        itask: Task whose ``try_timers`` mapping is updated.
        rtconfig: Runtime configuration; ``itask.tdef.rtconfig`` is used
            when this is None.
        retry: Whether retry timers should be set at all.
    """
    if rtconfig is None:
        rtconfig = itask.tdef.rtconfig

    mode_section = itask.tdef.run_mode + ' mode'
    if (mode_section in rtconfig
            and 'disable retries' in rtconfig[mode_section]):
        retry = False
    if not retry:
        return

    job_conf = rtconfig['job']
    # Task-level submission delays win; otherwise use the platform's.
    # TODO: same for execution delays?
    submit_delays = (
        job_conf['submission retry delays']
        or itask.platform['submission retry delays'])

    timer_delays = (
        (TimerFlags.SUBMISSION_RETRY, submit_delays),
        (TimerFlags.EXECUTION_RETRY, job_conf['execution retry delays']),
    )
    for timer_key, delays in timer_delays:
        delays = [] if delays is None else delays
        try:
            # Update the existing timer in place ...
            itask.try_timers[timer_key].set_delays(delays)
        except KeyError:
            # ... or create it on first use.
            itask.try_timers[timer_key] = TaskActionTimer(delays=delays)
def _setup_job_logs_retrieval(self, itask, event):
    """Register a timer that retrieves the logs of a remote job.

    Only acts on the failed, retry and succeeded events. Nothing is
    registered when the platform host is not remote according to
    ``is_remote_host``, when the "retrieve job logs" host setting is
    falsy, or when a timer with the same identity already exists.

    Args:
        itask: Task whose job logs are to be retrieved.
        event: Name of the event that triggered the call.
    """
    timer_id = (
        (self.HANDLER_JOB_LOGS_RETRIEVE, event),
        str(itask.point),
        itask.tdef.name,
        itask.submit_num,
    )
    completion_events = (
        self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)
    host = get_host_from_platform(itask.platform)

    if event not in completion_events:
        return
    if not is_remote_host(host):
        return
    if not self.get_host_conf(itask, "retrieve job logs"):
        return
    if timer_id in self.event_timers:
        return

    # Use the single delay value 0 when no retry delays are configured.
    delays = (
        self.get_host_conf(itask, "retrieve job logs retry delays") or [0])

    retrieve_ctx = TaskJobLogsRetrieveContext(
        self.HANDLER_JOB_LOGS_RETRIEVE,  # key
        self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
        itask.platform['name'],
        self.get_host_conf(itask, "retrieve job logs max size"),
    )
    self.event_timers[timer_id] = TaskActionTimer(retrieve_ctx, delays)
def _set_retry_timers(itask, rtconfig=None):
    """Configure the submission and execution retry timers of a task.

    No timer is touched when the "disable retries" value in the
    "<run mode> mode" section of the configuration is truthy; a missing
    section or key counts as "retries enabled".

    Args:
        itask: Task whose ``try_timers`` mapping is updated.
        rtconfig: Runtime configuration; ``itask.tdef.rtconfig`` is used
            when this is None.
    """
    if rtconfig is None:
        rtconfig = itask.tdef.rtconfig

    try:
        retries_disabled = (
            rtconfig[itask.tdef.run_mode + ' mode']['disable retries'])
    except KeyError:
        retries_disabled = False
    if retries_disabled:
        return

    timer_settings = (
        (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
        (TASK_STATUS_RETRYING, 'execution retry delays'),
    )
    for timer_key, setting_name in timer_settings:
        delays = rtconfig['job'][setting_name]
        delays = [] if delays is None else delays
        try:
            # Update the existing timer in place ...
            itask.try_timers[timer_key].set_delays(delays)
        except KeyError:
            # ... or create it on first use.
            itask.try_timers[timer_key] = TaskActionTimer(delays=delays)
def _reset_job_timers(self, itask):
    """Set up poll timer and timeout for task.

    For a task that is not in one of TASK_STATUSES_ACTIVE, both
    ``itask.timeout`` and ``itask.poll_timer`` are cleared. Nothing is
    changed when the existing poll timer already carries the context
    ``(submit_num, status)`` of the task.

    Otherwise the timeout and the polling delays are taken from the
    execution settings (task is running) or from the submission settings
    (any other active status), a new TaskActionTimer is stored in
    ``itask.poll_timer``, the resulting schedule is logged and
    ``self.check_poll_time`` is called.

    Args:
        itask: Task whose ``timeout`` and ``poll_timer`` are (re)set.
    """
    if not itask.state(*TASK_STATUSES_ACTIVE):
        # Reset, task not active
        itask.timeout = None
        itask.poll_timer = None
        return
    ctx = (itask.submit_num, itask.state.status)
    if itask.poll_timer and itask.poll_timer.ctx == ctx:
        # Timer already set up for this submit number and status.
        return
    # Set poll timer
    # Set timeout
    timeref = None  # reference time, submitted or started time
    timeout = None  # timeout in setting
    if itask.state(TASK_STATUS_RUNNING):
        timeref = itask.summary['started_time']
        timeout_key = 'execution timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(
            self.get_host_conf(
                itask, 'execution polling intervals', skey='job',
                default=[900]))  # Default 15 minute intervals
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
            try:
                host_conf = self.get_host_conf(itask, 'batch systems')
                batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
            except (TypeError, KeyError):
                batch_sys_conf = {}
            # Work on a copy: the first entry is adjusted below, and the
            # list handed back by the configuration may be shared. Editing
            # it in place would make the adjustment persist (and
            # accumulate) across calls.
            time_limit_delays = list(batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420]))
            timeout = time_limit + sum(time_limit_delays)
            # Remove excessive polling before time limit
            while sum(delays) > time_limit:
                del delays[-1]
            # But fill up the gap before time limit
            if delays:
                size = int((time_limit - sum(delays)) / delays[-1])
                delays.extend([delays[-1]] * size)
            # Push the first post-limit poll out by whatever gap remains
            # between the regular polls and the time limit.
            time_limit_delays[0] += time_limit - sum(delays)
            delays += time_limit_delays
    else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
        timeref = itask.summary['submitted_time']
        timeout_key = 'submission timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(
            self.get_host_conf(
                itask, 'submission polling intervals', skey='job',
                default=[900]))  # Default 15 minute intervals
    try:
        itask.timeout = timeref + float(timeout)
        timeout_str = intvl_as_str(timeout)
    except (TypeError, ValueError):
        # No usable reference time or timeout value: no timeout is set.
        itask.timeout = None
        timeout_str = None
    itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
    # Log timeout and polling schedule
    message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
    # Attempt to group identical consecutive delays as N*DELAY,...
    if itask.poll_timer.delays:
        items = []  # [(number of item - 1, item), ...]
        for delay in itask.poll_timer.delays:
            if items and items[-1][1] == delay:
                items[-1][0] += 1
            else:
                items.append([0, delay])
        message += ', polling intervals='
        for num, item in items:
            if num:
                message += '%d*' % (num + 1)
            message += '%s,' % intvl_as_str(item)
        message += '...'
    LOG.info('[%s] -%s', itask, message)
    # Set next poll time
    self.check_poll_time(itask)
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers.

    The handlers come from the "<event> handler" setting or, when that is
    None and ``event`` is listed in "handler events", from the general
    "handlers" setting. One timer per handler is stored in
    ``self.event_timers``; handlers that already have a timer with the
    same identity are skipped.

    Args:
        itask: Task the event belongs to.
        event: Name of the event.
        message: Event message passed on to every handler.
    """
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                'suite_uuid': quote(str(self.uuid_str)),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }

            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)

            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)

            cmd = handler % (handler_data)
        except KeyError as exc:
            # Use a separate name for the error text: rebinding ``message``
            # here would hand this text, instead of the real event message,
            # to every handler processed after the faulty one.
            error_text = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(error_text)
            continue

        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers.

    The handlers come from the "<event> handler" setting or, when that is
    None and ``event`` is listed in "handler events", from the general
    "handlers" setting. One timer per handler is stored in
    ``self.event_timers``; handlers that already have a timer with the
    same identity are skipped.

    Args:
        itask: Task the event belongs to.
        event: Name of the event.
        message: Event message passed on to every handler.
    """
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(itask, 'handler retry delays')
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: the platform may not always be recorded for a submit
        # number, e.g. on late event or if host select command fails.
        # Use null string to prevent issues in this case.
        platform_n = itask.summary['platforms_used'].get(
            itask.submit_num, '')
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                EventData.BatchSysJobID.value:
                    quote(str(itask.summary['submit_method_id'])),
                EventData.BatchSysName.value:
                    quote(str(itask.summary['batch_sys_name'])),
                EventData.CyclePoint.value:
                    quote(str(itask.point)),
                EventData.Event.value:
                    quote(event),
                EventData.FinishTime.value:
                    quote(str(itask.summary['finished_time_string'])),
                EventData.ID.value:
                    quote(itask.identity),
                EventData.Message.value:
                    quote(message),
                EventData.TaskName.value:
                    quote(itask.tdef.name),
                EventData.PlatformName.value:
                    quote(platform_n),
                EventData.StartTime.value:
                    quote(str(itask.summary['started_time_string'])),
                EventData.SubmitNum.value:
                    itask.submit_num,
                EventData.SubmitTime.value:
                    quote(str(itask.summary['submitted_time_string'])),
                EventData.Suite.value:
                    quote(self.suite),
                EventData.SuiteUUID.value:
                    quote(str(self.uuid_str)),
                EventData.TryNum.value:
                    itask.get_try_num(),
                # task and suite metadata
                **get_event_handler_data(
                    itask.tdef.rtconfig, self.suite_cfg)
            }
            cmd = handler % (handler_data)
        except KeyError as exc:
            # Use a separate name for the error text: rebinding ``message``
            # here would hand this text, instead of the real event message,
            # to every handler processed after the faulty one.
            error_text = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(error_text)
            continue

        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))