Exemplo n.º 1
0
    def _setup_job_logs_retrieval(self, itask, event):
        """Queue the retrieval of remote job logs for a completed job.

        A timer is registered in self.event_timers only if all of these hold:

        * event is one of the failed, (execution) retry or succeeded events;
        * the job's [owner@]host is neither "localhost" nor the current user
          at localhost;
        * "retrieve job logs" is switched on in the host configuration;
        * no timer is registered yet under the same key (handler and event,
          cycle point, task name, submit number).

        The timer uses the "retrieve job logs retry delays" host setting, or
        a single zero delay if that setting is empty.
        """
        timer_key = (
            (self.HANDLER_JOB_LOGS_RETRIEVE, event),
            str(itask.point),
            itask.tdef.name,
            itask.submit_num,
        )
        user_at_host = itask.task_host
        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        completion_events = (
            self.EVENT_FAILED, self.EVENT_RETRY, self.EVENT_SUCCEEDED)
        # Guard clauses, checked in order: each one short-circuits the rest.
        if event not in completion_events:
            return
        if user_at_host in (get_user() + '@localhost', 'localhost'):
            # Job host is the local host
            return
        if not self.get_host_conf(itask, "retrieve job logs"):
            return
        if timer_key in self.event_timers:
            return
        retry_delays = self.get_host_conf(
            itask, "retrieve job logs retry delays") or [0]
        retrieve_ctx = TaskJobLogsRetrieveContext(
            self.HANDLER_JOB_LOGS_RETRIEVE,  # key
            self.HANDLER_JOB_LOGS_RETRIEVE,  # ctx_type
            user_at_host,
            self.get_host_conf(itask, "retrieve job logs max size"),
        )
        self.event_timers[timer_key] = TaskActionTimer(
            retrieve_ctx, retry_delays)
Exemplo n.º 2
0
 def _setup_event_mail(self, itask, event):
     """Queue an email notification for a task event.

     Nothing is queued if a timer already exists under the same key
     (handler and event, cycle point, task name, submit number) or if the
     event is not listed in the "mail events" setting. For an event in
     self.NON_UNIQUE_EVENTS, the event part of the key carries a count
     taken from itask.non_unique_events (1 if the event is not in there).

     The timer uses the "mail retry delays" setting, or a single zero delay
     if that setting is empty.
     """
     event_label = event
     if event in self.NON_UNIQUE_EVENTS:
         event_label = '%s-%d' % (
             event, itask.non_unique_events.get(event, 1))
     id_key = (
         (self.HANDLER_MAIL, event_label),
         str(itask.point),
         itask.tdef.name,
         itask.submit_num,
     )
     if id_key in self.event_timers:
         return
     if event not in self._get_events_conf(itask, "mail events", []):
         return
     retry_delays = self._get_events_conf(itask, "mail retry delays") or [0]
     mail_from = self._get_events_conf(
         itask, "mail from", "notifications@" + get_host())
     mail_to = self._get_events_conf(itask, "mail to", get_user())
     mail_smtp = self._get_events_conf(itask, "mail smtp")
     mail_ctx = TaskEventMailContext(
         self.HANDLER_MAIL,  # key
         self.HANDLER_MAIL,  # ctx_type
         mail_from,
         mail_to,
         mail_smtp,
     )
     self.event_timers[id_key] = TaskActionTimer(mail_ctx, retry_delays)
Exemplo n.º 3
0
 def _set_retry_timers(itask, rtconfig=None):
     """Set the submission and execution retry delays of a task.

     rtconfig defaults to itask.tdef.rtconfig. Nothing is done if
     "disable retries" is set in the "<run mode> mode" section of rtconfig;
     a missing section or setting counts as not disabled.

     For each of the submit-retrying and retrying statuses, an existing
     timer in itask.try_timers gets the configured delays; otherwise a new
     TaskActionTimer is created with those delays.
     """
     if rtconfig is None:
         rtconfig = itask.tdef.rtconfig
     try:
         mode_conf = rtconfig[itask.tdef.run_mode + ' mode']
         retries_disabled = mode_conf['disable retries']
     except KeyError:
         retries_disabled = False
     if retries_disabled:
         return
     status_to_setting = (
         (TASK_STATUS_SUBMIT_RETRYING, 'submission retry delays'),
         (TASK_STATUS_RETRYING, 'execution retry delays'),
     )
     for status, setting in status_to_setting:
         retry_delays = rtconfig['job'][setting]
         try:
             itask.try_timers[status].set_delays(retry_delays)
         except KeyError:
             # No timer yet for this status
             itask.try_timers[status] = TaskActionTimer(delays=retry_delays)
Exemplo n.º 4
0
 def _reset_job_timers(self, itask):
     """Set up poll timer and timeout for task.

     If the task status is not active, clear itask.timeout and
     itask.poll_timer and return. If the poll timer already exists for the
     current (submit number, status), return without change. Otherwise
     work out the timeout and the polling intervals, create a new poll
     timer, log the settings and call self.check_poll_time(itask).

     Running task: the reference time is the started time, the timeout is
     the "execution timeout" events setting and the intervals are the
     "execution polling intervals" job setting (default one interval of
     900 seconds). If an execution time limit is set in the task summary:

     * the timeout becomes the time limit plus the sum of the batch
       system's "execution time limit polling intervals"
       (default 60, 120, 420);
     * polling intervals that would run past the time limit are dropped,
       then the last remaining interval is repeated to fill the gap;
     * the time limit intervals are appended, the first one extended by
       whatever gap is left, so the first of them ends at the time limit
       plus its configured value.

     Any other active status: the reference time is the submitted time,
     the timeout is the "submission timeout" events setting and the
     intervals are the "submission polling intervals" job setting
     (default one interval of 900 seconds).
     """
     if itask.state.status not in TASK_STATUSES_ACTIVE:
         # Reset, task not active
         itask.timeout = None
         itask.poll_timer = None
         return
     ctx = (itask.submit_num, itask.state.status)
     if itask.poll_timer and itask.poll_timer.ctx == ctx:
         return
     # Set poll timer
     # Set timeout
     timeref = None  # reference time, submitted or started time
     timeout = None  # timeout in setting
     if itask.state.status == TASK_STATUS_RUNNING:
         timeref = itask.summary['started_time']
         timeout_key = 'execution timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = self.get_host_conf(
             itask,
             'execution polling intervals',
             skey='job',
             default=[900])  # Default 15 minute intervals
         if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
             time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
             try:
                 host_conf = self.get_host_conf(itask, 'batch systems')
                 batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
             except (TypeError, KeyError):
                 batch_sys_conf = {}
             # Work on copies: both lists are edited in place below and may
             # be the very objects held by the configuration. Editing them
             # directly would change the settings seen by later calls.
             time_limit_delays = list(batch_sys_conf.get(
                 'execution time limit polling intervals', [60, 120, 420]))
             delays = list(delays)
             timeout = time_limit + sum(time_limit_delays)
             # Remove excessive polling before time limit
             while sum(delays) > time_limit:
                 del delays[-1]
             # But fill up the gap before time limit
             # (a zero interval cannot fill a gap, and would divide by zero)
             if delays and delays[-1]:
                 size = int((time_limit - sum(delays)) / delays[-1])
                 delays.extend([delays[-1]] * size)
             # Stretch the first time limit interval over any remaining gap
             time_limit_delays[0] += time_limit - sum(delays)
             delays += time_limit_delays
     else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
         timeref = itask.summary['submitted_time']
         timeout_key = 'submission timeout'
         timeout = self._get_events_conf(itask, timeout_key)
         delays = self.get_host_conf(
             itask,
             'submission polling intervals',
             skey='job',
             default=[900])  # Default 15 minute intervals
     try:
         itask.timeout = timeref + float(timeout)
         timeout_str = intvl_as_str(timeout)
     except (TypeError, ValueError):
         # No usable timeout setting or reference time
         itask.timeout = None
         timeout_str = None
     itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
     # Log timeout and polling schedule
     message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
     # Attempt to group identical consecutive delays as N*DELAY,...
     if itask.poll_timer.delays:
         items = []  # [(number of item - 1, item), ...]
         for delay in itask.poll_timer.delays:
             if items and items[-1][1] == delay:
                 items[-1][0] += 1
             else:
                 items.append([0, delay])
         parts = []
         for num, item in items:
             prefix = '%d*' % (num + 1) if num else ''
             parts.append(prefix + '%s,' % intvl_as_str(item))
         message += ', polling intervals=' + ''.join(parts) + '...'
     LOG.info(message, itask=itask)
     # Set next poll time
     self.check_poll_time(itask)
Exemplo n.º 5
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers.

        The handlers come from the "<event> handler" events setting. If that
        is not set and the event is listed in "handler events", the general
        "handlers" setting is used instead. Nothing is done if no handlers
        are found.

        Each handler is a command template filled in by %-formatting with a
        mapping of shell-quoted event, suite, task and job values, plus the
        suite configuration items and the task "meta" items. If the
        substitution leaves the handler unchanged, the classic interface is
        assumed and the command is the handler followed by the event, suite,
        task identity and message as 4 single-quoted arguments.

        One timer per handler is added to self.event_timers, unless one is
        already registered under the same key. A handler whose template
        cannot be filled in is logged as an error and skipped.
        """
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and event in self._get_events_conf(
                itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask, 'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            key1 = ("%s-%02d" % (self.HANDLER_CUSTOM, i), event)
            id_key = (key1, str(itask.point), itask.tdef.name,
                      itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name":
                    quote(str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id":
                    quote(str(itask.summary['submit_method_id'])),
                    "submit_time":
                    quote(str(itask.summary['submitted_time_string'])),
                    "start_time":
                    quote(str(itask.summary['started_time_string'])),
                    "finish_time":
                    quote(str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host),
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        # "URL" is available both as "task_url" and as "URL"
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except (KeyError, ValueError) as exc:
                # KeyError: the template names a key that is not available.
                # ValueError: malformed % specification, e.g. a stray "%".
                # Do not rebind "message" here: it is the event message that
                # the remaining handlers in this loop still need.
                err_msg = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(err_msg)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (handler, event, self.suite,
                                                  itask.identity, message)
            LOG.debug("Queueing %s handler: %s" % (event, cmd), itask=itask)
            self.event_timers[id_key] = (TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ), retry_delays))
    def _prep_submit_task_job_impl(self, suite, itask, rtconfig):
        """Helper for self._prep_submit_task_job.

        In this order: increment the submit number; record the job's
        [owner@]host, batch system name, extra log files and execution time
        limit in the task summary; create or reset the poll timers; get the
        job scripts; set the retry timers; create the job log path.

        Return a dict of the settings for the job, taken from rtconfig, the
        task and the values worked out above.
        """
        # Submit number
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num

        # Job owner and host
        itask.task_owner = rtconfig['remote']['owner']
        owner_at_host = itask.task_host
        if itask.task_owner:
            owner_at_host = itask.task_owner + "@" + itask.task_host
        itask.summary['host'] = owner_at_host
        itask.summary['job_hosts'][itask.submit_num] = owner_at_host

        # Batch system and extra log files
        job_conf = rtconfig['job']
        batch_sys_name = job_conf['batch system']
        itask.summary['batch_sys_name'] = batch_sys_name
        for log_name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(log_name))
        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[batch_sys_name]
        except (TypeError, KeyError):
            batch_sys_conf = {}

        # Execution time limit
        time_limit_key = self.KEY_EXECUTE_TIME_LIMIT
        try:
            itask.summary[time_limit_key] = float(
                job_conf['execution time limit'])
        except TypeError:
            # Setting cannot be converted by float(), e.g. None: the summary
            # entry is left as it was
            pass
        if itask.summary[time_limit_key]:
            # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
            # minutes after time limit exceeded
            time_limit_delays = batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])
            itask.poll_timers[time_limit_key] = TaskActionTimer(
                delays=time_limit_delays)

        # Submission and execution poll timers
        status_settings = (
            (TASK_STATUS_SUBMITTED, 'submission polling intervals'),
            (TASK_STATUS_RUNNING, 'execution polling intervals'),
        )
        for status, setting in status_settings:
            if status in itask.poll_timers:
                itask.poll_timers[status].reset()
                continue
            intervals = self.task_events_mgr.get_host_conf(
                itask, setting, skey='job')
            if intervals:
                itask.poll_timers[status] = TaskActionTimer(delays=intervals)

        scripts = self._get_job_scripts(itask, rtconfig)

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_log_root = GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner)
        job_file_path = os.path.join(job_log_root, job_d, self.JOB_FILE_BASE)

        return {
            'batch_system_name': batch_sys_name,
            'batch_submit_command_template': (
                job_conf['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[time_limit_key],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_env_tmpl': rtconfig['parameter environment templates'],
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': job_conf['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }
Exemplo n.º 7
0
    def _prep_submit_task_job_impl(self, suite, itask):
        """Helper for self._prep_submit_task_job.

        Use the task's runtime configuration, with any broadcast settings
        for the task applied on top of a copy of it. Then, in this order:
        set the retry timers; increment the submit number and queue the
        insert of a task job entry via self.suite_db_mgr; work out the job's
        host and owner; record the batch system, extra log files and
        execution time limit in the task summary; create or reset the poll
        timers; call self.init_host for the job's host and owner; queue the
        updates via self.suite_db_mgr; get the job scripts and create the
        job log path.

        Return a dict of the settings for the job.
        """
        broadcast = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        rtconfig = itask.tdef.rtconfig
        if broadcast:
            # Never modify the task definition itself
            rtconfig = pdeepcopy(rtconfig)
            poverride(rtconfig, broadcast)

        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)

        # Submit number and try number
        LOG.debug("[%s] -incrementing submit number" % (itask.identity,))
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        itask.local_job_file_path = None
        new_job_entry = {
            "is_manual_submit": itask.is_manual_submit,
            "try_num": itask.get_try_num(),
            "time_submit": get_current_time_string(),
        }
        self.suite_db_mgr.put_insert_task_jobs(itask, new_job_entry)

        # Batch system and extra log files
        job_conf = rtconfig['job']
        batch_sys_name = job_conf['batch system']
        itask.summary['batch_sys_name'] = batch_sys_name
        for log_name in rtconfig['extra log files']:
            itask.summary['logfiles'].append(expandvars(log_name))

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.

        # host may be None (= run task on suite host)
        itask.task_host = (
            get_task_host(rtconfig['remote']['host']) or 'localhost')
        if itask.task_host != "localhost":
            LOG.info("[%s] -Task host: %s" % (
                itask.identity, itask.task_host))

        itask.task_owner = rtconfig['remote']['owner']

        user_at_host = itask.task_host
        if itask.task_owner:
            user_at_host = itask.task_owner + "@" + itask.task_host
        itask.summary['host'] = user_at_host
        itask.summary['job_hosts'][itask.submit_num] = user_at_host

        try:
            batch_sys_conf = self.task_events_mgr.get_host_conf(
                itask, 'batch systems')[batch_sys_name]
        except (TypeError, KeyError):
            batch_sys_conf = {}

        # Execution time limit
        time_limit_key = self.KEY_EXECUTE_TIME_LIMIT
        try:
            itask.summary[time_limit_key] = float(
                job_conf['execution time limit'])
        except TypeError:
            # Setting cannot be converted by float(), e.g. None: the summary
            # entry is left as it was
            pass
        if itask.summary[time_limit_key]:
            # Default = 1, 2 and 7 minutes intervals, roughly 1, 3 and 10
            # minutes after time limit exceeded
            time_limit_delays = batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])
            itask.poll_timers[time_limit_key] = TaskActionTimer(
                delays=time_limit_delays)

        # Submission and execution poll timers
        status_settings = (
            (TASK_STATUS_SUBMITTED, 'submission polling intervals'),
            (TASK_STATUS_RUNNING, 'execution polling intervals'),
        )
        for status, setting in status_settings:
            if status in itask.poll_timers:
                itask.poll_timers[status].reset()
                continue
            intervals = self.task_events_mgr.get_host_conf(
                itask, setting, skey='job')
            if intervals:
                itask.poll_timers[status] = TaskActionTimer(delays=intervals)

        self.init_host(suite, itask.task_host, itask.task_owner)
        if itask.state.outputs.has_custom_triggers():
            self.suite_db_mgr.put_update_task_outputs(itask)
        job_entry_update = {
            "user_at_host": user_at_host,
            "batch_sys_name": itask.summary['batch_sys_name'],
        }
        self.suite_db_mgr.put_update_task_jobs(itask, job_entry_update)
        itask.is_manual_submit = False

        scripts = self._get_job_scripts(itask, rtconfig)

        # Location of job file, etc
        self._create_job_log_path(suite, itask)
        job_d = self.task_events_mgr.get_task_job_id(
            itask.point, itask.tdef.name, itask.submit_num)
        job_log_root = GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory",
            itask.task_host, itask.task_owner)
        job_file_path = os.path.join(job_log_root, job_d, self.JOB_FILE_BASE)

        return {
            'batch_system_name': batch_sys_name,
            'batch_submit_command_template': (
                job_conf['batch submit command template']),
            'batch_system_conf': batch_sys_conf,
            'directives': rtconfig['directives'],
            'environment': rtconfig['environment'],
            'execution_time_limit': itask.summary[time_limit_key],
            'env-script': rtconfig['env-script'],
            'err-script': rtconfig['err-script'],
            'host': itask.task_host,
            'init-script': rtconfig['init-script'],
            'job_file_path': job_file_path,
            'job_d': job_d,
            'namespace_hierarchy': itask.tdef.namespace_hierarchy,
            'owner': itask.task_owner,
            'param_var': itask.tdef.param_var,
            'post-script': scripts[2],
            'pre-script': scripts[0],
            'remote_suite_d': rtconfig['remote']['suite definition directory'],
            'script': scripts[1],
            'shell': job_conf['shell'],
            'submit_num': itask.submit_num,
            'suite_name': suite,
            'task_id': itask.identity,
            'try_num': itask.get_try_num(),
            'work_d': rtconfig['work sub-directory'],
        }