def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    For each (key, point, name, submit_num) in the command's "id_keys",
    check that the expected job log files now exist locally; record the
    attempt in the task job activity log, and either clear the event
    timer (success) or reset it to waiting so retrieval is retried.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # Non-succeeded jobs are also expected to have "job.err".
                # (Fixed: was "key1[1] not in 'succeeded'", a substring
                # containment test on a string; inequality is intended.)
                if key1[1] != 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SuiteProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Allow the timer to fire again (retry retrieval).
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError:
            # Timer already removed elsewhere; ignore.
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job.

    Parse one "cylc jobs-poll" output line and translate the polled job
    status into task event messages.
    """
    ctx = SuiteProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0
    items = line.split("|")
    # See cylc.batch_sys_manager.JobPollContext
    try:
        (
            batch_sys_exit_polled, run_status, run_signal,
            time_submit_exit, time_run, time_run_exit
        ) = items[4:10]
    except (IndexError, ValueError):
        # Fixed: unpacking a short slice raises ValueError, not
        # IndexError (slicing never raises IndexError), so ValueError
        # must be caught for a bad line to register as a poll failure.
        itask.summary['latest_message'] = 'poll failed'
        cylc.flags.iflag = True
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    finally:
        # Log the poll attempt whether or not the line parsed.
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    flag = self.task_events_mgr.POLLED_FLAG
    if run_status == "1" and run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, time_run_exit, flag)
    elif run_status == "1" and batch_sys_exit_polled == "1":
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + run_signal,
            time_run_exit, flag)
    elif run_status == "1":
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, time_run, flag)
    elif run_status == "0":
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, time_run_exit, flag)
    elif time_run and batch_sys_exit_polled == "1":
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, time_run, flag)
    elif batch_sys_exit_polled == "1":
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            time_submit_exit, flag)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, time_submit_exit, flag)
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    Verify that the expected job log files for each id_key now exist
    locally, record the attempt, and clear or re-arm the event timer.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # Non-succeeded jobs are also expected to have "job.err".
                # (Fixed: was "key1[1] not in 'succeeded'", a substring
                # containment test on a string; inequality is intended.)
                if key1[1] != 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(
                    get_task_job_log(schd_ctx.suite, point, name,
                                     submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SuiteProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Allow the timer to fire again (retry retrieval).
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(log_ctx, schd_ctx.suite, point, name,
                                  submit_num)
        except KeyError:
            # Timer already removed elsewhere; ignore.
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on message of one task job.

    Parse one message line of "cylc jobs-poll" output and forward it to
    the task events manager; always log the activity.
    """
    ctx = SuiteProcContext(self.JOBS_POLL, None)
    ctx.out = line
    fields = line.split("|")[2:5]
    if len(fields) == 3:
        event_time, severity, message = fields
        ctx.ret_code = 0
        self.task_events_mgr.process_message(
            itask, severity, message, self.poll_task_jobs, event_time)
    else:
        # Malformed line: flag failure and keep the original command
        # so it appears in the activity log.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _submit_task_jobs_callback, on one task job.

    Parse one "cylc jobs-submit" output line, log the activity, and
    report the submission outcome to the task events manager.
    """
    ctx = SuiteProcContext(self.JOBS_SUBMIT, None)
    ctx.out = line
    items = line.split("|")
    if len(items) >= 3:
        ctx.timestamp, _, ctx.ret_code = items[0:3]
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    if ctx.ret_code == SuiteProcPool.RET_CODE_SUITE_STOPPING:
        # Suite is stopping; the job was not submitted.
        return
    submit_method_id = items[3] if len(items) > 3 else None
    if submit_method_id == "None":
        submit_method_id = None
    itask.summary['submit_method_id'] = submit_method_id
    if submit_method_id and ctx.ret_code == 0:
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
    else:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.
    """
    if not itasks:
        return
    # Group tasks by their (host, owner) pair.
    groups = {}
    for itask in itasks:
        groups.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)
    for (host, owner), group in sorted(groups.items()):
        cmd = ["cylc", cmd_key]
        if cylc.flags.debug:
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, "suite job log directory", host, owner))
        # One job log directory argument per task, in identity order.
        cmd.extend(
            self.task_events_mgr.get_task_job_id(
                task.point, task.tdef.name, task.submit_num)
            for task in sorted(group, key=lambda task: task.identity))
        self.proc_pool.put_command(
            SuiteProcContext(cmd_key, cmd), callback, [suite, group])
def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits.

    On success, clear the event timer for each notified id_key and
    record the activity; on failure, re-arm the timer for retry.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if proc_ctx.ret_code != 0:
                # Mail command failed; allow the timer to retry.
                self.event_timers[id_key].unset_waiting()
                continue
            del self.event_timers[id_key]
            log_ctx = SuiteProcContext((key1, submit_num), None)
            log_ctx.ret_code = 0
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError:
            # Timer already removed elsewhere; ignore.
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.
    """
    if itask.local_job_file_path and not dry_run:
        # Job file already prepared; nothing to do.
        return itask
    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except Exception as exc:
        # Fixed: "except Exception, exc:" is Python-2-only syntax;
        # "as" works on Python 2.6+ and 3.x.
        # Could be a bad command template.
        LOG.error(traceback.format_exc())
        self.task_events_mgr.log_task_job_activity(
            SuiteProcContext(
                self.JOBS_SUBMIT, '(prepare job file)', err=exc,
                ret_code=1),
            suite, itask.point, itask.tdef.name)
        if not dry_run:
            self.task_events_mgr.process_message(
                itask, CRITICAL,
                self.task_events_mgr.EVENT_SUBMIT_FAILED,
                self.poll_task_jobs)
        return
    # NOTE(review): on success this falls through and returns None,
    # despite the docstring - confirm whether a trailing "return itask"
    # was dropped from this version.
def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host.

    Build an rsync command pulling the relevant job log directories from
    the remote suite job log directory, and hand it to the process pool.
    """
    user_at_host = ctx.user_at_host
    if user_at_host and "@" in user_at_host:
        s_user, s_host = user_at_host.split("@", 1)
    else:
        s_user, s_host = None, user_at_host
    ssh_str = str(
        glbl_cfg().get_host_item("ssh command", s_host, s_user))
    rsync_str = str(glbl_cfg().get_host_item(
        "retrieve job logs command", s_host, s_user))
    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if cylc.flags.debug:
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Includes and excludes: every directory level must be included for
    # rsync to descend into the job directories.
    includes = set()
    for _, point, name, submit_num in id_keys:
        includes.update([
            "/%s" % (point),
            "/%s/%s" % (point, name),
            "/%s/%s/%02d" % (point, name, submit_num),
            "/%s/%s/%02d/**" % (point, name, submit_num),
        ])
    cmd.extend(
        "--include=%s" % (include) for include in sorted(includes))
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append(user_at_host + ":" + glbl_cfg().get_derived_host_item(
        schd_ctx.suite, "suite job log directory", s_host, s_user) + "/")
    # Local target
    cmd.append(glbl_cfg().get_derived_host_item(
        schd_ctx.suite, "suite job log directory") + "/")
    self.proc_pool.put_command(
        SuiteProcContext(
            ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SuiteProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name,
        submit_num=itask.submit_num)
    if dry_run:
        return
    # Persist the failed submission attempt.
    self.suite_db_mgr.put_insert_task_jobs(itask, {
        'is_manual_submit': itask.is_manual_submit,
        'try_num': itask.get_try_num(),
        'time_submit': get_current_time_string(),
        'batch_sys_name': itask.summary.get('batch_sys_name'),
    })
    itask.is_manual_submit = False
    self.task_events_mgr.process_message(
        itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits.

    Clear timers and record activity for notified id_keys on success;
    on failure, re-arm each timer so notification is retried.
    """
    succeeded = (proc_ctx.ret_code == 0)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if succeeded:
                del self.event_timers[id_key]
                log_ctx = SuiteProcContext((key1, submit_num), None)
                log_ctx.ret_code = 0
                log_task_job_activity(
                    log_ctx, schd_ctx.suite, point, name, submit_num)
            else:
                self.event_timers[id_key].unset_waiting()
        except KeyError:
            # Timer already removed elsewhere; ignore.
            if cylc.flags.debug:
                ERR.debug(traceback.format_exc())
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Prepare job files for the given tasks, group the prepared tasks by
    their (task_host, task_owner) pair, and put one "cylc jobs-submit"
    command per group to the process pool.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return
    # Submit task jobs
    auth_itasks = {}
    for itask in prepared_tasks:
        # The job file is now (about to be) used: reset the file write flag
        # so that subsequent manual retrigger will generate a new job file.
        itask.local_job_file_path = None
        itask.state.reset_state(TASK_STATUS_READY)
        # Group by (host, owner) so one command serves each remote.
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for auth, itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append("--debug")
        host, owner = auth
        remote_mode = False
        kwargs = {}
        # Add --host/--user options only where the target is remote.
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append("--")
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # In remote mode the job files are fed to the remote
                # command via STDIN.
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(self.task_events_mgr.get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(
                self.JOBS_SUBMIT, cmd,
                stdin_file_paths=stdin_file_paths,
                job_log_dirs=job_log_dirs, **kwargs),
            self._submit_task_jobs_callback, [suite, itasks])
def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _submit_task_jobs_callback, on one task job.

    Parse one "cylc jobs-submit" output line, log the activity, and
    report the submission outcome to the task events manager.
    """
    ctx = SuiteProcContext(self.JOBS_SUBMIT, None)
    ctx.out = line
    items = line.split("|")
    if len(items) >= 3:
        ctx.timestamp, _, ctx.ret_code = items[0:3]
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)
    if ctx.ret_code == SuiteProcPool.JOB_SKIPPED_FLAG:
        # Job was skipped; nothing further to report.
        return
    submit_method_id = items[3] if len(items) > 3 else None
    if submit_method_id == "None":
        submit_method_id = None
    itask.summary['submit_method_id'] = submit_method_id
    if submit_method_id and ctx.ret_code == 0:
        self.task_events_mgr.process_message(
            itask, INFO,
            '%s at %s' % (TASK_OUTPUT_SUBMITTED, ctx.timestamp))
    else:
        self.task_events_mgr.process_message(
            itask, CRITICAL,
            '%s at %s' % (
                self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp))
def remote_host_select(self, host_str):
    """Evaluate a task host string.

    Arguments:
        host_str (str):
            An explicit host name, a command in back-tick or $(command)
            format, or an environment variable holding a hostname.

    Return (str):
        None if evaluation of host_str is still taking place.
        'localhost' if host_str is not defined or if the evaluated host
        name is equivalent to 'localhost'.
        Otherwise, return the evaluated host name on success.

    Raise TaskRemoteMgmtError on error.
    """
    if not host_str:
        return 'localhost'
    # Host selection command: $(command) or `command`
    match = REC_COMMAND.match(host_str)
    if match:
        cmd_str = match.groups()[1]
        if cmd_str in self.remote_host_str_map:
            # Command recently launched
            value = self.remote_host_str_map[cmd_str]
            if isinstance(value, TaskRemoteMgmtError):
                raise value  # command failed
            if value is None:
                return  # command not yet ready
            host_str = value  # command succeeded
        else:
            # Command not launched (or already reset)
            timeout = glbl_cfg().get(
                ['task host select command timeout'])
            if timeout:
                cmd = ['timeout', str(int(timeout)), 'bash', '-c', cmd_str]
            else:
                cmd = ['bash', '-c', cmd_str]
            self.proc_pool.put_command(
                SuiteProcContext(
                    'remote-host-select', cmd, env=dict(os.environ)),
                self._remote_host_select_callback, [cmd_str])
            self.remote_host_str_map[cmd_str] = None
            return self.remote_host_str_map[cmd_str]
    # Environment variable substitution
    host_str = os.path.expandvars(host_str)
    # Remote?
    if is_remote_host(host_str):
        return host_str
    return 'localhost'
def _process_event_email(self, schd_ctx, ctx, id_keys):
    """Process event notification, by email.

    Compose one mail message covering all the given id_keys and put the
    "mail" command to the process pool.
    """
    if len(id_keys) == 1:
        # 1 event from 1 task
        (_, event), point, name, submit_num = id_keys[0]
        subject = "[%s/%s/%02d %s] %s" % (
            point, name, submit_num, event, schd_ctx.suite)
    else:
        event_set = set(id_key[0][1] for id_key in id_keys)
        if len(event_set) == 1:
            # 1 event from n tasks
            subject = "[%d tasks %s] %s" % (
                len(id_keys), event_set.pop(), schd_ctx.suite)
        else:
            # n events from n tasks
            subject = "[%d task events] %s" % (
                len(id_keys), schd_ctx.suite)
    # From: and To:
    cmd = ["mail", "-s", subject, "-r", ctx.mail_from, ctx.mail_to]
    # STDIN for mail, tasks
    lines = []
    for id_key in sorted(id_keys):
        (_, event), point, name, submit_num = id_key
        lines.append("%s: %s/%s/%02d\n" % (event, point, name, submit_num))
    # STDIN for mail, event info + suite detail
    lines.append("\n")
    for label, value in [
            ('suite', schd_ctx.suite),
            ("host", schd_ctx.host),
            ("port", schd_ctx.port),
            ("owner", schd_ctx.owner)]:
        if value:
            lines.append("%s: %s\n" % (label, value))
    stdin_str = "".join(lines)
    if self.mail_footer:
        stdin_str += (self.mail_footer + "\n") % {
            "host": schd_ctx.host,
            "port": schd_ctx.port,
            "owner": schd_ctx.owner,
            "suite": schd_ctx.suite}
    # SMTP server
    env = dict(os.environ)
    if ctx.mail_smtp:
        env["smtp"] = ctx.mail_smtp
    self.proc_pool.put_command(
        SuiteProcContext(
            ctx, cmd, env=env, stdin_str=stdin_str, id_keys=id_keys),
        self._event_email_callback, [schd_ctx])
def _run_event_custom_handlers(self, config, ctx): """Helper for "run_event_handlers", custom event handlers.""" # Look for event handlers # 1. Handlers for specific event # 2. General handlers handlers = self.get_events_conf(config, '%s handler' % ctx.event) if not handlers and (ctx.event in self.get_events_conf( config, 'handler events', [])): handlers = self.get_events_conf(config, 'handlers') if not handlers: return for i, handler in enumerate(handlers): cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event) # Handler command may be a string for substitution abort_on_error = self.get_events_conf( config, 'abort if %s handler fails' % ctx.event) try: handler_data = { 'event': quote(ctx.event), 'suite': quote(ctx.suite), 'message': quote(ctx.reason), } if config.cfg['meta']: for key, value in config.cfg['meta'].items(): if key == "URL": handler_data["suite_url"] = quote(value) handler_data[key] = quote(value) cmd = handler % (handler_data) except KeyError as exc: message = "%s bad template: %s" % (cmd_key, exc) LOG.error(message) if abort_on_error: raise SuiteEventError(message) continue if cmd == handler: # Nothing substituted, assume classic interface cmd = "%s '%s' '%s' '%s'" % (handler, ctx.event, ctx.suite, ctx.reason) proc_ctx = SuiteProcContext(cmd_key, cmd, env=dict(os.environ), shell=True) if abort_on_error or self.proc_pool.is_closed(): # Run command in foreground if abort on failure is set or if # process pool is closed self.proc_pool.run_command(proc_ctx) self._run_event_handlers_callback( proc_ctx, abort_on_error=abort_on_error) else: # Run command using process pool otherwise self.proc_pool.put_command(proc_ctx, self._run_event_handlers_callback)
def _run_event_mail(self, config, ctx):
    """Helper for "run_event_handlers", do mail notification.

    Compose and send a mail message for the suite event, either in the
    foreground (pool closed) or via the process pool.
    """
    if ctx.event not in self.get_events_conf(config, 'mail events', []):
        # Event not configured for mail notification.
        return
    # SMTP server
    env = dict(os.environ)
    mail_smtp = self.get_events_conf(config, 'mail smtp')
    if mail_smtp:
        env['smtp'] = mail_smtp
    subject = '[suite %(event)s] %(suite)s' % {
        'suite': ctx.suite, 'event': ctx.event}
    stdin_str = ''
    for name, value in [
            ('suite event', ctx.event),
            ('reason', ctx.reason),
            ('suite', ctx.suite),
            ('host', ctx.host),
            ('port', ctx.port),
            ('owner', ctx.owner)]:
        if value:
            stdin_str += '%s: %s\n' % (name, value)
    mail_footer_tmpl = self.get_events_conf(config, 'mail footer')
    if mail_footer_tmpl:
        stdin_str += (mail_footer_tmpl + '\n') % {
            'host': ctx.host,
            'port': ctx.port,
            'owner': ctx.owner,
            'suite': ctx.suite}
    proc_ctx = SuiteProcContext(
        (self.SUITE_EVENT_HANDLER, ctx.event),
        [
            'mail', '-s', subject,
            '-r', self.get_events_conf(
                config, 'mail from', 'notifications@' + get_host()),
            self.get_events_conf(config, 'mail to', get_user()),
        ],
        env=env, stdin_str=stdin_str)
    if self.proc_pool.is_closed():
        # Run command in foreground if process pool is closed
        self.proc_pool.run_command(proc_ctx)
        self._run_event_handlers_callback(proc_ctx)
    else:
        # Run command using process pool otherwise
        self.proc_pool.put_command(
            proc_ctx, self._run_event_mail_callback)
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Parse one "cylc jobs-kill" output line, log the activity, and
    update the task according to the kill result and its current state.
    """
    ctx = SuiteProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        # Malformed output line.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Killed before the job started running: submit-failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, "%s at %s" % (
                self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp),
            self.poll_task_jobs)
        cylc.flags.iflag = True
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Killed while running: failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED, self.poll_task_jobs)
        cylc.flags.iflag = True
    else:
        log_lvl = WARNING
        log_msg = (
            'ignoring job kill result, unexpected task state: %s'
            % itask.state.status)
    itask.summary['latest_message'] = log_msg
    LOG.log(
        log_lvl,
        "[%s] -job(%02d) %s" % (
            itask.identity, itask.submit_num, log_msg))
def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on message of one task job.

    Parse one message line of "cylc jobs-poll" output and forward it to
    the task events manager; always log the activity.
    """
    ctx = SuiteProcContext(self.JOBS_POLL, None)
    ctx.out = line
    fields = line.split("|")[2:5]
    if len(fields) == 3:
        event_time, priority, message = fields
        ctx.ret_code = 0
        self.task_events_mgr.process_message(
            itask, priority, message, self.poll_task_jobs, event_time)
    else:
        # Malformed line: flag failure and keep the original command
        # so it appears in the activity log.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Parse one "cylc jobs-kill" output line, log the activity, and
    update the task according to the kill result and its current state.
    """
    ctx = SuiteProcContext(self.JOBS_KILL, None)
    ctx.out = line
    parts = line.split("|", 2)
    if len(parts) == 3:
        ctx.timestamp, _, ctx.ret_code = parts
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Killed before the job started running: submit-failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, "%s at %s" % (
                self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp))
        cylc.flags.iflag = True
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Killed while running: failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
        cylc.flags.iflag = True
    else:
        log_lvl = WARNING
        log_msg = (
            'ignoring job kill result, unexpected task state: %s'
            % itask.state.status)
    itask.summary['latest_message'] = log_msg
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to
    identify shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED: If remote init is not required, e.g.
            not remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED: If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete
    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status
    # Determine what items to install
    items = self._remote_init_items(host, owner)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]
    # Create "stdin_file_paths" file, with "items" in it.
    # The temporary file handle is kept open and passed to the callback
    # so it stays alive until the remote-init command has consumed it.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    if not os.path.exists(uuid_fname):
        # Fixed: close the file handle explicitly (the original
        # "open(...).write(...)" left closure to the garbage collector).
        with open(uuid_fname, 'wb') as handle:
            handle.write(str(self.uuid))
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    cmd.append(str(self.uuid))
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SuiteProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback, [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def process_events(self, schd_ctx):
    """Process task events that were created by "setup_event_handlers".

    schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
    """
    ctx_groups = {}
    now = time()
    # Iterate over a copy: timers may be deleted during the loop.
    for id_key, timer in self.event_timers.copy().items():
        key1, point, name, submit_num = id_key
        if timer.is_waiting:
            # A command for this timer is already in flight.
            continue
        # Set timer if timeout is None.
        if not timer.is_timeout_set():
            if timer.next() is None:
                # No retries left: give up on this event.
                LOG.warning("%s/%s/%02d %s failed" % (
                    point, name, submit_num, key1))
                del self.event_timers[id_key]
                continue
            # Report retries and delayed 1st try
            tmpl = None
            if timer.num > 1:
                tmpl = "%s/%s/%02d %s failed, retrying in %s"
            elif timer.delay:
                tmpl = "%s/%s/%02d %s will run after %s"
            if tmpl:
                LOG.debug(tmpl % (
                    point, name, submit_num, key1,
                    timer.delay_timeout_as_str()))
        # Ready to run?
        if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within a
                # given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now):
            continue
        timer.set_waiting()
        if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
            # Run custom event handlers on their own
            self.proc_pool.put_command(
                SuiteProcContext(
                    (key1, submit_num), timer.ctx.cmd,
                    env=os.environ, shell=True,
                ),
                self._custom_handler_callback, [schd_ctx, id_key])
        else:
            # Group together built-in event handlers, where possible
            if timer.ctx not in ctx_groups:
                ctx_groups[timer.ctx] = []
            ctx_groups[timer.ctx].append(id_key)
    next_mail_time = now + self.mail_interval
    for ctx, id_keys in ctx_groups.items():
        if ctx.ctx_type == self.HANDLER_MAIL:
            # Set next_mail_time if any mail sent
            self.next_mail_time = next_mail_time
            self._process_event_email(schd_ctx, ctx, id_keys)
        elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
            self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job.

    Parse one "cylc jobs-poll" output line and translate the polled job
    status into task event messages.
    """
    ctx = SuiteProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0
    items = line.split("|")
    # See cylc.batch_sys_manager.JobPollContext
    try:
        (batch_sys_exit_polled, run_status, run_signal,
         time_submit_exit, time_run, time_run_exit) = items[4:10]
    except (IndexError, ValueError):
        # Fixed: unpacking a short slice raises ValueError, not
        # IndexError (slicing never raises IndexError), so ValueError
        # must be caught for a bad line to register as a poll failure.
        itask.summary['latest_message'] = 'poll failed'
        cylc.flags.iflag = True
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    finally:
        # Log the poll attempt whether or not the line parsed.
        self.task_events_mgr.log_task_job_activity(
            ctx, suite, itask.point, itask.tdef.name)
    if run_status == "1" and run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
            time_run_exit)
    elif run_status == "1" and batch_sys_exit_polled == "1":
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs,
            time_run_exit)
        self.task_events_mgr.process_message(
            itask, INFO, TaskMessage.FAIL_MESSAGE_PREFIX + run_signal,
            self.poll_task_jobs, time_run_exit)
    elif run_status == "1":
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
            time_run)
    elif run_status == "0":
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, self.poll_task_jobs,
            time_run_exit)
    elif time_run and batch_sys_exit_polled == "1":
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, self.poll_task_jobs, "")
    elif time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, self.poll_task_jobs,
            time_run)
    elif batch_sys_exit_polled == "1":
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            self.poll_task_jobs, time_submit_exit)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, self.poll_task_jobs,
            time_submit_exit)
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    if not prepared_tasks:
        return bad_tasks
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.summary['latest_message'] = self.REMOTE_INIT_MSG
            continue
        # Persist
        if owner:
            owner_at_host = owner + '@' + host
        else:
            owner_at_host = host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                'submit-num=%d, owner@host=%s' % (
                    itask.submit_num, owner_at_host),
                itask=itask)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                self.task_events_mgr.log_task_job_activity(
                    SuiteProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED, ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED,
                    self.poll_task_jobs)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if cylc.flags.debug:
            cmd.append('--debug')
        remote_mode = False
        kwargs = {}
        # Add --host/--user options only where the target is remote.
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(GLOBAL_CFG.get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        stdin_file_paths = []
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            if remote_mode:
                # In remote mode the job files are fed to the remote
                # command via STDIN.
                stdin_file_paths.append(
                    self.task_events_mgr.get_task_job_log(
                        suite, itask.point, itask.tdef.name,
                        itask.submit_num, self.JOB_FILE_BASE))
            job_log_dirs.append(
                self.task_events_mgr.get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            # The job file is now (about to be) used: reset the file write
            # flag so that subsequent manual retrigger will generate a new
            # job file.
            itask.local_job_file_path = None
            itask.state.reset_state(TASK_STATUS_READY)
            if itask.state.outputs.has_custom_triggers():
                self.suite_db_mgr.put_update_task_outputs(itask)
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SuiteProcContext(
                self.JOBS_SUBMIT, cmd,
                stdin_file_paths=stdin_file_paths,
                job_log_dirs=job_log_dirs, **kwargs),
            self._submit_task_jobs_callback, [suite, itasks])
    return done_tasks