def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _submit_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_SUBMIT, None)
    ctx.out = line
    items = line.split("|")
    try:
        ctx.timestamp, _, ctx.ret_code = items[0:3]
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
        return

    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    try:
        itask.summary['submit_method_id'] = items[3]
        self.job_pool.set_job_attr(job_d, 'batch_sys_job_id', items[3])
    except IndexError:
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] == "None":
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] and ctx.ret_code == 0:
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
    else:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)

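The callback above consumes a pipe-delimited status line. The following standalone sketch (not part of cylc) shows the parsing it implies; the field layout "timestamp|job-log-dir|ret-code[|job-id]" is inferred from the slicing above, and the helper name is hypothetical.

# Illustrative sketch only: field names are inferred from the callback above.
def parse_submit_status_line(line):
    """Split 'timestamp|job-log-dir|ret-code[|job-id]' into a dict."""
    items = line.split("|")
    try:
        timestamp, job_log_dir, ret_code = items[0:3]
    except ValueError:
        # Malformed line: mirror the callback and treat it as a failure.
        return {"ret_code": 1, "bad_line": line}
    info = {
        "timestamp": timestamp,
        "job_log_dir": job_log_dir,
        "ret_code": int(ret_code),
    }
    try:
        info["job_id"] = items[3]
    except IndexError:
        info["job_id"] = None
    if info["job_id"] == "None":
        info["job_id"] = None
    return info


# e.g. parse_submit_status_line("2038-01-19T03:14:07Z|1/foo/01|0|12345")
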
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state.status == TASK_STATUS_RUNNING:
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        log_lvl = DEBUG
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.set_summary_message(log_msg)
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))

def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits."""
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if proc_ctx.ret_code == 0:
                del self.event_timers[id_key]
                log_ctx = SubProcContext((key1, submit_num), None)
                log_ctx.ret_code = 0
                log_task_job_activity(
                    log_ctx, schd_ctx.suite, point, name, submit_num)
            else:
                self.event_timers[id_key].unset_waiting()
        except KeyError as exc:
            LOG.exception(exc)

def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on message of one task job."""
    ctx = SubProcContext(self.JOBS_POLL, None)
    ctx.out = line
    try:
        event_time, severity, message = line.split("|")[2:5]
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = 0
        self.task_events_mgr.process_message(
            itask, severity, message, event_time,
            self.task_events_mgr.FLAG_POLLED)
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

def test_run_command_with_stdin_from_unicode(self):
    """Test STDIN from string with Unicode"""
    ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n')
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, '喵\n')
    self.assertEqual(ctx.ret_code, 0)

def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handlers' % ctx.event)
    if not handlers and (
            ctx.event in
            self.get_events_conf(config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.WORKFLOW_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        # BACK COMPAT: suite, suite_uuid are deprecated
        # url:
        #     https://github.com/cylc/cylc-flow/pull/4174
        # from:
        #     Cylc 8
        # remove at:
        #     Cylc 9
        try:
            handler_data = {
                'event': quote(ctx.event),
                'message': quote(ctx.reason),
                'workflow': quote(ctx.workflow),
                'workflow_uuid': quote(ctx.uuid_str),
                'suite': quote(ctx.workflow),  # deprecated
                'suite_uuid': quote(ctx.uuid_str),  # deprecated
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["workflow_url"] = quote(value)
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.workflow, ctx.reason)
        proc_ctx = SubProcContext(
            cmd_key,
            cmd,
            env=dict(os.environ),
            shell=True  # nosec (designed to run user defined code)
        )
        if self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, callback=self._run_event_handlers_callback)

def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host."""
    if ctx.user_at_host and "@" in ctx.user_at_host:
        s_user, s_host = ctx.user_at_host.split("@", 1)
    else:
        s_user, s_host = (None, ctx.user_at_host)
    ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
    rsync_str = str(glbl_cfg().get_host_item(
        "retrieve job logs command", s_host, s_user))

    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if LOG.isEnabledFor(DEBUG):
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Includes and excludes
    includes = set()
    for _, point, name, submit_num in id_keys:
        # Include relevant directories, all levels needed
        includes.add("/%s" % (point))
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    cmd += ["--include=%s" % (include) for include in sorted(includes)]
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append("%s:%s/" % (
        ctx.user_at_host,
        get_remote_suite_run_job_dir(s_host, s_user, schd_ctx.suite)))
    # Local target
    cmd.append(get_suite_run_job_dir(schd_ctx.suite) + "/")
    self.proc_pool.put_command(
        SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])

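For illustration, here is a self-contained sketch of the rsync include/exclude construction above; the id_keys tuples and directory layout mirror the loop in _process_job_logs_retrieval, and the helper name is hypothetical.

# Standalone sketch of the rsync filter construction above (illustration only).
def job_log_rsync_filters(id_keys):
    """Build --include/--exclude args for (key, point, name, submit_num) tuples."""
    includes = set()
    for _, point, name, submit_num in id_keys:
        # Every directory level must be included for rsync to descend into it.
        includes.add("/%s" % point)
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    return (
        ["--include=%s" % include for include in sorted(includes)]
        + ["--exclude=/**"]  # exclude everything else
    )


# e.g. job_log_rsync_filters([(('event', 'retry'), '20200101T00Z', 'foo', 1)])
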
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host.
    Put a job command for each user@host to the multiprocess pool.

    """
    if not itasks:
        return
    auth_itasks = {}
    for itask in itasks:
        if (itask.task_host, itask.task_owner) not in auth_itasks:
            auth_itasks[(itask.task_host, itask.task_owner)] = []
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(host, owner, suite))
        job_log_dirs = []
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])

def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            'is_manual_submit': itask.is_manual_submit,
            'try_num': itask.get_try_num(),
            'time_submit': get_current_time_string(),
            'batch_sys_name': itask.summary.get('batch_sys_name'),
        })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

def test_run_command_returns_0(self):
    """Test basic usage, command returns 0"""
    ctx = SubProcContext('truth', ['true'])
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, '')
    self.assertEqual(ctx.ret_code, 0)

def test_run_command_writes_to_out(self):
    """Test basic usage, command writes to STDOUT"""
    ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr'])
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, 'pirate urrrr\n')
    self.assertEqual(ctx.ret_code, 0)

def test_run_command_returns_1(self):
    """Test basic usage, command returns 1"""
    ctx = SubProcContext('lies', ['false'])
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, '')
    self.assertEqual(ctx.ret_code, 1)

def test_run_command_writes_to_err(self):
    """Test basic usage, command writes to STDERR"""
    ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2'])
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, 'pirate errrr\n')
    self.assertEqual(ctx.out, '')
    self.assertEqual(ctx.ret_code, 0)

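These tests exercise SubProcPool.run_command through the out, err and ret_code attributes of the context. As a rough, hypothetical equivalent (not cylc's implementation), the behaviour being asserted can be sketched with the standard library alone:

import subprocess
from types import SimpleNamespace


def run_command_sketch(cmd, stdin_str=None):
    """Hypothetical stand-in for SubProcPool.run_command: capture out/err/ret_code."""
    proc = subprocess.run(
        cmd,
        input=stdin_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return SimpleNamespace(
        out=proc.stdout, err=proc.stderr, ret_code=proc.returncode)


# e.g. run_command_sketch(['echo', 'pirate', 'urrrr']).out == 'pirate urrrr\n'
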
def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _submit_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_SUBMIT, None)
    ctx.out = line
    items = line.split("|")
    try:
        ctx.timestamp, _, ctx.ret_code = items[0:3]
    except ValueError:
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
        return

    try:
        itask.summary['submit_method_id'] = items[3]
    except IndexError:
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] == "None":
        itask.summary['submit_method_id'] = None
    if itask.summary['submit_method_id'] and ctx.ret_code == 0:
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
    else:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)

def _process_event_email(self, schd_ctx, ctx, id_keys):
    """Process event notification, by email."""
    if len(id_keys) == 1:
        # 1 event from 1 task
        (_, event), point, name, submit_num = id_keys[0]
        subject = "[%s/%s/%02d %s] %s" % (
            point, name, submit_num, event, schd_ctx.suite)
    else:
        event_set = set(id_key[0][1] for id_key in id_keys)
        if len(event_set) == 1:
            # 1 event from n tasks
            subject = "[%d tasks %s] %s" % (
                len(id_keys), event_set.pop(), schd_ctx.suite)
        else:
            # n events from n tasks
            subject = "[%d task events] %s" % (
                len(id_keys), schd_ctx.suite)
    cmd = ["mail", "-s", subject]
    # From: and To:
    cmd.append("-r")
    cmd.append(ctx.mail_from)
    cmd.append(ctx.mail_to)
    # STDIN for mail, tasks
    stdin_str = ""
    for id_key in sorted(id_keys):
        (_, event), point, name, submit_num = id_key
        stdin_str += "%s: %s/%s/%02d\n" % (event, point, name, submit_num)
    # STDIN for mail, event info + suite detail
    stdin_str += "\n"
    for label, value in [
            ('suite', schd_ctx.suite),
            ("host", schd_ctx.host),
            ("port", schd_ctx.port),
            ("owner", schd_ctx.owner)]:
        if value:
            stdin_str += "%s: %s\n" % (label, value)
    if self.mail_footer:
        stdin_str += (self.mail_footer + "\n") % {
            "host": schd_ctx.host,
            "port": schd_ctx.port,
            "owner": schd_ctx.owner,
            "suite": schd_ctx.suite}
    # SMTP server
    env = dict(os.environ)
    mail_smtp = ctx.mail_smtp
    if mail_smtp:
        env["smtp"] = mail_smtp
    self.proc_pool.put_command(
        SubProcContext(
            ctx, cmd, env=env, stdin_str=stdin_str, id_keys=id_keys,
        ),
        self._event_email_callback, [schd_ctx])

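A small standalone sketch of the subject-line logic above, runnable on its own; id_keys are ((handler, event), point, name, submit_num) tuples as in the method, and the function name is hypothetical.

# Illustration of the subject construction in _process_event_email.
def email_subject(id_keys, suite):
    if len(id_keys) == 1:
        # 1 event from 1 task
        (_, event), point, name, submit_num = id_keys[0]
        return "[%s/%s/%02d %s] %s" % (point, name, submit_num, event, suite)
    event_set = {id_key[0][1] for id_key in id_keys}
    if len(event_set) == 1:
        # 1 event from n tasks
        return "[%d tasks %s] %s" % (len(id_keys), event_set.pop(), suite)
    # n events from n tasks
    return "[%d task events] %s" % (len(id_keys), suite)


# e.g. email_subject([(('handler', 'failed'), '1', 'foo', 1)], 'my.suite')
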
def test_run_command_with_stdin_from_path(self):
    """Test STDIN from a single file path"""
    handle = NamedTemporaryFile()
    handle.write('catches mice.\n'.encode('UTF-8'))
    handle.seek(0)
    ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name])
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, 'catches mice.\n')
    self.assertEqual(ctx.ret_code, 0)
    handle.close()

def remote_init(
    self, platform: Dict[str, Any], curve_auth: 'ThreadAuthenticator',
    client_pub_key_dir: str
) -> None:
    """Initialise a remote host if necessary.

    Call "cylc remote-init" to install workflow items to remote:
        ".service/contact": For TCP task communication
        "python/": if source exists

    Args:
        platform: A dict containing settings relating to platform used in
            this remote installation.
        curve_auth: The ZMQ authenticator.
        client_pub_key_dir: Client public key directory, used by the
            ZMQ authenticator.

    """
    install_target = platform['install target']
    if install_target == get_localhost_install_target():
        self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_DONE
        return

    # Set status of install target to in progress while waiting for remote
    # initialisation to finish
    self.remote_init_map[install_target] = REMOTE_INIT_IN_PROGRESS

    # Determine what items to install
    comms_meth: CommsMeth = CommsMeth(platform['communication method'])
    items = self._remote_init_items(comms_meth)

    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)

    # Build the remote-init command to be run over ssh
    cmd = ['remote-init']
    if cylc.flow.flags.verbosity > 1:
        cmd.append('--debug')
    cmd.append(str(install_target))
    cmd.append(get_remote_workflow_run_dir(self.workflow))
    dirs_to_symlink = get_dirs_to_symlink(install_target, self.workflow)
    for key, value in dirs_to_symlink.items():
        if value is not None:
            cmd.append(f"{key}={quote(value)} ")
    # Create the ssh command
    cmd = construct_ssh_cmd(cmd, platform)
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [platform, tmphandle, curve_auth, client_pub_key_dir])

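The tar-over-STDIN idea above can be sketched standalone: pack files into a temporary archive, rewind it, and hand the handle to a child process as its standard input. This is an illustration only; the payload content and the `tar t` command are placeholders standing in for the real service files and the ssh remote-init invocation.

import subprocess
import tarfile
from tempfile import NamedTemporaryFile, TemporaryFile

# Standalone sketch of "pack service files, stream them via the child's STDIN".
with NamedTemporaryFile('w', suffix='.contact') as payload:
    payload.write('CYLC_VERSION=8\n')  # stand-in content, not a real contact file
    payload.flush()
    with TemporaryFile() as tmphandle:
        with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
            tarhandle.add(payload.name, arcname='.service/contact')
        tmphandle.seek(0)  # rewind so the child reads from the start
        # 'tar t' merely lists the streamed archive; it stands in for the
        # construct_ssh_cmd(['remote-init', ...]) call used above.
        subprocess.run(['tar', 't'], stdin=tmphandle, check=True)
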
def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handler' % ctx.event)
    if not handlers and (
            ctx.event in
            self.get_events_conf(config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        abort_on_error = self.get_events_conf(
            config, 'abort if %s handler fails' % ctx.event)
        try:
            handler_data = {
                'event': quote(ctx.event),
                'message': quote(ctx.reason),
                'suite': quote(ctx.suite),
                'suite_uuid': quote(str(ctx.uuid_str)),
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            if abort_on_error:
                raise SuiteEventError(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.suite, ctx.reason)
        proc_ctx = SubProcContext(
            cmd_key, cmd, env=dict(os.environ), shell=True)
        if abort_on_error or self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(
                proc_ctx, abort_on_error=abort_on_error)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_handlers_callback)

def test_log_debug_on_noerror_exit_code(self, cylc_log):
    """Test that a debug log is emitted when the log retrieval command
    exited with a non-error code (i.e. 0).

    :param cylc_log: mocked cylc logger
    :type cylc_log: mock.MagicMock
    """
    task_events_manager = TaskEventsManager(None, None, None, None, None)
    proc_ctx = SubProcContext(
        cmd_key=None, cmd="ls /tmp/123", ret_code=0, err="", id_keys=[])
    task_events_manager._job_logs_retrieval_callback(proc_ctx, None)
    self.assertEqual(1, cylc_log.debug.call_count)
    self.assertTrue(cylc_log.debug.call_args.contains("ls /tmp/123"))

def test_log_error_on_error_exit_code(self, cylc_log):
    """Test that an error log is emitted when the log retrieval command
    exited with a code different than zero.

    :param cylc_log: mocked cylc logger
    :type cylc_log: mock.MagicMock
    """
    task_events_manager = TaskEventsManager(None, None, None, None, None)
    proc_ctx = SubProcContext(
        cmd_key=None, cmd="error", ret_code=1, err="Error!", id_keys=[])
    task_events_manager._job_logs_retrieval_callback(proc_ctx, None)
    self.assertEqual(1, cylc_log.error.call_count)
    self.assertTrue(cylc_log.error.call_args.contains("Error!"))

def remote_host_select(self, host_str):
    """Evaluate a task host string.

    Arguments:
        host_str (str):
            An explicit host name, a command in back-tick or $(command)
            format, or an environment variable holding a hostname.

    Return (str):
        None if evaluation of host_str is still taking place.
        'localhost' if host_str is not defined or if the evaluated host
        name is equivalent to 'localhost'.
        Otherwise, return the evaluated host name on success.

    Raise TaskRemoteMgmtError on error.

    """
    if not host_str:
        return 'localhost'

    # Host selection command: $(command) or `command`
    match = REC_COMMAND.match(host_str)
    if match:
        cmd_str = match.groups()[1]
        if cmd_str in self.remote_host_str_map:
            # Command recently launched
            value = self.remote_host_str_map[cmd_str]
            if isinstance(value, TaskRemoteMgmtError):
                raise value  # command failed
            elif value is None:
                return  # command not yet ready
            else:
                host_str = value  # command succeeded
        else:
            # Command not launched (or already reset)
            self.proc_pool.put_command(
                SubProcContext(
                    'remote-host-select',
                    ['bash', '-c', cmd_str],
                    env=dict(os.environ)),
                self._remote_host_select_callback, [cmd_str])
            self.remote_host_str_map[cmd_str] = None
            return self.remote_host_str_map[cmd_str]

    # Environment variable substitution
    host_str = os.path.expandvars(host_str)
    # Remote?
    if is_remote_host(host_str):
        return host_str
    else:
        return 'localhost'

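REC_COMMAND itself is not shown in this excerpt. The sketch below is a plausible stand-in, not the real pattern: it accepts both $(command) and back-tick forms and exposes the command body via the second capture group, as the match.groups()[1] usage above implies.

import re

# Hypothetical stand-in for REC_COMMAND (the real pattern is not in this excerpt).
# Group 1: opening delimiter; group 2: the command body.
REC_COMMAND_SKETCH = re.compile(r'(`|\$\()\s*(.*?)\s*[`)]$')

for host_str in ['$(host-select-script hpc)', '`hostname -f`', 'login01']:
    match = REC_COMMAND_SKETCH.match(host_str)
    print(host_str, '->', match.groups()[1] if match else 'literal host name')
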
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes."""
    if proc_ctx.ret_code:
        LOG.error(proc_ctx)
    else:
        LOG.debug(proc_ctx)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            LOG.exception(exc)

def test_run_command_with_stdin_from_handles(self):
    """Test STDIN from multiple file handles"""
    handles = []
    for txt in ['catches mice.\n', 'eat fish.\n']:
        handle = TemporaryFile()
        handle.write(txt.encode('UTF-8'))
        handle.seek(0)
        handles.append(handle)
    ctx = SubProcContext('meow', ['cat'], stdin_files=handles)
    SubProcPool.run_command(ctx)
    self.assertEqual(ctx.err, '')
    self.assertEqual(ctx.out, 'catches mice.\neat fish.\n')
    self.assertEqual(ctx.ret_code, 0)
    for handle in handles:
        handle.close()

def _run_event_mail(self, config, ctx):
    """Helper for "run_event_handlers", do mail notification."""
    if ctx.event in self.get_events_conf(config, 'mail events', []):
        # SMTP server
        env = dict(os.environ)
        mail_smtp = self.get_events_conf(config, 'mail smtp')
        if mail_smtp:
            env['smtp'] = mail_smtp
        subject = '[suite %(event)s] %(suite)s' % {
            'suite': ctx.suite, 'event': ctx.event}
        stdin_str = ''
        for name, value in [
                ('suite event', ctx.event),
                ('reason', ctx.reason),
                ('suite', ctx.suite),
                ('host', ctx.host),
                ('port', ctx.port),
                ('owner', ctx.owner)]:
            if value:
                stdin_str += '%s: %s\n' % (name, value)
        mail_footer_tmpl = self.get_events_conf(config, 'mail footer')
        if mail_footer_tmpl:
            stdin_str += (mail_footer_tmpl + '\n') % {
                'host': ctx.host,
                'port': ctx.port,
                'owner': ctx.owner,
                'suite': ctx.suite}
        proc_ctx = SubProcContext(
            (self.SUITE_EVENT_HANDLER, ctx.event),
            [
                'mail',
                '-s', subject,
                '-r', self.get_events_conf(
                    config,
                    'mail from', 'notifications@' + get_host()),
                self.get_events_conf(config, 'mail to', get_user()),
            ],
            env=env,
            stdin_str=stdin_str)
        if self.proc_pool.closed:
            # Run command in foreground if process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_mail_callback)

def _run_event_custom_handlers(self, schd, template_variables, event):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    config = schd.config
    handlers = self.get_events_conf(config, '%s handlers' % event)
    if not handlers and (
        event in self.get_events_conf(config, 'handler events', [])
    ):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.WORKFLOW_EVENT_HANDLER, i), event)
        try:
            cmd = handler % (template_variables)
        except KeyError as exc:
            message = f'{cmd_key} bad template: {handler}\n{exc}'
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = (
                f"%(handler)s"
                f" '%({EventData.Event.value})s'"
                f" '%({EventData.Workflow.value})s'"
                f" '%({EventData.Message.value})s'"
            ) % (
                {'handler': handler, **template_variables}
            )
        proc_ctx = SubProcContext(
            cmd_key,
            cmd,
            env=dict(os.environ),
            shell=True  # nosec (designed to run user defined code)
        )
        if self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(proc_ctx)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, callback=self._run_event_handlers_callback)

def file_install(self, platform):
    """Install required files on the remote install target.

    Included by default in the file installation:
        Files:
            .service/server.key  (required for ZMQ authentication)
        Directories:
            app/
            bin/
            etc/
            lib/
    """
    install_target = platform['install target']
    self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_IN_PROGRESS
    src_path = get_workflow_run_dir(self.workflow)
    dst_path = get_remote_workflow_run_dir(self.workflow)
    install_target = platform['install target']
    try:
        cmd, host = construct_rsync_over_ssh_cmd(
            src_path,
            dst_path,
            platform,
            self.rsync_includes,
            bad_hosts=self.bad_hosts
        )
        ctx = SubProcContext('file-install', cmd, host)
    except NoHostsError as exc:
        LOG.error(
            PlatformError(
                f'{PlatformError.MSG_INIT}\n{exc}',
                platform['name'],
            )
        )
        self.remote_init_map[
            platform['install target']] = REMOTE_FILE_INSTALL_FAILED
        self.bad_hosts -= set(platform['hosts'])
        self.ready = True
    else:
        log_platform_event('file install', platform, host)
        self.proc_pool.put_command(
            ctx,
            bad_hosts=self.bad_hosts,
            callback=self._file_install_callback,
            callback_args=[platform, install_target],
            callback_255=self._file_install_callback_255,
        )

def _send_mail(self, event, subject, message, schd, env):
    proc_ctx = SubProcContext(
        (self.WORKFLOW_EVENT_HANDLER, event),
        [
            'mail',
            '-s', subject,
            '-r', self.get_events_conf(
                schd.config,
                'from', 'notifications@' + get_host()),
            self.get_events_conf(schd.config, 'to', get_user()),
        ],
        env=env,
        stdin_str=message)
    if self.proc_pool.closed:
        # Run command in foreground if process pool is closed
        self.proc_pool.run_command(proc_ctx)
        self._run_event_handlers_callback(proc_ctx)
    else:
        # Run command using process pool otherwise
        self.proc_pool.put_command(
            proc_ctx, callback=self._run_event_mail_callback)

def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their platform_name and host.
    Put a job command for each group to the multiprocess pool.

    """
    if not itasks:
        return
    # Sort itasks into lists based upon where they were run.
    auth_itasks = {}
    for itask in itasks:
        platform_n = itask.platform['name']
        if platform_n not in auth_itasks:
            auth_itasks[platform_n] = []
        auth_itasks[platform_n].append(itask)
    # Go through each list of itasks and carry out commands as required.
    for platform_n, itasks in sorted(auth_itasks.items()):
        platform = get_platform(platform_n)
        if is_remote_platform(platform):
            remote_mode = True
            cmd = [cmd_key]
        else:
            cmd = ["cylc", cmd_key]
            remote_mode = False
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        cmd.append("--")
        cmd.append(get_remote_suite_run_job_dir(platform, suite))
        job_log_dirs = []
        if remote_mode:
            cmd = construct_ssh_cmd(cmd, platform)
        for itask in sorted(itasks, key=lambda itask: itask.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, itasks])

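The grouping step above is the usual dict-of-lists pattern; a tiny standalone sketch, with hypothetical task objects in place of real itasks, for clarity.

# Hypothetical illustration of the grouping step in _run_job_cmd.
class FakeTask:
    def __init__(self, identity, platform_name):
        self.identity = identity
        self.platform = {'name': platform_name}


def group_by_platform(itasks):
    """Group tasks into one list per platform name."""
    auth_itasks = {}
    for itask in itasks:
        auth_itasks.setdefault(itask.platform['name'], []).append(itask)
    return auth_itasks


tasks = [FakeTask('foo.1', 'hpc'), FakeTask('bar.1', 'localhost'),
         FakeTask('baz.1', 'hpc')]
grouped = group_by_platform(tasks)  # {'hpc': [foo.1, baz.1], 'localhost': [bar.1]}
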
def _prep_submit_task_job_error(self, workflow, itask, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    log_task_job_activity(
        SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        workflow, itask.point, itask.tdef.name,
        submit_num=itask.submit_num)
    # Persist
    self.workflow_db_mgr.put_insert_task_jobs(itask, {
        'is_manual_submit': itask.is_manual_submit,
        'try_num': itask.get_try_num(),
        'time_submit': get_current_time_string(),
        'job_runner_name': itask.summary.get('job_runner_name'),
    })
    itask.is_manual_submit = False
    self.task_events_mgr.process_message(
        itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

def file_install(self, platform):
    """Install required files on the remote install target.

    Included by default in the file installation:
        Files:
            .service/server.key  (required for ZMQ authentication)
        Directories:
            app/
            bin/
            etc/
            lib/
    """
    install_target = platform['install target']
    self.remote_init_map[install_target] = REMOTE_FILE_INSTALL_IN_PROGRESS
    src_path = get_workflow_run_dir(self.workflow)
    dst_path = get_remote_workflow_run_dir(self.workflow)
    install_target = platform['install target']
    ctx = SubProcContext(
        'file-install',
        construct_rsync_over_ssh_cmd(
            src_path, dst_path, platform, self.rsync_includes))
    LOG.debug(f"Begin file installation on {install_target}")
    self.proc_pool.put_command(
        ctx, self._file_install_callback, [install_target])

def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0

    # See cylc.flow.batch_sys_manager.JobPollContext
    try:
        job_log_dir, context = line.split('|')[1:3]
        items = json.loads(context)
        jp_ctx = JobPollContext(job_log_dir, **items)
    except TypeError:
        itask.set_summary_message(self.POLL_FAIL)
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    except ValueError:
        # back compat for cylc 7.7.1 and previous
        try:
            values = line.split('|')
            items = dict(  # done this way to ensure IndexError is raised
                (key, values[x]) for x, key in enumerate(
                    JobPollContext.CONTEXT_ATTRIBUTES))
            job_log_dir = items.pop('job_log_dir')
        except (ValueError, IndexError):
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
    finally:
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    flag = self.task_events_mgr.FLAG_POLLED
    if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
            jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1:
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.run_status == 0:
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif jp_ctx.time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.batch_sys_exit_polled == 1:
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            jp_ctx.time_submit_exit, flag)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
            flag)

def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _poll_task_jobs_callback, on one task job."""
    ctx = SubProcContext(self.JOBS_POLL, None)
    ctx.out = line
    ctx.ret_code = 0

    # See cylc.flow.batch_sys_manager.JobPollContext
    job_d = get_task_job_id(itask.point, itask.tdef.name, itask.submit_num)
    try:
        job_log_dir, context = line.split('|')[1:3]
        items = json.loads(context)
        jp_ctx = JobPollContext(job_log_dir, **items)
    except TypeError:
        itask.set_summary_message(self.POLL_FAIL)
        self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
        return
    except ValueError:
        # back compat for cylc 7.7.1 and previous
        try:
            values = line.split('|')
            items = dict(  # done this way to ensure IndexError is raised
                (key, values[x]) for x, key in enumerate(
                    JobPollContext.CONTEXT_ATTRIBUTES))
            job_log_dir = items.pop('job_log_dir')
        except (ValueError, IndexError):
            itask.set_summary_message(self.POLL_FAIL)
            self.job_pool.add_job_msg(job_d, self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
    finally:
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    flag = self.task_events_mgr.FLAG_POLLED
    if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
        # Failed normally
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
        # Failed by a signal, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        self.task_events_mgr.process_message(
            itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
            jp_ctx.time_run_exit, flag)
    elif jp_ctx.run_status == 1:
        # The job has terminated, but is still managed by batch system.
        # Some batch system may restart a job in this state, so don't
        # mark as failed yet.
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.run_status == 0:
        # The job succeeded
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit, flag)
    elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
        # The job has terminated without executing the error trap
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
            flag)
    elif jp_ctx.time_run:
        # The job has started, and is still managed by batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
    elif jp_ctx.batch_sys_exit_polled == 1:
        # The job never ran, and no longer in batch system
        self.task_events_mgr.process_message(
            itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            jp_ctx.time_submit_exit, flag)
    else:
        # The job never ran, and is in batch system
        self.task_events_mgr.process_message(
            itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
            flag)

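The poll line handled above carries a JSON payload in its third pipe-delimited field. A minimal standalone parse, with the field layout inferred from line.split('|')[1:3] and a made-up example line, looks like this:

import json

# Illustration only: the exact field layout is inferred from the callback above.
def parse_poll_line(line):
    """Return (job_log_dir, context_dict) from 'prefix|job-log-dir|{json}'."""
    job_log_dir, context = line.split('|')[1:3]
    return job_log_dir, json.loads(context)


# e.g. parse_poll_line('poll|1/foo/01|{"run_status": 0}')
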