def load_config(self):
    """Load the suite config.

    Returns True on success, False on a parse failure in interactive
    mode; in non-interactive mode a parse failure exits the process.
    """
    if self.suiterc:
        # Reloading: preserve the currently collapsed families.
        is_reload = True
        collapsed = self.suiterc.closed_families
    else:
        is_reload = False
        collapsed = []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            cli_initial_point_string=self.start_point_string,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        # Broad catch: any config parsing failure is reported to the user.
        msg = "Failed - parsing error?\n\n%s" % exc
        LOG.error(msg)
        if self.interactive:
            # Show a dialog and keep running.
            dia = gtk.MessageDialog(
                type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK,
                message_format=msg)
            dia.run()
            dia.destroy()
            return False
        # Non-interactive: a bad config is fatal.
        sys.exit(1)
    self.inherit = self.suiterc.get_parent_lists()
    return True
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Record the failed submission in the runtime DB, then either fail the
    task outright or line it up for a submission retry.
    """
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    # Submission failed so there is no batch-system job id.
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row.

    Each row is (point, namespace, key, value); keys of the form
    "[section][sub]item" encode nested config sections.
    """
    if row_idx == 0:
        # Log once, on the first row only.
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    if "]" in cur_key:
        # Split off the section path; the final component is the setting.
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        # Walk/create the nested section dicts down to the setting.
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(
        CHANGE_FMT.strip() % {
            "change": CHANGE_PREFIX_SET,
            "point": point,
            "namespace": namespace,
            "key": key,
            "value": value})
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced -
    don't update task_events table).

    Return (prev_status, prev_hold_swap) if the state changed, else
    None (implicitly) when there is nothing to do.
    """
    if self.status == self.hold_swap:
        # Stale swap state; clear it.
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        # No change at all: nothing to record.
        return
    prev_status, prev_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        # Going held: remember the state to restore on release.
        self.hold_swap = self.status
    elif status in TASK_STATUSES_ACTIVE:
        if self.status == TASK_STATUS_HELD:
            # Active while held: note that release should re-hold.
            self.hold_swap = TASK_STATUS_HELD
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        # Non-final change while flagged held: stay held, swap the
        # requested state in as the one to restore later.
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    self.is_updated = True
    # Log, e.g. "waiting (held) => running".
    message = str(prev_status)
    if prev_hold_swap:
        message += " (%s)" % prev_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug("[%s] -%s", self.identity, message)
    return (prev_status, prev_hold_swap)
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session
    """
    self._check_access_priv(required_privilege_level)
    # Name of the calling (exposed) server method, for the log message.
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(
        self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name, priv_level,
        uuid)
    # NOTE(review): "or" binds looser than "and", so this reads as
    # "debug, or (first contact from this client and log_info)" -
    # confirm that is the intended grouping.
    if cylc.flags.debug or uuid not in self.clients and log_info:
        LOG.info(
            self.LOG_COMMAND_TMPL, command, user, host, prog_name, uuid)
    self.clients.setdefault(uuid, {})
    # Refresh the client's last-seen time for housekeeping.
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        # Report incomplete custom outputs; failure-type outputs are
        # legitimately incomplete on success, so skip those.
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [TASK_OUTPUT_EXPIRED,
                              TASK_OUTPUT_SUBMIT_FAILED,
                              TASK_OUTPUT_FAILED]:
                msg += "\n " + output
        if msg:
            LOG.info(
                "[%s] -Succeeded with outputs not completed: %s",
                itask, msg)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def _report_connection_if_denied(self):
    """Emit a warning if the last connection attempt was denied."""
    prog_name, user, host, uuid = _get_client_info()[1:]
    if self._get_client_connection_denied():
        LOG.warning(
            self.LOG_CONNECT_DENIED_TMPL, user, host, prog_name, uuid)
def _execute_stmt(self, stmt, stmt_args_list):
    """Helper for "self.execute_queued_items".

    Execute a statement. If this is the public database, return True on
    success and False on failure. If this is the private database, return
    True on success, and raise on failure.

    NOTE(review): as written the method re-raises on failure even for
    the public database (after logging details) - the caller is expected
    to handle it; confirm against execute_queued_items.
    """
    try:
        self.connect()
        # One statement, many parameter tuples.
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        if not self.is_public:
            # Private DB failures are fatal to the caller.
            raise
        if cylc.flags.debug:
            traceback.print_exc()
        err_log = (
            "cannot execute database statement:\n"
            "file=%(file)s:\nstmt=%(stmt)s") % {
            "file": self.db_file_name, "stmt": stmt}
        # Include the args of each queued execution for diagnosis.
        for i, stmt_args in enumerate(stmt_args_list):
            err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                "i": i, "stmt_args": stmt_args})
        LOG.warning(err_log)
        raise
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Each result is a list [reg, source_dir, title]. "regfilter" is an
    optional regular expression matched against the registration name.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    run_d = glbl_cfg().get_host_item('run directory')
    results = []
    for dirpath, dnames, _ in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a .service/ dir
        if dirpath != run_d and self.DIR_BASE_SRV in dnames:
            # Pruning dnames in place stops os.walk descending.
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Unreadable suite: report it but keep walking.
            LOG.error('%s: %s', reg, exc)
    return results
def _upgrade_with_state_file_header(self, line):
    """Parse a header line in state file, add information to DB.

    Return "broadcast" after the final header line ("final cycle") to
    tell the caller the broadcast section follows; otherwise return
    None.
    """
    head, tail = line.split(" : ", 1)
    if head == "time":
        # Checkpoint timestamp; keep only the first word of the tail.
        self.add_insert_item(self.TABLE_CHECKPOINT_ID, {
            "id": self.CHECKPOINT_LATEST_ID,
            "time": tail.split(" ", 1)[0],
            "event": self.CHECKPOINT_LATEST_EVENT})
        return
    for name, key in [
            ("run mode", "run_mode"),
            ("initial cycle", "initial_point"),
            ("final cycle", "final_point")]:
        if tail == "None":
            # Literal "None" in the state file means unset.
            tail = None
        if head == name:
            self.add_insert_item(self.TABLE_SUITE_PARAMS, {
                "key": key, "value": tail})
            LOG.info(" + %s=%s", key, tail)
            if name == "final cycle":
                # Last header line: broadcast section comes next.
                return "broadcast"
            else:
                return
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Parse one "timestamp|...|ret_code" result line from the kill
    command, record it, and update the task state accordingly.
    """
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        # Malformed result line: treat as a failure.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Killed before running: report as a submission failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Killed while running: report as a job failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        log_lvl = DEBUG
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.set_summary_message(log_msg)
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))
def _job_cmd_out_callback(suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR.

    Append the line to the task's job activity log, prefixed with the
    job host/owner where relevant.
    """
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        owner_at_host = ""
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        # Line not in "timestamp|stream|content" form: use it as-is.
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write((owner_at_host + line).encode())
    except IOError as exc:
        # Activity log write failed: report it, and echo the line to the
        # suite log so it is not lost.
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
        LOG.warning("[%s] -%s%s", itask, owner_at_host, line)
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    to_poll_tasks = []
    pollable_statuses = set([
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
    if poll_succ:
        # Caller opted in to polling succeeded tasks too.
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    for itask in itasks:
        if itask.state.status in pollable_statuses:
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if to_poll_tasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, to_poll_tasks,
            self._poll_task_jobs_callback)
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    For each job in the retrieval batch, check that the expected log
    files now exist locally; clear the retry timer on success or mark
    it for another attempt on failure.
    """
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # NOTE(review): substring membership test against the
                # string 'succeeded' - confirm key1[1] is an exact event
                # name here, not an arbitrary string.
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                # Done: drop the retrieval retry timer.
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Allow the timer to schedule another attempt.
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            # Timer already removed elsewhere; log and continue.
            LOG.exception(exc)
def _receiver(self, message): """Wrap incoming messages and dispatch them to exposed methods. Args: message (dict): message contents """ # determine the server method to call try: method = getattr(self, message['command']) args = message['args'] args.update({'user': message['user']}) if 'meta' in message: args['meta'] = message['meta'] except KeyError: # malformed message return {'error': { 'message': 'Request missing required field(s).'}} except AttributeError: # no exposed method by that name return {'error': { 'message': 'No method by the name "%s"' % message['command']}} # generate response try: response = method(**args) except Exception as exc: # includes incorrect arguments (TypeError) LOG.exception(exc) # note the error server side import traceback return {'error': { 'message': str(exc), 'traceback': traceback.format_exc()}} return {'data': response}
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message.

    Record the failed run in the runtime DB, then either fail the task
    outright or line it up for an execution retry.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical("[%s] -job(%02d) %s", itask, itask.submit_num,
                     "failed")
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def check_job_time(self, itask, now):
    """Check/handle job timeout and poll timer.

    Return True if a timeout event was (newly) emitted, otherwise
    whether the task is due for a poll.
    """
    can_poll = self.check_poll_time(itask, now)
    if itask.timeout is None or now <= itask.timeout:
        return can_poll
    # Timeout reached for task, emit event and reset itask.timeout
    # BUG FIX: initialise time_ref/event so that an unexpected task
    # state cannot raise UnboundLocalError below.
    time_ref = None
    event = None
    if itask.state.status == TASK_STATUS_RUNNING:
        time_ref = itask.summary['started_time']
        event = 'execution timeout'
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        time_ref = itask.summary['submitted_time']
        event = 'submission timeout'
    msg = event
    try:
        msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
    except (TypeError, ValueError):
        # Badness in time_ref (None or non-numeric)? Use the bare event.
        pass
    itask.timeout = None  # emit event only once
    if msg and event:
        LOG.warning('[%s] -%s', itask, msg)
        self.setup_event_handlers(itask, event, msg)
        return True
    else:
        return can_poll
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Record the failed submission in the runtime DB, then either fail the
    task outright or line it up for a submission retry.
    """
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(
        itask, {
            # BUG FIX: record the reported event time (was a freshly
            # generated timestamp, which could disagree with event_time
            # computed just above and with the sibling handler).
            "time_submit_exit": event_time,
            "submit_status": 1,
        })
    # Submission failed so there is no batch-system job id.
    itask.summary['submit_method_id'] = None
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _upgrade_with_state_file_extras(self):
    """Upgrade the database tables after reading in state file.

    Rename the old tables, recreate them with the new schema, copy the
    data across row by row, then drop the old tables.
    """
    conn = self.connect()
    # Rename old tables
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        conn.execute(
            r"ALTER TABLE " + t_name + r" RENAME TO " + t_name + "_old")
    conn.commit()
    # Create tables with new columns
    self.create_tables()
    # Populate new tables using old column data
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        LOG.info(r"Upgrading %s table", t_name)
        column_names = [col.name for col in self.tables[t_name].columns]
        for i, row in enumerate(
                conn.execute(
                    r"SELECT " + ",".join(column_names) +
                    " FROM " + t_name + "_old")):
            # These tables can be big, so we don't want to queue the items
            # in memory.
            conn.execute(self.tables[t_name].get_insert_stmt(), list(row))
        conn.commit()
    # Drop old tables
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        conn.execute(r"DROP TABLE " + t_name + "_old")
    conn.commit()
def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations.

    Each result is a list [reg, source_dir, title]. "regfilter" is an
    optional regular expression matched against the registration name.
    """
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    run_d = glbl_cfg().get_host_item('run directory')
    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a:
        # * .service/
        # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
        if dirpath != run_d and (self.DIR_BASE_SRV in dnames or
                                 "cylc-suite.db" in fnames):
            # Pruning dnames in place stops os.walk descending.
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            # Unreadable suite: report it but keep walking.
            LOG.error('%s: %s', reg, exc)
    return results
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message.

    Record the failed run in the runtime DB, then either fail the task
    outright or line it up for an execution retry.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical(
            "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _remove_bad_hosts(self, mock_host_stats=None):
    """Return dictionary of 'good' hosts with their metric stats.

    Run 'get-host-metrics' on each run host in parallel & store extracted
    stats for hosts, else an empty JSON structure. Filter out 'bad' hosts
    whereby either metric data cannot be accessed from the command or at
    least one metric value does not pass a specified threshold.
    """
    if mock_host_stats:
        # Create fake data for unittest purposes (only).
        host_stats = dict(mock_host_stats)  # Prevent mutable object issues
    else:
        if not self.hosts:
            return {}
        host_stats = self._get_host_metrics()
    # Analyse get-host-metrics results
    # Iterate over a snapshot so we can pop from host_stats in-loop.
    for host, data in list(dict(host_stats).items()):
        if not data:
            # No results for host (command failed) -> skip.
            host_stats.pop(host)
            continue
        for measure, cutoff in self.parsed_thresholds.items():
            datum = data[measure]
            # Cutoff is a minimum or maximum depending on measure context.
            if ((datum > cutoff and measure.startswith("load")) or
                    (datum < cutoff and (
                        measure == "memory" or
                        measure.startswith("disk-space")))):
                # Alert user that threshold has not been met.
                LOG.warning(
                    "host '%s' did not pass %s threshold " +
                    "(%s %s threshold %s)\n",
                    host, measure, datum,
                    ">" if measure.startswith("load") else "<", cutoff)
                host_stats.pop(host)
                break
    return host_stats
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                        itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        # Report incomplete custom outputs; failure-type outputs are
        # legitimately incomplete on success, so skip those.
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [
                    TASK_OUTPUT_EXPIRED,
                    TASK_OUTPUT_SUBMIT_FAILED,
                    TASK_OUTPUT_FAILED]:
                msg += "\n " + output
        if msg:
            LOG.info("[%s] -Succeeded with outputs not completed: %s",
                     itask, msg)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def remote_tidy(self): """Remove suite contact files from initialised remotes. Call "cylc remote-tidy". This method is called on suite shutdown, so we want nothing to hang. Timeout any incomplete commands after 10 seconds. Also remove UUID file on suite host ".service/uuid". """ # Remove UUID file uuid_fname = os.path.join( self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), FILE_BASE_UUID) try: os.unlink(uuid_fname) except OSError: pass # Issue all SSH commands in parallel procs = {} for (host, owner), init_with_contact in self.remote_init_map.items(): if init_with_contact != REMOTE_INIT_DONE: continue cmd = ['timeout', '10', 'cylc', 'remote-tidy'] if is_remote_host(host): cmd.append('--host=%s' % host) if is_remote_user(owner): cmd.append('--user=%s' % owner) if cylc.flags.debug: cmd.append('--debug') cmd.append(os.path.join(glbl_cfg().get_derived_host_item( self.suite, 'suite run directory', host, owner))) procs[(host, owner)] = ( cmd, Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull))) # Wait for commands to complete for a max of 10 seconds timeout = time() + 10.0 while procs and time() < timeout: for (host, owner), (cmd, proc) in procs.copy().items(): if proc.poll() is None: continue del procs[(host, owner)] out, err = (f.decode() for f in proc.communicate()) if proc.wait(): LOG.warning(TaskRemoteMgmtError( TaskRemoteMgmtError.MSG_TIDY, (host, owner), ' '.join(quote(item) for item in cmd), proc.returncode, out, err)) # Terminate any remaining commands for (host, owner), (cmd, proc) in procs.items(): try: proc.terminate() except OSError: pass out, err = proc.communicate() if proc.wait(): LOG.warning(TaskRemoteMgmtError( TaskRemoteMgmtError.MSG_TIDY, (host, owner), ' '.join(quote(item) for item in cmd), proc.returncode, out, err))
def create_directory(dir_, name):
    """Ensure directory ``dir_`` exists, creating parents as needed.

    The human-readable ``name`` is used in the error message. Raise
    GlobalConfigError (after logging the underlying OSError) on failure.
    """
    try:
        os.makedirs(dir_, exist_ok=True)
    except OSError as exc:
        # Log the underlying error, then raise the project exception.
        LOG.exception(exc)
        raise GlobalConfigError(
            'Failed to create directory "' + name + '"')
def create_directory(dir_, name):
    """Create directory ``dir_`` (with parents), if it does not exist.

    The human-readable ``name`` is used in the error message. Raise
    GlobalConfigError (after logging the underlying OSError) on error.
    """
    try:
        # Consistency: use os.makedirs(exist_ok=True) like the sibling
        # create_directory helper, instead of the legacy mkdir_p wrapper.
        os.makedirs(dir_, exist_ok=True)
    except OSError as exc:
        LOG.exception(exc)
        raise GlobalConfigError('Failed to create directory "' + name + '"')
def clear_broadcast(self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options(self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info(get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return (modified_settings, bad_options)
def upgrade_from_611(self):
    """Upgrade database on restart with a 6.11.X private database."""
    conn = self.connect()
    # The 6.11.X schema lacks the hold_swap column in the task pool
    # tables; add it to both.
    for table in (self.TABLE_TASK_POOL, self.TABLE_TASK_POOL_CHECKPOINTS):
        LOG.info("Add hold_swap column to %s", table)
        stmt = r"ALTER TABLE " + table + r" ADD COLUMN hold_swap TEXT"
        conn.execute(stmt)
        conn.commit()
def recover_pub_from_pri(self):
    """Recover public database from private database."""
    pub_dao = self.pub_dao
    if pub_dao.n_tries < pub_dao.MAX_TRIES:
        # Not enough failed attempts yet; leave it alone.
        return
    self.copy_pri_to_pub()
    LOG.warning(
        "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
            "pub_db_name": pub_dao.db_file_name,
            "pri_db_name": self.pri_dao.db_file_name})
    pub_dao.n_tries = 0
def clear_broadcast( self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options( self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info( get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return (modified_settings, bad_options)
def execute_queued_items(self):
    """Execute queued items for each table.

    On success, clear the queues and commit; on sqlite3.Error against
    the public DB, roll back and count the failed attempt (the private
    DB re-raises).
    """
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so we
            # can only executemany for each identical template statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany" call.
            if table.insert_queue:
                self._execute_stmt(table.get_insert_stmt(),
                                   table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries
            })
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries
                })
        self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def satisfy_xclock(self, itask):
    """Attempt to satisfy itask's clock trigger, if it has one."""
    label, sig, ctx, satisfied = self._get_xclock(itask)
    if satisfied:
        # Nothing to do: already satisfied.
        return
    if not wall_clock(*ctx.func_args, **ctx.func_kwargs):
        # Trigger time not yet reached.
        return
    satisfied = True
    itask.state.xclock = (label, True)
    self.sat_xclock.append(sig)
    LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
def _forget_client(self, uuid):
    """Drop the client session for ``uuid``.

    Return True if a session was removed, False if none was known.
    """
    try:
        client_info = self.clients.pop(uuid)
    except KeyError:
        return False
    # Detach any per-client error log handler before forgetting.
    err_handler = client_info.get('err_log_handler')
    if err_handler is not None:
        LOG.removeHandler(err_handler)
    LOG.debug(self.LOG_FORGET_TMPL, uuid)
    return True
def _run_event_custom_handlers(self, config, ctx): """Helper for "run_event_handlers", custom event handlers.""" # Look for event handlers # 1. Handlers for specific event # 2. General handlers handlers = self.get_events_conf(config, '%s handler' % ctx.event) if not handlers and (ctx.event in self.get_events_conf( config, 'handler events', [])): handlers = self.get_events_conf(config, 'handlers') if not handlers: return for i, handler in enumerate(handlers): cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event) # Handler command may be a string for substitution abort_on_error = self.get_events_conf( config, 'abort if %s handler fails' % ctx.event) try: handler_data = { 'event': quote(ctx.event), 'message': quote(ctx.reason), 'suite': quote(ctx.suite), 'suite_uuid': quote(str(ctx.uuid_str)), } if config.cfg['meta']: for key, value in config.cfg['meta'].items(): if key == "URL": handler_data["suite_url"] = quote(value) handler_data[key] = quote(value) cmd = handler % (handler_data) except KeyError as exc: message = "%s bad template: %s" % (cmd_key, exc) LOG.error(message) if abort_on_error: raise SuiteEventError(message) continue if cmd == handler: # Nothing substituted, assume classic interface cmd = "%s '%s' '%s' '%s'" % (handler, ctx.event, ctx.suite, ctx.reason) proc_ctx = SubProcContext(cmd_key, cmd, env=dict(os.environ), shell=True) if abort_on_error or self.proc_pool.closed: # Run command in foreground if abort on failure is set or if # process pool is closed self.proc_pool.run_command(proc_ctx) self._run_event_handlers_callback( proc_ctx, abort_on_error=abort_on_error) else: # Run command using process pool otherwise self.proc_pool.put_command(proc_ctx, self._run_event_handlers_callback)
def restart_upgrade(self): """Vacuum/upgrade runtime DB on restart.""" # Backward compat, upgrade database with state file if necessary suite_run_d = os.path.dirname(os.path.dirname(self.pub_path)) old_pri_db_path = os.path.join(suite_run_d, 'state', CylcSuiteDAO.OLD_DB_FILE_BASE_NAME) old_pri_db_path_611 = os.path.join( suite_run_d, CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[0]) old_state_file_path = os.path.join(suite_run_d, "state", "state") if (os.path.exists(old_pri_db_path) and os.path.exists(old_state_file_path) and not os.path.exists(self.pri_path)): # Upgrade pre-6.11.X runtime database + state file copy(old_pri_db_path, self.pri_path) pri_dao = self.get_pri_dao() pri_dao.upgrade_with_state_file(old_state_file_path) target = os.path.join(suite_run_d, "state.tar.gz") cmd = ["tar", "-C", suite_run_d, "-czf", target, "state"] if call(cmd, stdin=open(os.devnull)) == 0: rmtree(os.path.join(suite_run_d, "state"), ignore_errors=True) else: try: os.unlink(os.path.join(suite_run_d, "state.tar.gz")) except OSError: pass LOG.error("cannot tar-gzip + remove old state/ directory") # Remove old files as well try: os.unlink(os.path.join(suite_run_d, "cylc-suite-env")) except OSError: pass elif (os.path.exists(old_pri_db_path_611) and not os.path.exists(self.pri_path)): # Upgrade 6.11.X runtime database os.rename(old_pri_db_path_611, self.pri_path) pri_dao = self.get_pri_dao() pri_dao.upgrade_from_611() # Remove old files as well for name in [ CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[1], "cylc-suite-env" ]: try: os.unlink(os.path.join(suite_run_d, name)) except OSError: pass else: pri_dao = self.get_pri_dao() pri_dao.upgrade_pickle_to_json() # Vacuum the primary/private database file pri_dao.vacuum() pri_dao.close()
def execute_queued_items(self):
    """Execute queued items for each table.

    Statements are run with "executemany" per unique template: DELETEs
    and UPDATEs can differ in their WHERE/SET args, so each distinct
    template gets its own call; INSERTs for a table are uniform and
    share one call. On success the queues are cleared and the
    transaction committed. On sqlite3.Error a private DB re-raises; a
    public DB logs a warning, counts the failed attempt and rolls back,
    leaving the queues intact for a later retry.
    """
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so we
            # can only executemany for each identical template statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany" call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        # Private DB failures are fatal; public DB failures are retried.
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits.

    On success, record the first line of the command output as the
    selected host for *cmd_str*; otherwise record a
    TaskRemoteMgmtError describing the failure.
    """
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status: first output line is the chosen host.
        LOG.debug(proc_ctx)
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
        return
    # Bad status
    LOG.error(proc_ctx)
    self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
    """Test that a ValueError when writing to a log stream won't result
    in multiple exceptions (what could lead to infinite loop in some
    occasions. Instead, it **must** raise a SystemExit"""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get_derived_host_item.return_value = tf.name
        mocked.get.return_value = 100
        file_handler = TimestampRotatingFileHandler("suiteA", False)
        # next line is important as pytest can have a "Bad file descriptor"
        # due to a FileHandler with default "a" (pytest tries to r/w).
        file_handler.mode = "a+"
        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)
        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False
        # first message will initialize the stream and the handler
        LOG.info("What could go")
        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # in case where
        file_handler.stream.seek.side_effect = ValueError
        try:
            # next call will call the emit method and use the mocked stream
            LOG.info("wrong?!")
            # BUG FIX: the failure message used to name "SystemError",
            # but the exception this test asserts is SystemExit.
            self.fail("Exception SystemExit was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def _authorise(self, *args, user='******', meta=None, **kwargs):
    """Check client privilege, then run the wrapped server command.

    NOTE(review): ``req_priv_level`` and ``fcn`` are free variables
    bound by an enclosing decorator scope not visible in this chunk —
    ``fcn`` is the wrapped command, ``req_priv_level`` the privilege it
    requires.

    Raises Exception('Authorisation failure') when the user's privilege
    level is below the required level; otherwise logs the command and
    delegates to ``fcn``.
    """
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')
    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        # BUG FIX: Logger.warn is a deprecated alias of Logger.warning.
        LOG.warning(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def _event_email_callback(self, proc_ctx, schd_ctx):
    """Call back when email notification command exits.

    On success, drop each event timer and log the job activity; on
    failure, unset the timers' waiting state so they can retry.
    """
    sent_ok = proc_ctx.ret_code == 0
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            if sent_ok:
                # Notification delivered: retire the timer and record
                # the activity against the task job.
                del self.event_timers[id_key]
                log_ctx = SubProcContext((key1, submit_num), None)
                log_ctx.ret_code = 0
                log_task_job_activity(
                    log_ctx, schd_ctx.suite, point, name, submit_num)
            else:
                # Delivery failed: allow the timer to fire again later.
                self.event_timers[id_key].unset_waiting()
        except KeyError as exc:
            LOG.exception(exc)
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    # Mutations of self.broadcasts happen under the lock.
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    # '*' (all points) is not a parsable point but is
                    # still accepted; anything else is recorded as bad.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        # Unknown namespace: record, do not apply.
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        # Merge the setting dict into the stored one.
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def _run_job_cmd(self, cmd_key, suite, itasks, callback):
    """Run job commands, e.g. poll, kill, etc.

    Group itasks with their user@host. Put a job command for each
    user@host to the multiprocess pool.
    """
    if not itasks:
        return
    # Group tasks by (host, owner) so one command runs per user@host.
    auth_itasks = {}
    for itask in itasks:
        auth_itasks.setdefault(
            (itask.task_host, itask.task_owner), []).append(itask)
    # BUG FIX: the loop variable used to be named "itasks", shadowing
    # the parameter of the same name (and the sort key lambda shadowed
    # the loop variable "itask") — renamed for clarity.
    for (host, owner), host_itasks in sorted(auth_itasks.items()):
        cmd = ["cylc", cmd_key]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("--debug")
        if is_remote_host(host):
            cmd.append("--host=%s" % (host))
        if is_remote_user(owner):
            cmd.append("--user=%s" % (owner))
        cmd.append("--")
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, "suite job log directory", host, owner))
        job_log_dirs = []
        for itask in sorted(host_itasks, key=lambda task: task.identity):
            job_log_dirs.append(get_task_job_id(
                itask.point, itask.tdef.name, itask.submit_num))
        cmd += job_log_dirs
        self.proc_pool.put_command(
            SubProcContext(cmd_key, cmd), callback, [suite, host_itasks])
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.
    """
    to_kill_tasks = []
    for itask in itasks:
        if itask.state.status not in TASK_STATUSES_ACTIVE:
            # Only active (submitted/running) tasks can be killed.
            LOG.warning('skipping %s: task not killable' % itask.identity)
            continue
        itask.state.set_held()
        to_kill_tasks.append(itask)
    self._run_job_cmd(
        self.JOBS_KILL, suite, to_kill_tasks,
        self._kill_task_jobs_callback)
def _get_job_scripts(itask, rtconfig): """Return pre-script, script, post-script for a job.""" script = rtconfig['script'] pre_script = rtconfig['pre-script'] post_script = rtconfig['post-script'] if itask.tdef.suite_polling_cfg: # Automatic suite state polling script comstr = "cylc suite-state " + \ " --task=" + itask.tdef.suite_polling_cfg['task'] + \ " --point=" + str(itask.point) if LOG.isEnabledFor(DEBUG): comstr += ' --debug' for key, fmt in [ ('user', ' --%s=%s'), ('host', ' --%s=%s'), ('interval', ' --%s=%d'), ('max-polls', ' --%s=%s'), ('run-dir', ' --%s=%s')]: if rtconfig['suite state polling'][key]: comstr += fmt % (key, rtconfig['suite state polling'][key]) if rtconfig['suite state polling']['message']: comstr += " --message='%s'" % ( rtconfig['suite state polling']['message']) else: comstr += " --status=" + itask.tdef.suite_polling_cfg['status'] comstr += " " + itask.tdef.suite_polling_cfg['suite'] script = "echo " + comstr + "\n" + comstr return pre_script, script, post_script
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling
    time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if not self.task_events_mgr.check_job_time(itask, now):
            continue
        poll_tasks.add(itask)
        if itask.poll_timer.delay is not None:
            LOG.info(
                '[%s] -poll now, (next in %s)',
                itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
    """Process retrieval of task job logs from remote user@host.

    Build an rsync command that pulls only the job log directories for
    the given (key, point, name, submit_num) id_keys, then hand it to
    the process pool with a retrieval callback.
    """
    user_at_host = ctx.user_at_host
    if user_at_host and "@" in user_at_host:
        s_user, s_host = user_at_host.split("@", 1)
    else:
        s_user, s_host = (None, user_at_host)
    ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
    rsync_str = str(glbl_cfg().get_host_item(
        "retrieve job logs command", s_host, s_user))
    cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
    if LOG.isEnabledFor(DEBUG):
        cmd.append("-v")
    if ctx.max_size:
        cmd.append("--max-size=%s" % (ctx.max_size,))
    # Include every directory level down to the job files, then exclude
    # everything else so rsync only transfers the relevant jobs.
    includes = set()
    for _, point, name, submit_num in id_keys:
        includes.update((
            "/%s" % (point),
            "/%s/%s" % (point, name),
            "/%s/%s/%02d" % (point, name, submit_num),
            "/%s/%s/%02d/**" % (point, name, submit_num)))
    cmd.extend("--include=%s" % (include) for include in sorted(includes))
    cmd.append("--exclude=/**")  # exclude everything else
    # Remote source
    cmd.append(user_at_host + ":" + glbl_cfg().get_derived_host_item(
        schd_ctx.suite, "suite job log directory", s_host, s_user) + "/")
    # Local target
    cmd.append(glbl_cfg().get_derived_host_item(
        schd_ctx.suite, "suite job log directory") + "/")
    self.proc_pool.put_command(
        SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
        self._job_logs_retrieval_callback, [schd_ctx])
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        # A previously vacated job has come back to life.
        itask.job_vacated = False
        LOG.warning("[%s] -Vacated job restarted", itask)
    self.pflag = True
    if itask.state.reset_state(TASK_STATUS_RUNNING):
        self.setup_event_handlers(itask, 'started', 'job started')
    itask.set_summary_time('started', event_time)
    self._reset_job_timers(itask)
    self.suite_db_mgr.put_update_task_jobs(
        itask, {"time_run": itask.summary['started_time_string']})
    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
async def async_request(self, command, args=None, timeout=None):
    """Send an asynchronous request using asyncio.

    Has the same arguments and return values as ``serial_request``.
    """
    # Poller timeout is in milliseconds; fall back to instance default.
    if timeout:
        timeout = float(timeout)
    timeout = (timeout * 1000 if timeout else None) or self.timeout
    if not args:
        args = {}
    # get secret for this request
    # assumes secret won't change during the request
    try:
        secret = self.secret()
    except cylc.suite_srv_files_mgr.SuiteServiceFileError:
        raise ClientError('could not read suite passphrase')
    # send message
    msg = {'command': command, 'args': args}
    msg.update(self.header)
    LOG.debug('zmq:send %s' % msg)
    message = encrypt(msg, secret)
    self.socket.send_string(message)
    # receive response; raise ClientTimeout if nothing arrives in time.
    if self.poller.poll(timeout):
        res = await self.socket.recv_string()
    else:
        if self.timeout_handler:
            self.timeout_handler()
        raise ClientTimeout('Timeout waiting for server response.')
    try:
        response = decrypt(res, secret)
        LOG.debug('zmq:recv %s' % response)
    except jose.exceptions.JWTError:
        # Decryption failure usually means a stale/changed passphrase.
        raise ClientError(
            'Could not decrypt response. Has the passphrase changed?')
    # A response carries either 'data' (success) or 'error' (failure).
    try:
        return response['data']
    except KeyError:
        error = response['error']
        raise ClientError(error['message'], error.get('traceback'))
def callback(self, ctx):
    """Callback for asynchronous xtrigger functions.

    Record satisfaction status and function results dict.
    """
    LOG.debug(ctx)
    sig = ctx.get_signature()
    self.active.remove(sig)
    try:
        satisfied, results = json.loads(ctx.out)
    except (ValueError, TypeError):
        # Output is not a valid (satisfied, results) JSON pair.
        return
    LOG.debug('%s: returned %s' % (sig, results))
    if not satisfied:
        return
    # Trigger satisfied: flag the pool and store the results.
    self.pflag = True
    self.sat_xtrig[sig] = results
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Parse the command output line by line, dispatching each recognised
    prefixed line to its callback; any task in the original command
    that never gets a summary line is handed to summary_callback as a
    failure.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        # Follow the reload-successor chain to the live TaskProxy.
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    # Tasks that have not yet been seen in the output; summary lines
    # remove entries, so whatever remains afterwards had no status.
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                # NOTE(review): KeyError is redundant here (it is a
                # subclass of LookupError) — kept as written.
                except (LookupError, ValueError, KeyError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)