def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    pollable_statuses = {
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED}
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    to_poll_tasks = []
    for itask in itasks:
        if itask.state.status not in pollable_statuses:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
        else:
            to_poll_tasks.append(itask)
    if not to_poll_tasks:
        return
    if msg is not None:
        LOG.info(msg)
    self._run_job_cmd(
        self.JOBS_POLL, suite, to_poll_tasks, self._poll_task_jobs_callback)
def _upgrade_with_state_file_extras(self):
    """Upgrade the database tables after reading in state file.

    Renames the old task_states/task_events tables, recreates them with
    the new schema, copies the old rows across row-by-row, then drops
    the renamed originals.
    """
    conn = self.connect()
    # Rename old tables
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        conn.execute(
            r"ALTER TABLE " + t_name + r" RENAME TO " + t_name + "_old")
    conn.commit()
    # Create tables with new columns
    self.create_tables()
    # Populate new tables using old column data
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        LOG.info(r"Upgrading %s table", t_name)
        column_names = [col.name for col in self.tables[t_name].columns]
        # IMPROVED: the enumerate() index was never used; iterate rows
        # directly.
        for row in conn.execute(
                r"SELECT " + ",".join(column_names) +
                " FROM " + t_name + "_old"):
            # These tables can be big, so we don't want to queue the items
            # in memory.
            conn.execute(self.tables[t_name].get_insert_stmt(), list(row))
        conn.commit()
    # Drop old tables
    for t_name in [self.TABLE_TASK_STATES, self.TABLE_TASK_EVENTS]:
        conn.execute(r"DROP TABLE " + t_name + "_old")
    conn.commit()
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message.

    Records the job exit in the runtime DB, then either marks the task
    definitively failed or places it in the retrying state if a retry
    timer remains.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    # Persist job exit status (1 = failed) and exit time.
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical("[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    # Reconfigure poll timer/timeout for the task's new state.
    self._reset_job_timers(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Records the submit failure in the runtime DB, then either marks the
    task submit-failed or places it in the submit-retrying state if a
    submission retry timer remains.
    """
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    # BUG FIX: record the (possibly caller-supplied) event_time rather
    # than a second fresh "now" timestamp, so the DB row matches the
    # event that was reported.
    self.suite_db_mgr.put_update_task_jobs(
        itask, {
            "time_submit_exit": event_time,
            "submit_status": 1,
        })
    itask.summary['submit_method_id'] = None
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(itask, self.EVENT_SUBMIT_FAILED,
                                      'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message.

    Persists the job exit to the runtime DB, then either fails the task
    outright or sets it retrying when a retry timer is still available.
    """
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    # run_status 1 == job exited with failure.
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical(
            "[%s] -job(%02d) %s", itask, itask.submit_num, "failed")
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message.

    Persists the submit failure to the runtime DB, then either marks the
    task submit-failed or sets it submit-retrying when a submission retry
    timer remains.
    """
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    # submit_status 1 == submission failed.
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    # NOTE: pflag is set here unconditionally (before branching), unlike
    # some sibling handlers that set it only on definitive failure.
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message.

    Persists the successful job exit, updates the mean elapsed time
    statistic, warns about any incomplete outputs, and resets the task
    state to succeeded.
    """
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    # run_status 0 == job exited successfully.
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        # Report incomplete custom outputs; expired/submit-failed/failed
        # are expected to be incomplete on success, so are excluded.
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [TASK_OUTPUT_EXPIRED,
                              TASK_OUTPUT_SUBMIT_FAILED,
                              TASK_OUTPUT_FAILED]:
                msg += "\n  " + output
        if msg:
            LOG.info(
                "[%s] -Succeeded with outputs not completed: %s",
                itask, msg)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session
    """
    self._check_access_priv(required_privilege_level)
    # Name of the calling command, taken from the caller's stack frame.
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name,
              priv_level, uuid)
    # NOTE(review): "and" binds tighter than "or", so this reads as
    # debug OR (new-client AND log_info) -- presumably intended (debug
    # logs everything regardless of log_info); confirm before adding
    # parentheses.
    if cylc.flags.debug or uuid not in self.clients and log_info:
        LOG.info(self.LOG_COMMAND_TMPL, command, user, host, prog_name,
                 uuid)
    self.clients.setdefault(uuid, {})
    # Record last-seen time per client session for housekeeping.
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    # A key like "[section]leaf" encodes nested sections; extract the
    # section names and keep only the leaf key.
    if "]" in cur_key:
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        # Walk (creating as needed) the nested section dicts to the leaf.
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(
        CHANGE_FMT.strip() % {
            "change": CHANGE_PREFIX_SET,
            "point": point,
            "namespace": namespace,
            "key": key,
            "value": value
        })
def _upgrade_with_state_file_header(self, line):
    """Parse a header line in state file, add information to DB.

    Returns "broadcast" after the last header line ("final cycle") to
    tell the caller the next state-file section has begun; otherwise
    returns None.
    """
    head, tail = line.split(" : ", 1)
    if head == "time":
        # Time stamp line becomes the latest checkpoint row.
        self.add_insert_item(
            self.TABLE_CHECKPOINT_ID,
            {
                "id": self.CHECKPOINT_LATEST_ID,
                "time": tail.split(" ", 1)[0],
                "event": self.CHECKPOINT_LATEST_EVENT
            })
        return
    for name, key in [("run mode", "run_mode"),
                      ("initial cycle", "initial_point"),
                      ("final cycle", "final_point")]:
        # "None" in the state file means a NULL value.
        if tail == "None":
            tail = None
        if head == name:
            self.add_insert_item(self.TABLE_SUITE_PARAMS, {
                "key": key,
                "value": tail
            })
            LOG.info(" + %s=%s", key, tail)
            if name == "final cycle":
                # Last header line; broadcast section follows.
                return "broadcast"
            else:
                return
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message.

    Records the successful job exit in the runtime DB, updates the task
    mean-elapsed-time sample, reports any incomplete custom outputs and
    resets the task state to succeeded.
    """
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    # run_status 0 == success.
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(itask.summary['finished_time'] -
                                        itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        # Expired/submit-failed/failed outputs are legitimately
        # incomplete on success, so skip them.
        for output in itask.state.outputs.get_not_completed():
            if output not in [
                    TASK_OUTPUT_EXPIRED, TASK_OUTPUT_SUBMIT_FAILED,
                    TASK_OUTPUT_FAILED
            ]:
                msg += "\n  " + output
        if msg:
            LOG.info("[%s] -Succeeded with outputs not completed: %s",
                     itask, msg)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def clear_broadcast(self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options(self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info(get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return (modified_settings, bad_options)
def upgrade_from_611(self):
    """Upgrade database on restart with a 6.11.X private database."""
    conn = self.connect()
    # Add hold_swap column task_pool(_checkpoints) tables
    for table in (self.TABLE_TASK_POOL, self.TABLE_TASK_POOL_CHECKPOINTS):
        LOG.info("Add hold_swap column to %s", table)
        statement = r"ALTER TABLE " + table + r" ADD COLUMN hold_swap TEXT"
        conn.execute(statement)
    conn.commit()
def clear_broadcast( self, point_strings=None, namespaces=None, cancel_settings=None): """Clear broadcasts globally, or for listed namespaces and/or points. Return a tuple (modified_settings, bad_options), where: * modified_settings is similar to the return value of the "put" method, but for removed broadcasts. * bad_options is a dict in the form: {"point_strings": ["20020202", ..."], ...} The dict is only populated if there are options not associated with previous broadcasts. The keys can be: * point_strings: a list of bad point strings. * namespaces: a list of bad namespaces. * cancel: a list of tuples. Each tuple contains the keys of a bad setting. """ # If cancel_settings defined, only clear specific broadcasts cancel_keys_list = self._settings_to_keys_list(cancel_settings) # Clear broadcasts modified_settings = [] with self.lock: for point_string, point_string_settings in self.broadcasts.items(): if point_strings and point_string not in point_strings: continue for namespace, namespace_settings in ( point_string_settings.items()): if namespaces and namespace not in namespaces: continue stuff_stack = [([], namespace_settings)] while stuff_stack: keys, stuff = stuff_stack.pop() for key, value in stuff.items(): if isinstance(value, dict): stuff_stack.append((keys + [key], value)) elif (not cancel_keys_list or keys + [key] in cancel_keys_list): stuff[key] = None setting = {key: value} for rkey in reversed(keys): setting = {rkey: setting} modified_settings.append( (point_string, namespace, setting)) # Prune any empty branches bad_options = self._get_bad_options( self._prune(), point_strings, namespaces, cancel_keys_list) # Log the broadcast self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True) LOG.info( get_broadcast_change_report(modified_settings, is_cancel=True)) if bad_options: LOG.error(get_broadcast_bad_options_report(bad_options)) return (modified_settings, bad_options)
def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if not proc_ctx.ret_code:
        # Handler succeeded: just record its context.
        LOG.info(str(proc_ctx))
        return
    msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
    LOG.error(str(proc_ctx))
    LOG.error(msg)
    if abort_on_error:
        raise SuiteEventError(msg)
def satisfy_xclock(self, itask):
    """Attempt to satisfy itask's clock trigger, if it has one.

    If the wall-clock trigger fires, mark the xclock satisfied on the
    task state and remember its signature.
    """
    label, sig, ctx, satisfied = self._get_xclock(itask)
    if satisfied:
        # Already satisfied: nothing to do.
        return
    if wall_clock(*ctx.func_args, **ctx.func_kwargs):
        # IMPROVED: removed dead local assignment "satisfied = True"
        # (the local was never read after this point).
        itask.state.xclock = (label, True)
        self.sat_xclock.append(sig)
        LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
def _authorise(self, *args, user='******', meta=None, **kwargs):
    """Check the client's privilege level before running the command.

    Raises Exception on authorisation failure; otherwise delegates to
    the wrapped function.
    """
    # BUG FIX: meta defaults to None, so guard before .get() calls
    # (matches the sibling implementation of this wrapper).
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')
    usr_priv_level = self.get_priv_level(user)
    if usr_priv_level < req_priv_level:
        # LOG.warn is deprecated; use warning().
        LOG.warning(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info('[client-command] %s %s@%s:%s', fcn.__name__, user, host,
             prog)
    return fcn(self, *args, **kwargs)
def put_broadcast(self, point_strings=None, namespaces=None,
                  settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    # "*" (all points) is not a parsable point but is
                    # still accepted.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        # Merge the setting into the stored broadcasts.
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def test_value_error_raises_system_exit(self, mocked_glbl_cfg):
    """Test that a ValueError raised while writing to a log stream must
    result in a SystemExit rather than cascading exceptions (which could
    lead to an infinite loop on some occasions)."""
    with tempfile.NamedTemporaryFile() as tf:
        # mock objects used when creating the file handler
        mocked = mock.MagicMock()
        mocked_glbl_cfg.return_value = mocked
        mocked.get_derived_host_item.return_value = tf.name
        mocked.get.return_value = 100
        file_handler = TimestampRotatingFileHandler("suiteA", False)
        # next line is important as pytest can have a "Bad file descriptor"
        # due to a FileHandler with default "a" (pytest tries to r/w).
        file_handler.mode = "a+"
        # enable the logger
        LOG.setLevel(logging.INFO)
        LOG.addHandler(file_handler)
        # Disable raising uncaught exceptions in logging, due to file
        # handler using stdin.fileno. See the following links for more.
        # https://github.com/pytest-dev/pytest/issues/2276 &
        # https://github.com/pytest-dev/pytest/issues/1585
        logging.raiseExceptions = False
        # first message will initialize the stream and the handler
        LOG.info("What could go")
        # here we change the stream of the handler
        old_stream = file_handler.stream
        file_handler.stream = mock.MagicMock()
        file_handler.stream.seek = mock.MagicMock()
        # make the next seek on the stream raise ValueError
        file_handler.stream.seek.side_effect = ValueError
        try:
            # next call will call the emit method and use the mocked stream
            LOG.info("wrong?!")
            self.fail("Exception SystemError was not raised")
        except SystemExit:
            pass
        finally:
            # clean up
            file_handler.stream = old_stream
            # for log_handler in LOG.handlers:
            #     log_handler.close()
            file_handler.close()
            LOG.removeHandler(file_handler)
            logging.raiseExceptions = True
def _authorise(self, *args, user='******', meta=None, **kwargs):
    """Check the client's privilege level before running the command.

    Raises Exception on authorisation failure; otherwise delegates to
    the wrapped function (fcn/req_priv_level come from the enclosing
    decorator scope).
    """
    # meta defaults to None; normalise before .get() lookups.
    if not meta:
        meta = {}
    host = meta.get('host', '?')
    prog = meta.get('prog', '?')
    usr_priv_level = self._get_priv_level(user)
    if usr_priv_level < req_priv_level:
        LOG.warn(
            "[client-connect] DENIED (privilege '%s' < '%s') %s@%s:%s",
            usr_priv_level, req_priv_level, user, host, prog)
        raise Exception('Authorisation failure')
    LOG.info(
        '[client-command] %s %s@%s:%s', fcn.__name__, user, host, prog)
    return fcn(self, *args, **kwargs)
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling
    time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if not self.task_events_mgr.check_job_time(itask, now):
            continue
        poll_tasks.add(itask)
        if itask.poll_timer.delay is not None:
            LOG.info('[%s] -poll now, (next in %s)',
                     itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"script": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []
    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    # "*" (all points) is accepted despite not parsing.
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        # Merge this setting into stored broadcasts.
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))
    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))
    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
def upgrade_with_state_file(self, state_file_path):
    """Upgrade database on restart with an old state file.

    Upgrade database from a state file generated by a suite that ran
    with an old cylc version.
    """
    rows = []
    self.select_checkpoint_id(
        lambda row_idx, row: rows.append(row),
        self.CHECKPOINT_LATEST_ID)
    if rows:
        # Latest checkpoint already present: no upgrade needed.
        return
    LOG.info("Upgrading suite db with %s ...", state_file_path)
    self._upgrade_with_state_file_states(state_file_path)
    self._upgrade_with_state_file_extras()
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling
    time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        # check_job_time returns True if the task is due a poll.
        if self.task_events_mgr.check_job_time(itask, now):
            poll_tasks.add(itask)
            if itask.poll_timer.delay is not None:
                LOG.info(
                    '[%s] -poll now, (next in %s)',
                    itask, itask.poll_timer.delay_timeout_as_str())
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def _upgrade_with_state_file_tasks(self, line):
    """Parse a task state line in state file, add information to DB."""
    head, tail = line.split(" : ", 1)
    name, cycle = head.split(".")
    # Parse "key=value, key=value, ..." fields from the tail.
    fields = dict(item.split("=", 1) for item in tail.split(", "))
    status = fields.get("status")
    spawned = None
    if "spawned" in fields:
        spawned = int(fields["spawned"] in ("True", "true"))
    self.add_insert_item(self.TABLE_TASK_POOL, {
        "name": name,
        "cycle": cycle,
        "spawned": spawned,
        "status": status,
        "hold_swap": None})
    LOG.info(" + %s", head)
def _upgrade_with_state_file_states(self, state_file_path):
    """Helper for self.upgrade_with_state_file().

    Populate the new database tables with information from state file.
    """
    location = None
    LOG.info("Populating %s table", self.TABLE_SUITE_PARAMS)
    # IMPROVED: use a context manager so the state file handle is
    # closed deterministically (the original left it to the GC).
    with open(state_file_path) as handle:
        for line in handle:
            line = line.strip()
            if location is None:
                # run mode, time stamp, initial cycle, final cycle
                location = self._upgrade_with_state_file_header(line)
            elif location == "broadcast":
                # Ignore broadcast json in state file.
                # The "broadcast_states" table should already be
                # populated.
                if line == "Begin task states":
                    location = "task states"
                    LOG.info(
                        "Populating %s table", self.TABLE_TASK_POOL)
            else:
                self._upgrade_with_state_file_tasks(line)
    self.execute_queued_items()
def _process_message_submitted(self, itask, event_time):
    """Helper for process_message, handle a submit-succeeded message.

    Records the successful submission in the runtime DB; in simulation
    mode goes straight to running, otherwise resets the task to
    submitted (unless the started message already arrived).
    """
    try:
        LOG.info(
            '[%s] -job[%02d] submitted to %s:%s[%s]',
            itask,
            itask.summary['submit_num'],
            itask.summary['host'],
            itask.summary['batch_sys_name'],
            itask.summary['submit_method_id'])
    except KeyError:
        # Some summary fields may be absent; logging is best-effort.
        pass
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 0,
        "batch_sys_job_id": itask.summary.get('submit_method_id')})
    if itask.tdef.run_mode == 'simulation':
        # Simulate job execution at this point.
        itask.set_summary_time('submitted', event_time)
        itask.set_summary_time('started', event_time)
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.state.outputs.set_completion(TASK_OUTPUT_STARTED, True)
        return
    itask.set_summary_time('submitted', event_time)
    # Unset started and finished times in case of resubmission.
    itask.set_summary_time('started')
    itask.set_summary_time('finished')
    itask.set_summary_message(TASK_OUTPUT_SUBMITTED)
    self.pflag = True
    if itask.state.status == TASK_STATUS_READY:
        # The job started message can (rarely) come in before the submit
        # command returns - in which case do not go back to 'submitted'.
        if itask.state.reset_state(TASK_STATUS_SUBMITTED):
            self.setup_event_handlers(
                itask, TASK_OUTPUT_SUBMITTED, 'job submitted')
        self._reset_job_timers(itask)
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job.

    Appends the context string to the task's job activity log file,
    falling back to the suite log when the job directory is missing.
    """
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            # BUG FIX: the file is opened in binary mode, so the text
            # must be encoded before writing (TypeError otherwise).
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be
        # no job directory. In this case, just send the information to
        # the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job.

    Appends the context string to the task's job activity log file,
    falling back to the suite log when the job directory is missing.
    """
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            # File is opened in binary mode, so encode the text first.
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be
        # no job directory. In this case, just send the information to
        # the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
def verify_triggering(self):
    """Compare triggering info between the new and reference logs.

    Raise LogAnalyserError if either log lacks triggering info or the
    two disagree.
    """
    new = self.get_triggered(self.new_loglines)
    ref = self.get_triggered(self.ref_loglines)
    if not new:
        raise LogAnalyserError(
            "ERROR: new log contains no triggering info.")
    if not ref:
        raise LogAnalyserError(
            "ERROR: reference log contains no triggering info.")
    new.sort()
    ref.sort()
    if new == ref:
        LOG.info(
            "LogAnalyser: triggering is consistent with the reference log")
        return
    diff = unified_diff(new, ref)
    raise LogAnalyserError(
        "ERROR: triggering is NOT consistent with the reference log:" +
        '\n' + '\n'.join(diff) + '\n')
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    # Keys of the form "[section]leaf" encode nested sections.
    if "]" in cur_key:
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        # Descend (creating as needed) to the leaf section dict.
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(CHANGE_FMT.strip() % {
        "change": CHANGE_PREFIX_SET,
        "point": point,
        "namespace": namespace,
        "key": key,
        "value": value})
def _reset_job_timers(self, itask):
    """Set up poll timer and timeout for task.

    For inactive tasks, clears both. For running tasks, builds a poll
    schedule from the execution polling intervals (adjusted around the
    execution time limit if set); for submitted tasks, from the
    submission polling intervals.
    """
    if itask.state.status not in TASK_STATUSES_ACTIVE:
        # Reset, task not active
        itask.timeout = None
        itask.poll_timer = None
        return
    ctx = (itask.submit_num, itask.state.status)
    if itask.poll_timer and itask.poll_timer.ctx == ctx:
        # Timer already set up for this (submit, status) combination.
        return
    # Set poll timer
    # Set timeout
    timeref = None  # reference time, submitted or started time
    timeout = None  # timeout in setting
    if itask.state.status == TASK_STATUS_RUNNING:
        timeref = itask.summary['started_time']
        timeout_key = 'execution timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(self.get_host_conf(
            itask, 'execution polling intervals', skey='job',
            default=[900]))  # Default 15 minute intervals
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
            try:
                host_conf = self.get_host_conf(itask, 'batch systems')
                batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
            except (TypeError, KeyError):
                batch_sys_conf = {}
            time_limit_delays = batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])
            timeout = time_limit + sum(time_limit_delays)
            # Remove excessive polling before time limit
            while sum(delays) > time_limit:
                del delays[-1]
            # But fill up the gap before time limit
            if delays:
                size = int((time_limit - sum(delays)) / delays[-1])
                delays.extend([delays[-1]] * size)
            # Absorb any remainder into the first post-limit delay.
            time_limit_delays[0] += time_limit - sum(delays)
            delays += time_limit_delays
    else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
        timeref = itask.summary['submitted_time']
        timeout_key = 'submission timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(self.get_host_conf(
            itask, 'submission polling intervals', skey='job',
            default=[900]))  # Default 15 minute intervals
    try:
        itask.timeout = timeref + float(timeout)
        timeout_str = intvl_as_str(timeout)
    except (TypeError, ValueError):
        # No usable timeout configured (e.g. None).
        itask.timeout = None
        timeout_str = None
    itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
    # Log timeout and polling schedule
    message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
    # Attempt to group identical consecutive delays as N*DELAY,...
    if itask.poll_timer.delays:
        items = []  # [(number of item - 1, item), ...]
        for delay in itask.poll_timer.delays:
            if items and items[-1][1] == delay:
                items[-1][0] += 1
            else:
                items.append([0, delay])
        message += ', polling intervals='
        for num, item in items:
            if num:
                message += '%d*' % (num + 1)
            message += '%s,' % intvl_as_str(item)
        message += '...'
    LOG.info('[%s] -%s', itask, message)
    # Set next poll time
    self.check_poll_time(itask)
def load_xtrigger_for_restart(self, row_idx, row):
    """Load satisfied xtrigger results from suite DB."""
    if not row_idx:
        # First row only: announce the load.
        LOG.info("LOADING satisfied xtriggers")
    signature, results_json = row
    self.sat_xtrig[signature] = json.loads(results_json)
def upgrade_pickle_to_json(self):
    """Upgrade the database tables if containing pickled objects.

    Back compat for <=7.6.X.
    """
    conn = self.connect()
    t_name = self.TABLE_TASK_ACTION_TIMERS
    if "_pickle" not in self.select_table_schema("table", t_name):
        # Already upgraded: no pickled columns in the schema.
        return
    # Rename old tables
    conn.execute(r"ALTER TABLE %(table)s RENAME TO %(table)s_old" %
                 {"table": t_name})
    conn.commit()
    # Create tables with new columns
    self.create_tables()
    # Populate new tables using old column data
    # Codacy: Pickle library appears to be in use, possible security
    # issue.
    # Use of "pickle" module is for loading data written by <=7.6.X of
    # Cylc in users' own spaces.
    import pickle
    LOG.info(r"Upgrading %s table", t_name)
    cols = []
    for col in self.tables[t_name].columns:
        if col.name in ['ctx_key', 'ctx', 'delays']:
            cols.append(col.name + '_pickle')
        else:
            cols.append(col.name)
    # Codacy: Possible SQL injection vector through string-based query
    # construction.
    # This is highly unlikely - all strings in the construct are from
    # constants in this module.
    # IMPROVED: the enumerate() index was never used; iterate directly.
    for row in conn.execute(
            r"SELECT " + ",".join(cols) + " FROM " + t_name + "_old"):
        args = []
        try:
            for col, cell in zip(cols, row):
                if col == "ctx_pickle":
                    # Upgrade pickled namedtuple objects
                    orig = pickle.loads(str(cell))
                    if orig is not None:
                        args.append(
                            json.dumps([
                                type(orig).__name__,
                                orig.__getnewargs__()
                            ]))
                    else:
                        args.append(json.dumps(orig))
                elif col.endswith("_pickle"):
                    # Upgrade pickled lists
                    args.append(json.dumps(pickle.loads(str(cell))))
                else:
                    args.append(cell)
        except (EOFError, TypeError, LookupError, ValueError):
            # Unreadable row: skip it rather than abort the upgrade.
            pass
        else:
            # These tables can be big, so we don't want to queue the
            # items in memory.
            conn.execute(self.tables[t_name].get_insert_stmt(), args)
    conn.commit()
    # Drop old tables
    conn.execute(r"DROP TABLE %(table)s_old" % {"table": t_name})
    conn.commit()
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file or
    bad remote initialisation will cause a bad task - leading to submission
    failure.

    This method uses prep_submit_task_job() as helper.

    Args:
        suite: suite name (passed through to helpers and commands).
        itasks: iterable of task proxy objects to submit.
        is_simulation (bool): if True, fake the submissions instead.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        # Simulation mode: no real jobs are submitted.
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    if not prepared_tasks:
        return bad_tasks
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        # NOTE(review): `itask` here is whatever the previous loop left
        # bound (not one of this group's tasks); presumably all tasks in
        # a (host, owner) group share a batch system name - confirm.
        if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        # Determine whether the command must run in remote mode, and
        # collect the host/user kwargs for the subprocess context.
        remote_mode = False
        kwargs = {}
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        # chunk_size keeps each batch at roughly <= 100 tasks.
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size] for i in range(
                0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    # Remote mode: job scripts are piped via stdin.
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            # Hand the batch to the subprocess pool; results arrive via
            # self._submit_task_jobs_callback.
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def _reset_job_timers(self, itask):
    """Set up poll timer and timeout for task.

    Clears both when the task is no longer active; otherwise builds a
    polling-interval schedule (and a timeout, if configured) for the
    task's current submit number and status.
    """
    if itask.state.status not in TASK_STATUSES_ACTIVE:
        # Reset, task not active
        itask.timeout = None
        itask.poll_timer = None
        return
    ctx = (itask.submit_num, itask.state.status)
    # Timer already set up for this (submit_num, status): nothing to do.
    if itask.poll_timer and itask.poll_timer.ctx == ctx:
        return
    # Set poll timer
    # Set timeout
    timeref = None  # reference time, submitted or started time
    timeout = None  # timeout in setting
    if itask.state.status == TASK_STATUS_RUNNING:
        timeref = itask.summary['started_time']
        timeout_key = 'execution timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(self.get_host_conf(
            itask, 'execution polling intervals', skey='job',
            default=[900]))  # Default 15 minute intervals
        if itask.summary[self.KEY_EXECUTE_TIME_LIMIT]:
            # Job has an execution time limit: reshape the polling
            # schedule around it, then extend with the post-limit
            # intervals from the batch system configuration.
            time_limit = itask.summary[self.KEY_EXECUTE_TIME_LIMIT]
            try:
                host_conf = self.get_host_conf(itask, 'batch systems')
                batch_sys_conf = host_conf[itask.summary['batch_sys_name']]
            except (TypeError, KeyError):
                batch_sys_conf = {}
            time_limit_delays = batch_sys_conf.get(
                'execution time limit polling intervals', [60, 120, 420])
            timeout = time_limit + sum(time_limit_delays)
            # Remove excessive polling before time limit
            while sum(delays) > time_limit:
                del delays[-1]
            # But fill up the gap before time limit
            if delays:
                size = int((time_limit - sum(delays)) / delays[-1])
                delays.extend([delays[-1]] * size)
            # Stretch the first post-limit delay so the schedule lands
            # exactly on the time limit before the extra polls start.
            time_limit_delays[0] += time_limit - sum(delays)
            delays += time_limit_delays
    else:  # if itask.state.status == TASK_STATUS_SUBMITTED:
        timeref = itask.summary['submitted_time']
        timeout_key = 'submission timeout'
        timeout = self._get_events_conf(itask, timeout_key)
        delays = list(self.get_host_conf(
            itask, 'submission polling intervals', skey='job',
            default=[900]))  # Default 15 minute intervals
    try:
        itask.timeout = timeref + float(timeout)
        timeout_str = intvl_as_str(timeout)
    except (TypeError, ValueError):
        # No (or non-numeric) timeout configured, or no reference time.
        itask.timeout = None
        timeout_str = None
    itask.poll_timer = TaskActionTimer(ctx=ctx, delays=delays)
    # Log timeout and polling schedule
    message = 'health check settings: %s=%s' % (timeout_key, timeout_str)
    # Attempt to group identical consecutive delays as N*DELAY,...
    if itask.poll_timer.delays:
        items = []  # [(number of item - 1, item), ...]
        for delay in itask.poll_timer.delays:
            if items and items[-1][1] == delay:
                items[-1][0] += 1
            else:
                items.append([0, delay])
        message += ', polling intervals='
        for num, item in items:
            if num:
                message += '%d*' % (num + 1)
            message += '%s,' % intvl_as_str(item)
        message += '...'
    LOG.info('[%s] -%s', itask, message)
    # Set next poll time
    self.check_poll_time(itask)
def _run_event_mail_callback(proc_ctx):
    """Report the outcome of the suite-event notification mail command.

    A non-zero return code is logged as a warning; success is logged
    at info level.
    """
    log = LOG.warning if proc_ctx.ret_code else LOG.info
    log(str(proc_ctx))