def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": get_current_time_string(),
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _report_connection_if_denied(self):
    """Log an (un?)successful connection attempt."""
    prog_name, user, host, uuid = _get_client_info()[1:]
    connection_denied = self._get_client_connection_denied()
    if connection_denied:
        LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
            user, host, prog_name, uuid))
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [TASK_OUTPUT_EXPIRED,
                              TASK_OUTPUT_SUBMIT_FAILED,
                              TASK_OUTPUT_FAILED]:
                msg += "\n " + output
        if msg:
            LOG.info("Succeeded with outputs not completed: %s" % msg,
                     itask=itask)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        user_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        user_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        user_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        user_at_host = ""
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = self.task_events_mgr.get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write(user_at_host + line)
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
        LOG.warning(user_at_host + line, itask=itask)
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced -
    don't update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    o_status, o_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    flags.iflag = True
    # Log
    message = str(o_status)
    if o_hold_swap:
        message += " (%s)" % o_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug(message, itask=self.identity)
def check_job_time(self, itask, now):
    """Check/handle job timeout and poll timer"""
    can_poll = self.check_poll_time(itask, now)
    if itask.timeout is None or now <= itask.timeout:
        return can_poll
    # Timeout reached for task, emit event and reset itask.timeout
    if itask.state.status == TASK_STATUS_RUNNING:
        time_ref = itask.summary['started_time']
        event = 'execution timeout'
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        time_ref = itask.summary['submitted_time']
        event = 'submission timeout'
    msg = event
    try:
        msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
    except (TypeError, ValueError):
        # Badness in time_ref?
        pass
    itask.timeout = None  # emit event only once
    if msg and event:
        LOG.warning(msg, itask=itask)
        self.setup_event_handlers(itask, event, msg)
        return True
    else:
        return can_poll
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    to_poll_tasks = []
    pollable_statuses = set([
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    for itask in itasks:
        if itask.state.status in pollable_statuses:
            to_poll_tasks.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if to_poll_tasks:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, to_poll_tasks,
            self._poll_task_jobs_callback)
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    itask.state.reset_state(TASK_STATUS_RUNNING)
    itask.set_summary_time('started', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})
    if itask.summary['execution_time_limit']:
        execution_timeout = itask.summary['execution_time_limit']
    else:
        execution_timeout = self._get_events_conf(
            itask, 'execution timeout')
    try:
        itask.timeout_timers[TASK_STATUS_RUNNING] = (
            itask.summary['started_time'] + float(execution_timeout))
    except (TypeError, ValueError):
        itask.timeout_timers[TASK_STATUS_RUNNING] = None

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
    self.setup_event_handlers(itask, 'started', 'job started')
    self.set_poll_time(itask)
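# --- Standalone sketch (not part of the Cylc source above) ---
# Illustrates the timeout bookkeeping used by _process_message_started:
# the execution timeout is stored as an absolute epoch time (start time
# plus a limit in seconds), so a later check only needs "now > timeout".
# The helper name below is local to this sketch, not a Cylc API.
import time


def compute_execution_timeout(started_time, execution_timeout):
    """Return started_time + execution_timeout, or None if unset/bad."""
    try:
        return started_time + float(execution_timeout)
    except (TypeError, ValueError):
        # No limit configured, or a value that cannot be parsed.
        return None


if __name__ == '__main__':
    started = time.time()
    print(compute_execution_timeout(started, "300"))  # started + 300 seconds
    print(compute_execution_timeout(started, None))   # None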
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced -
    don't update task_events table)."""
    if self.status == self.hold_swap:
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        return
    o_status, o_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        self.hold_swap = self.status
    elif status in TASK_STATUSES_ACTIVE:
        if self.status == TASK_STATUS_HELD:
            self.hold_swap = TASK_STATUS_HELD
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    flags.iflag = True
    # Log
    message = str(o_status)
    if o_hold_swap:
        message += " (%s)" % o_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug(message, itask=self.identity)
def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't
    poll succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    poll_me = []
    pollable = [
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED]
    for itask in itasks:
        if itask.state.status in pollable or (
                itask.state.status == TASK_STATUS_SUCCEEDED and poll_succ):
            poll_me.append(itask)
        else:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
    if poll_me:
        if msg is not None:
            LOG.info(msg)
        self._run_job_cmd(
            self.JOBS_POLL, suite, poll_me,
            self._poll_task_jobs_callback)
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical("job(%02d) %s" % (itask.submit_num, "failed"),
                     itask=itask)
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _execute_stmt(self, stmt, stmt_args_list):
    """Helper for "self.execute_queued_items".

    Execute a statement. If this is the public database, return True on
    success and False on failure. If this is the private database, return
    True on success, and raise on failure.
    """
    try:
        self.connect()
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        if not self.is_public:
            raise
        if cylc.flags.debug:
            traceback.print_exc()
        err_log = ("cannot execute database statement:\n"
                   "file=%(file)s:\nstmt=%(stmt)s") % {
            "file": self.db_file_name, "stmt": stmt}
        for i, stmt_args in enumerate(stmt_args_list):
            err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                "i": i, "stmt_args": stmt_args})
        LOG.warning(err_log)
        raise
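# --- Standalone sketch (not part of the Cylc source above) ---
# Shows the batching pattern behind _execute_stmt/execute_queued_items:
# queue (stmt, args) rows per template statement, then flush each batch
# with a single executemany() call and one commit. Table and column names
# here are made up for illustration.
import sqlite3
from collections import defaultdict


def flush_queues(conn, queues):
    """Execute each queued template statement with all its argument rows."""
    for stmt, stmt_args_list in queues.items():
        conn.executemany(stmt, stmt_args_list)
    conn.commit()


if __name__ == '__main__':
    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE task_jobs (name TEXT, run_status INTEGER)')
    queues = defaultdict(list)
    queues['INSERT INTO task_jobs VALUES (?, ?)'].append(('foo', 0))
    queues['INSERT INTO task_jobs VALUES (?, ?)'].append(('bar', 1))
    flush_queues(conn, queues)
    print(conn.execute('SELECT * FROM task_jobs').fetchall())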
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    itask.state.reset_state(TASK_STATUS_RUNNING)
    itask.set_event_time('started', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})
    if itask.summary['execution_time_limit']:
        execution_timeout = itask.summary['execution_time_limit']
    else:
        execution_timeout = self._get_events_conf(
            itask, 'execution timeout')
    try:
        itask.timeout_timers[TASK_STATUS_RUNNING] = (
            itask.summary['started_time'] + float(execution_timeout))
    except (TypeError, ValueError):
        itask.timeout_timers[TASK_STATUS_RUNNING] = None

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
    self.setup_event_handlers(itask, 'started', 'job started')
    self.set_poll_time(itask)
def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error(self.EVENT_SUBMIT_FAILED, itask=itask)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": get_current_time_string(),
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        self.pflag = True
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)
def report(self, request, server_obj):
    """Log client requests with identifying information.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info request from each client.
    """
    if threading.current_thread().__class__.__name__ == '_MainThread':
        # Server methods may be called internally as well as by clients.
        return
    auth_user, prog_name, user, host, uuid, priv_level = get_client_info()
    name = server_obj.__class__.__name__
    log_me = (
        cylc.flags.debug or
        name in ["SuiteCommandServer",
                 "ExtTriggerServer",
                 "BroadcastServer"] or
        (name not in ["SuiteIdServer", "TaskMessageServer"] and
         uuid not in self.clients))
    if log_me:
        LOG.debug(
            self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
                user, host, prog_name, priv_level, uuid)
        )
        LOG.info(
            self.__class__.LOG_COMMAND_TMPL % (
                request, user, host, prog_name, uuid))
    if name == "SuiteIdServer":
        self._num_id_requests += 1
        self.report_id_requests()
    self.clients[uuid] = datetime.datetime.utcnow()
    self._housekeep()
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session

    """
    self._check_access_priv(required_privilege_level)
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.__class__.LOG_CONNECT_ALLOWED_TMPL % (
        user, host, prog_name, priv_level, uuid))
    if cylc.flags.debug or uuid not in self.clients and log_info:
        LOG.info(self.__class__.LOG_COMMAND_TMPL % (
            command, user, host, prog_name, uuid))
    self.clients.setdefault(uuid, {})
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def _process_message_succeeded(self, itask, event_time):
    """Helper for process_message, handle a succeeded message."""
    self.pflag = True
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 0,
        "time_run_exit": event_time,
    })
    # Update mean elapsed time only on task succeeded.
    if itask.summary['started_time'] is not None:
        itask.tdef.elapsed_times.append(
            itask.summary['finished_time'] -
            itask.summary['started_time'])
    if not itask.state.outputs.all_completed():
        msg = ""
        for output in itask.state.outputs.get_not_completed():
            if output not in [TASK_OUTPUT_EXPIRED,
                              TASK_OUTPUT_SUBMIT_FAILED,
                              TASK_OUTPUT_FAILED]:
                msg += "\n " + output
        if msg:
            LOG.info("Succeeded with outputs not completed: %s" % msg,
                     itask=itask)
    if itask.state.reset_state(TASK_STATUS_SUCCEEDED):
        self.setup_event_handlers(itask, "succeeded", "job succeeded")
    self._reset_job_timers(itask)
def _process_message_failed(self, itask, event_time, message):
    """Helper for process_message, handle a failed message."""
    if event_time is None:
        event_time = get_current_time_string()
    itask.set_summary_time('finished', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "run_status": 1,
        "time_run_exit": event_time,
    })
    if (TASK_STATUS_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_RETRYING].next() is None):
        # No retry lined up: definitive failure.
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_FAILED):
            self.setup_event_handlers(itask, "failed", message)
        LOG.critical("job(%02d) %s" % (
            itask.submit_num, "failed"), itask=itask)
    else:
        # There is a retry lined up
        delay_msg = "retrying in %s" % (
            itask.try_timers[TASK_STATUS_RETRYING].delay_timeout_as_str())
        msg = "failed, %s" % (delay_msg)
        LOG.info("job(%02d) %s" % (itask.submit_num, msg), itask=itask)
        itask.summary['latest_message'] = msg
        if itask.state.reset_state(TASK_STATUS_RETRYING):
            self.setup_event_handlers(
                itask, "retry", "%s, %s" % (self.JOB_FAILED, delay_msg))
    self._reset_job_timers(itask)
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.

    """
    if itask.local_job_file_path and not dry_run:
        return itask
    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except Exception as exc:
        # Could be a bad command template.
        ERR.error(traceback.format_exc())
        LOG.error(traceback.format_exc())
        self.task_events_mgr.log_task_job_activity(
            SuiteProcContext(
                self.JOBS_SUBMIT,
                '(prepare job file)', err=exc, ret_code=1),
            suite, itask.point, itask.tdef.name)
        if not dry_run:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
        return
def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        owner_at_host = ""
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = self.task_events_mgr.get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write(owner_at_host + line)
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
        LOG.warning(owner_at_host + line, itask=itask)
def load_db_broadcast_states(self, row_idx, row):
    """Load broadcast variables from runtime DB broadcast states row."""
    if row_idx == 0:
        LOG.info("LOADING broadcast states")
    point, namespace, key, value = row
    sections = []
    cur_key = key
    if "]" in cur_key:
        sections = self.REC_SECTION.findall(cur_key)
        cur_key = cur_key.rsplit(r"]", 1)[-1]
    with self.lock:
        self.broadcasts.setdefault(point, {})
        self.broadcasts[point].setdefault(namespace, {})
        dict_ = self.broadcasts[point][namespace]
        for section in sections:
            dict_.setdefault(section, {})
            dict_ = dict_[section]
        dict_[cur_key] = value
    LOG.info(CHANGE_FMT.strip() % {
        "change": CHANGE_PREFIX_SET,
        "point": point,
        "namespace": namespace,
        "key": key,
        "value": value})
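# --- Standalone sketch (not part of the Cylc source above) ---
# Illustrates how a broadcast DB key such as "[environment]FOO" could be
# split into its section names plus the bare setting key, as done with
# REC_SECTION and rsplit in load_db_broadcast_states. The regex below is
# an assumed stand-in, not the actual Cylc REC_SECTION definition.
import re

REC_SECTION_GUESS = re.compile(r"\[([^\]]*)\]")


def split_broadcast_key(key):
    """Return (sections, bare_key) for a key like "[a][b]c"."""
    sections = []
    cur_key = key
    if "]" in cur_key:
        sections = REC_SECTION_GUESS.findall(cur_key)
        cur_key = cur_key.rsplit("]", 1)[-1]
    return sections, cur_key


if __name__ == '__main__':
    print(split_broadcast_key("[environment]FOO"))  # (['environment'], 'FOO')
    print(split_broadcast_key("script"))            # ([], 'script')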
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.

    """
    if itask.local_job_file_path and not dry_run:
        return itask
    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except Exception as exc:
        # Could be a bad command template.
        LOG.error(traceback.format_exc())
        self.task_events_mgr.log_task_job_activity(
            SuiteProcContext(
                self.JOBS_SUBMIT,
                '(prepare job file)', err=exc, ret_code=1),
            suite, itask.point, itask.tdef.name)
        if not dry_run:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                self.poll_task_jobs)
        return
def _remote_init_items(self, host, owner):
    """Return list of items that should be installed on task remote.

    Each item is (path, name), where name is relative path under suite
    run directory.
    """
    items = []
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    LOG.debug('comm_meth=%s' % comm_meth)
    if comm_meth in ['ssh', 'http', 'https']:
        # Contact file
        items.append(
            (self.suite_srv_files_mgr.get_contact_file(self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_CONTACT)))
    if comm_meth in ['http', 'https']:
        # Passphrase file
        items.append(
            (self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE,
                self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_PASSPHRASE)))
    if comm_meth in ['https']:
        # SSL cert file
        items.append(
            (self.suite_srv_files_mgr.get_auth_item(
                self.suite_srv_files_mgr.FILE_BASE_SSL_CERT,
                self.suite),
             os.path.join(
                 self.suite_srv_files_mgr.DIR_BASE_SRV,
                 self.suite_srv_files_mgr.FILE_BASE_SSL_CERT)))
    return items
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner)))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    (host, owner), ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY,
                (host, owner), ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))
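# --- Standalone sketch (not part of the Cylc source above) ---
# Mirrors the shutdown pattern in remote_tidy: start all commands in
# parallel, poll them until an overall deadline, then terminate anything
# still running. The commands and helper name here are placeholders.
import os
import sys
from subprocess import PIPE, Popen
from time import sleep, time


def run_all_with_deadline(cmds, deadline_secs=10.0):
    """Run commands in parallel; return {index: (returncode, out, err)}."""
    procs = {}
    for i, cmd in enumerate(cmds):
        procs[i] = Popen(
            cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull))
    results = {}
    deadline = time() + deadline_secs
    while procs and time() < deadline:
        for i, proc in list(procs.items()):
            if proc.poll() is None:
                continue
            del procs[i]
            out, err = proc.communicate()
            results[i] = (proc.returncode, out, err)
        sleep(0.1)
    # Terminate any command that missed the deadline.
    for i, proc in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        results[i] = (proc.returncode, out, err)
    return results


if __name__ == '__main__':
    cmds = [[sys.executable, '-c', 'print("hello")'],
            [sys.executable, '-c', 'import time; time.sleep(60)']]
    for i, (code, out, err) in sorted(run_all_with_deadline(cmds, 2.0).items()):
        print(i, code, out.strip())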
def __init__(self, pool_size=None):
    self.pool_size = (
        pool_size or
        GLOBAL_CFG.get(["process pool size"]) or
        multiprocessing.cpu_count())
    # (The Pool class defaults to cpu_count anyway, but does not
    # expose the result via its public interface).
    LOG.debug("Initializing process pool, size %d" % self.pool_size)
    self.pool = multiprocessing.Pool(processes=self.pool_size)
    self.results = {}
def clear_broadcast(
        self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear broadcasts globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:
    * modified_settings is similar to the return value of the "put"
      method, but for removed broadcasts.
    * bad_options is a dict in the form:
          {"point_strings": ["20020202", ..."], ...}
      The dict is only populated if there are options not associated with
      previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # If cancel_settings defined, only clear specific broadcasts
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)

    # Clear broadcasts
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.broadcasts.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list or
                                keys + [key] in cancel_keys_list):
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))

    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    return (modified_settings, bad_options)
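# --- Standalone sketch (not part of the Cylc source above) ---
# Demonstrates the stack-based traversal used in clear_broadcast: walk a
# nested settings dict without recursion, null out matching leaves, and
# rebuild each removed leaf as a nested single-key dict so it can be
# reported. The example data is made up.
def clear_matching_leaves(namespace_settings, cancel_keys_list=None):
    """Null matching leaves; return the removed settings as nested dicts."""
    removed = []
    stuff_stack = [([], namespace_settings)]
    while stuff_stack:
        keys, stuff = stuff_stack.pop()
        for key, value in list(stuff.items()):
            if isinstance(value, dict):
                stuff_stack.append((keys + [key], value))
            elif not cancel_keys_list or keys + [key] in cancel_keys_list:
                stuff[key] = None
                setting = {key: value}
                for rkey in reversed(keys):
                    setting = {rkey: setting}
                removed.append(setting)
    return removed


if __name__ == '__main__':
    settings = {'script': 'true', 'environment': {'FOO': '1', 'BAR': '2'}}
    print(clear_matching_leaves(settings, [['environment', 'FOO']]))
    # -> [{'environment': {'FOO': '1'}}]; the FOO leaf is now None:
    print(settings)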
def recover_pub_from_pri(self):
    """Recover public database from private database."""
    if self.pub_dao.n_tries >= self.pub_dao.MAX_TRIES:
        self.copy_pri_to_pub()
        LOG.warning(
            "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
                "pub_db_name": self.pub_dao.db_file_name,
                "pri_db_name": self.pri_dao.db_file_name})
        self.pub_dao.n_tries = 0
def _housekeep(self):
    """Forget inactive clients."""
    for uuid, client_info in self.clients.copy().items():
        if time() - client_info['time'] > self.CLIENT_FORGET_SEC:
            try:
                del self.clients[uuid]
            except KeyError:
                pass
            LOG.debug(self.LOG_FORGET_TMPL % uuid)
def join(self):
    """Join after workers have exited. Close or terminate first."""
    LOG.debug("Joining process pool")
    try:
        self.pool.join()
    except AssertionError:
        # multiprocessing.Pool.join may raise this error. We want to
        # ignore this so suite shutdown can continue.
        pass
def put_command(self, ctx, callback, callback_args=None):
    """Queue a new shell command to execute."""
    try:
        result = self.pool.apply_async(_run_command, [ctx])
    except AssertionError as exc:
        LOG.warning("%s\n %s\n %s" % (
            str(exc),
            "Rejecting command (pool closed)",
            ctx.cmd))
    else:
        self.results[id(result)] = (result, callback, callback_args)
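# --- Standalone sketch (not part of the Cylc source above) ---
# Shows the bookkeeping pattern behind put_command: submit work with
# apply_async, keep the AsyncResult keyed by id() alongside its callback,
# and later invoke callbacks for whatever has finished. The worker
# function and MiniPool class are placeholders, not Cylc APIs.
import multiprocessing


def run_command_stub(cmd):
    """Placeholder worker: pretend to run a command and return a result."""
    return "ran: %s" % cmd


def report(result):
    print(result)


class MiniPool(object):
    def __init__(self, size=2):
        self.pool = multiprocessing.Pool(processes=size)
        self.results = {}

    def put_command(self, cmd, callback):
        try:
            result = self.pool.apply_async(run_command_stub, [cmd])
        except AssertionError as exc:
            print("Rejecting command (pool closed): %s (%s)" % (cmd, exc))
        else:
            self.results[id(result)] = (result, callback)

    def process(self):
        """Invoke callbacks for any finished commands."""
        for key, (result, callback) in list(self.results.items()):
            if result.ready():
                callback(result.get())
                del self.results[key]


if __name__ == '__main__':
    mini = MiniPool()
    mini.put_command("echo hello", callback=report)
    mini.pool.close()
    mini.pool.join()
    mini.process()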
def _prep_submit_task_job(self, suite, itask, dry_run):
    """Prepare a task job submission.

    Return itask on a good preparation.

    """
    if itask.local_job_file_path and not dry_run:
        return itask

    # Handle broadcasts
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides, prepend=True)
    else:
        rtconfig = itask.tdef.rtconfig

    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    try:
        task_host = self.task_remote_mgr.remote_host_select(
            rtconfig['remote']['host'])
    except TaskRemoteMgmtError as exc:
        # Submit number not yet incremented
        itask.submit_num += 1
        itask.summary['submit_num'] = itask.submit_num
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(remote host select)', exc)
        return False
    else:
        if task_host is None:  # host select not ready
            itask.summary['latest_message'] = self.REMOTE_SELECT_MSG
            return
        itask.task_host = task_host

    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
        local_job_file_path = self.task_events_mgr.get_task_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num,
            self.JOB_FILE_BASE)
        self.job_file_writer.write(local_job_file_path, job_conf)
    except StandardError as exc:
        # Could be a bad command template, IOError, etc
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(prepare job file)', exc)
        return False
    itask.local_job_file_path = local_job_file_path

    if dry_run:
        # This will be shown next to submit num in gcylc:
        itask.summary['latest_message'] = 'job file written (edit/dry-run)'
        LOG.debug(itask.summary['latest_message'], itask=itask)

    # Return value used by "cylc submit" and "cylc jobscript":
    return itask
def execute_queued_items(self):
    """Execute queued items for each table."""
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so
            # we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany"
            # call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def _housekeep(self):
    """Forget inactive clients."""
    for uuid in self.clients.keys():
        dtime = self.clients[uuid]
        if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                self.__class__.CLIENT_FORGET_SEC):
            del self.clients[uuid]
            LOG.debug(
                self.__class__.LOG_FORGET_TMPL % uuid)
def _run_event_handlers_callback(self, proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        ERR.error(msg)
        if abort_on_error:
            raise SuiteEventError(msg)
    else:
        LOG.info(str(proc_ctx))
def signout(self):
    """Forget client, where possible."""
    uuid = _get_client_info()[4]
    try:
        del self.clients[uuid]
    except KeyError:
        return False
    else:
        LOG.debug(self.LOG_FORGET_TMPL % uuid)
        return True
def satisfy_xclock(self, itask):
    """Attempt to satisfy itask's clock trigger, if it has one."""
    label, sig, ctx, satisfied = self._get_xclock(itask)
    if satisfied:
        return
    if wall_clock(*ctx.func_args, **ctx.func_kwargs):
        satisfied = True
        itask.state.xclock = (label, True)
        self.sat_xclock.append(sig)
        LOG.info('clock xtrigger satisfied: %s = %s' % (label, str(ctx)))
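# --- Standalone sketch (not part of the Cylc source above) ---
# A minimal stand-in for the wall_clock function used by satisfy_xclock:
# a clock trigger is satisfied once the wall clock passes its trigger
# time. The signature here is assumed for illustration only and does not
# match the real Cylc wall_clock xtrigger.
from time import time


def wall_clock_stub(trigger_time):
    """Return True if the wall clock has reached trigger_time (epoch secs)."""
    return time() >= trigger_time


if __name__ == '__main__':
    print(wall_clock_stub(0))            # True: epoch 0 is long past
    print(wall_clock_stub(time() + 60))  # False: one minute in the future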
def __init__(self, pool_size=None):
    self.pool_size = (
        pool_size or
        GLOBAL_CFG.get(["process pool size"]) or
        multiprocessing.cpu_count())
    # (The Pool class defaults to cpu_count anyway, but does not
    # expose the result via its public interface).
    LOG.debug(
        "Initializing process pool, size %d" % self.pool_size)
    self.pool = multiprocessing.Pool(processes=self.pool_size)
    self.results = {}
def put_command(self, ctx, callback, callback_args=None):
    """Queue a new shell command to execute."""
    try:
        result = self.pool.apply_async(_run_command, [ctx])
    except AssertionError as exc:
        LOG.warning("%s\n %s\n %s" % (
            str(exc),
            "Rejecting command (pool closed)",
            ctx.cmd))
    else:
        self.results[id(result)] = (result, callback, callback_args)
def execute_queued_items(self):
    """Execute queued items for each table."""
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so
            # we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany"
            # call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()
def poll_task_jobs(self, suite, itasks, warn_skips=False):
    """Poll jobs of specified tasks."""
    active_itasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            active_itasks.append(itask)
        elif warn_skips:  # and not active
            LOG.warning(
                '%s: skip poll, task not pollable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_POLL, suite, active_itasks,
        self._poll_task_jobs_callback)
def _housekeep(self):
    """Forget inactive clients."""
    for uuid, dtime in self.clients.copy().items():
        if (self._total_seconds(datetime.datetime.utcnow() - dtime) >
                self.__class__.CLIENT_FORGET_SEC):
            try:
                del self.clients[uuid]
            except KeyError:
                pass
            LOG.debug(
                self.__class__.LOG_FORGET_TMPL % uuid)
def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handler' % ctx.event)
    if not handlers and (ctx.event in self.get_events_conf(
            config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        abort_on_error = self.get_events_conf(
            config, 'abort if %s handler fails' % ctx.event)
        try:
            handler_data = {
                'event': quote(ctx.event),
                'suite': quote(ctx.suite),
                'message': quote(ctx.reason),
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            if abort_on_error:
                raise SuiteEventError(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.suite, ctx.reason)
        proc_ctx = SuiteProcContext(
            cmd_key, cmd, env=dict(os.environ), shell=True)
        if abort_on_error or self.proc_pool.is_closed():
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(
                proc_ctx, abort_on_error=abort_on_error)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_handlers_callback)
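# --- Standalone sketch (not part of the Cylc source above) ---
# Shows the handler-command substitution logic from
# _run_event_custom_handlers: %-substitute quoted event data into the
# handler template, fall back to the classic "handler 'event' 'suite'
# 'message'" form when nothing substitutes, and treat unknown keys as a
# bad template. The handler strings below are examples only.
try:
    from shlex import quote  # Python 3
except ImportError:
    from pipes import quote  # Python 2


def build_handler_cmd(handler, event, suite, reason):
    """Return the shell command for one event handler, or None if bad."""
    handler_data = {
        'event': quote(event),
        'suite': quote(suite),
        'message': quote(reason),
    }
    try:
        cmd = handler % handler_data
    except KeyError as exc:
        print("bad template: %s" % exc)
        return None
    if cmd == handler:
        # Nothing substituted, assume classic interface.
        cmd = "%s '%s' '%s' '%s'" % (handler, event, suite, reason)
    return cmd


if __name__ == '__main__':
    print(build_handler_cmd(
        "notify.sh %(event)s %(suite)s", "shutdown", "my.suite", "done"))
    print(build_handler_cmd("notify.sh", "shutdown", "my.suite", "done"))
    print(build_handler_cmd("bad %(nope)s", "shutdown", "my.suite", "done"))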
def signout(self, server_obj):
    """Force forget this client (for use by GUI etc.)."""
    caller = server_obj.getLocalStorage().caller
    LOG.info(
        self.__class__.LOG_SIGNOUT_TMPL % (
            caller.user, caller.host, caller.prog_name, caller.uuid))
    try:
        del self.clients[caller.uuid]
    except KeyError:
        # Already forgotten.
        pass
    self._housekeep()
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status
        LOG.debug(proc_ctx)
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
            proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
def signout(self, server_obj):
    """Force forget this client (for use by GUI etc.)."""
    caller = server_obj.getLocalStorage().caller
    LOG.info(
        self.__class__.LOG_SIGNOUT_TMPL % (
            caller.user, caller.host, caller.prog_name, caller.uuid))
    try:
        del self.clients[caller.uuid]
    except KeyError:
        # Already forgotten.
        pass
    self._housekeep()
def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handler' % ctx.event)
    if not handlers and (
            ctx.event in
            self.get_events_conf(config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        abort_on_error = self.get_events_conf(
            config, 'abort if %s handler fails' % ctx.event)
        try:
            handler_data = {
                'event': quote(ctx.event),
                'suite': quote(ctx.suite),
                'message': quote(ctx.reason),
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            if abort_on_error:
                raise SuiteEventError(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.suite, ctx.reason)
        proc_ctx = SuiteProcContext(
            cmd_key, cmd, env=dict(os.environ), shell=True)
        if abort_on_error or self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(
                proc_ctx, abort_on_error=abort_on_error)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_handlers_callback)
def put_broadcast(
        self, point_strings=None, namespaces=None, settings=None):
    """Add new broadcast settings (server side interface).

    Return a tuple (modified_settings, bad_options) where:
      modified_settings is list of modified settings in the form:
        [("20200202", "foo", {"command scripting": "true"}, ...]
      bad_options is as described in the docstring for self.clear().
    """
    modified_settings = []
    bad_point_strings = []
    bad_namespaces = []

    with self.lock:
        for setting in settings:
            for point_string in point_strings:
                # Standardise the point and check its validity.
                bad_point = False
                try:
                    point_string = standardise_point_string(point_string)
                except PointParsingError:
                    if point_string != '*':
                        bad_point_strings.append(point_string)
                        bad_point = True
                if not bad_point and point_string not in self.broadcasts:
                    self.broadcasts[point_string] = {}
                for namespace in namespaces:
                    if namespace not in self.linearized_ancestors:
                        bad_namespaces.append(namespace)
                    elif not bad_point:
                        if namespace not in self.broadcasts[point_string]:
                            self.broadcasts[point_string][namespace] = {}
                        self._addict(
                            self.broadcasts[point_string][namespace],
                            setting)
                        modified_settings.append(
                            (point_string, namespace, setting))

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings)
    LOG.info(get_broadcast_change_report(modified_settings))

    bad_options = {}
    if bad_point_strings:
        bad_options["point_strings"] = bad_point_strings
    if bad_namespaces:
        bad_options["namespaces"] = bad_namespaces
    return modified_settings, bad_options
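# --- Standalone sketch (not part of the Cylc source above) ---
# A plausible recursive merge in the spirit of the _addict call in
# put_broadcast: nested sections from the incoming setting are merged
# into the stored broadcast dict, and leaf values overwrite existing
# ones. This is an assumed illustration, not the actual Cylc _addict.
def addict_merge(target, source):
    """Recursively merge the "source" setting dict into "target"."""
    for key, value in source.items():
        if isinstance(value, dict):
            target.setdefault(key, {})
            addict_merge(target[key], value)
        else:
            target[key] = value


if __name__ == '__main__':
    broadcast = {'environment': {'FOO': '1'}}
    addict_merge(broadcast, {'environment': {'BAR': '2'}, 'script': 'true'})
    print(broadcast)
    # {'environment': {'FOO': '1', 'BAR': '2'}, 'script': 'true'}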
def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.

    """
    to_kill_tasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            itask.state.set_held()
            to_kill_tasks.append(itask)
        else:
            LOG.warning('skipping %s: task not killable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_KILL, suite, to_kill_tasks,
        self._kill_task_jobs_callback)
def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    if itask.state.reset_state(TASK_STATUS_RUNNING):
        self.setup_event_handlers(itask, 'started', 'job started')
    itask.set_summary_time('started', event_time)
    self._reset_job_timers(itask)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
def check_task_jobs(self, suite, task_pool):
    """Check submission and execution timeout and polling timers.

    Poll tasks that have timed out and/or have reached next polling time.
    """
    now = time()
    poll_tasks = set()
    for itask in task_pool.get_tasks():
        if self.task_events_mgr.check_job_time(itask, now):
            poll_tasks.add(itask)
            if itask.poll_timer.delay is not None:
                LOG.info(
                    'poll now, (next in %s)' % (
                        itask.poll_timer.delay_timeout_as_str()),
                    itask=itask)
    if poll_tasks:
        self.poll_task_jobs(suite, poll_tasks)
def report_connection_if_denied(self):
    """Log an (un?)successful connection attempt."""
    try:
        (auth_user, prog_name, user, host, uuid,
         priv_level) = get_client_info()
    except Exception:
        LOG.warning(
            self.__class__.LOG_CONNECT_DENIED_TMPL % (
                "unknown", "unknown", "unknown", "unknown")
        )
        return
    connection_denied = get_client_connection_denied()
    if connection_denied:
        LOG.warning(
            self.__class__.LOG_CONNECT_DENIED_TMPL % (
                user, host, prog_name, uuid)
        )
def kill_task_jobs(self, suite, itasks, warn_skips=False):
    """Kill jobs of active tasks, and hold the tasks.

    If items is specified, kill active tasks matching given IDs.

    """
    active_itasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            itask.state.set_held()
            active_itasks.append(itask)
        elif warn_skips:  # and not active
            LOG.warning(
                '%s: skip kill, task not killable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_KILL, suite, active_itasks,
        self._kill_task_jobs_callback)