def _process_message_submit_failed(self, itask, event_time):
    """Helper for process_message, handle a submit-failed message."""
    LOG.error('[%s] -%s', itask, self.EVENT_SUBMIT_FAILED)
    if event_time is None:
        event_time = get_current_time_string()
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_submit_exit": event_time,
        "submit_status": 1,
    })
    itask.summary['submit_method_id'] = None
    self.pflag = True
    if (TASK_STATUS_SUBMIT_RETRYING not in itask.try_timers or
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].next() is None):
        # No submission retry lined up: definitive failure.
        # See github #476.
        if itask.state.reset_state(TASK_STATUS_SUBMIT_FAILED):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_FAILED,
                'job %s' % self.EVENT_SUBMIT_FAILED)
    else:
        # There is a submission retry lined up.
        timer = itask.try_timers[TASK_STATUS_SUBMIT_RETRYING]
        delay_msg = "submit-retrying in %s" % timer.delay_timeout_as_str()
        msg = "%s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg)
        LOG.info("[%s] -job(%02d) %s", itask, itask.submit_num, msg)
        itask.set_summary_message(msg)
        if itask.state.reset_state(TASK_STATUS_SUBMIT_RETRYING):
            self.setup_event_handlers(
                itask, self.EVENT_SUBMIT_RETRY,
                "job %s, %s" % (self.EVENT_SUBMIT_FAILED, delay_msg))
    self._reset_job_timers(itask)

def load_config(self):
    """Load the suite config."""
    if self.suiterc:
        is_reload = True
        collapsed = self.suiterc.closed_families
    else:
        is_reload = False
        collapsed = []
    try:
        self.suiterc = SuiteConfig(
            self.suite, self.file, self.template_vars,
            is_reload=is_reload,
            collapsed=collapsed,
            cli_initial_point_string=self.start_point_string,
            vis_start_string=self.start_point_string,
            vis_stop_string=self.stop_point_string)
    except Exception as exc:
        msg = "Failed - parsing error?\n\n%s" % exc
        LOG.error(msg)
        if self.interactive:
            dia = gtk.MessageDialog(
                type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK,
                message_format=msg)
            dia.run()
            dia.destroy()
            return False
        sys.exit(1)
    self.inherit = self.suiterc.get_parent_lists()
    return True

def list_suites(self, regfilter=None):
    """Return a filtered list of valid suite registrations."""
    rec_regfilter = None
    if regfilter:
        try:
            rec_regfilter = re.compile(regfilter)
        except re.error as exc:
            raise ValueError("%s: %s" % (regfilter, exc))
    run_d = glbl_cfg().get_host_item('run directory')
    results = []
    for dirpath, dnames, fnames in os.walk(run_d, followlinks=True):
        # Always descend for top directory, but
        # don't descend further if it has a:
        # * .service/
        # * cylc-suite.db: (pre-cylc-7 suites don't have ".service/").
        if dirpath != run_d and (
                self.DIR_BASE_SRV in dnames or "cylc-suite.db" in fnames):
            dnames[:] = []
        # Choose only suites with .service and matching filter
        reg = os.path.relpath(dirpath, run_d)
        path = os.path.join(dirpath, self.DIR_BASE_SRV)
        if (not self._locate_item(self.FILE_BASE_SOURCE, path) or
                rec_regfilter and not rec_regfilter.search(reg)):
            continue
        try:
            results.append([
                reg,
                self.get_suite_source_dir(reg),
                self.get_suite_title(reg)])
        except (IOError, SuiteServiceFileError) as exc:
            LOG.error('%s: %s', reg, exc)
    return results

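# Illustrative sketch (editor's addition, not Cylc source): the pruning in
# list_suites() relies on os.walk() re-reading "dnames" after each yield, so
# assigning to the slice "dnames[:]" stops descent below a suite directory.
# A minimal, hypothetical demonstration of the same idiom:
import os

def find_marked_dirs(root, marker=".service"):
    """Yield each directory containing `marker`, without descending into it."""
    for dirpath, dnames, fnames in os.walk(root):
        if marker in dnames:
            yield dirpath
            dnames[:] = []  # in-place edit: os.walk() will not recurse further

# e.g. list(find_marked_dirs(os.path.expanduser("~/cylc-run")))
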
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    log_task_job_activity(
        SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
        suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(itask, {
            'is_manual_submit': itask.is_manual_submit,
            'try_num': itask.get_try_num(),
            'time_submit': get_current_time_string(),
            'batch_sys_name': itask.summary.get('batch_sys_name'),
        })
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)

def clear_broadcast(
        self, point_strings=None, namespaces=None, cancel_settings=None):
    """Clear broadcasts globally, or for listed namespaces and/or points.

    Return a tuple (modified_settings, bad_options), where:

    * modified_settings is similar to the return value of the "put"
      method, but for removed broadcasts.
    * bad_options is a dict in the form:
          {"point_strings": ["20020202", ...], ...}
      The dict is only populated if there are options not associated with
      previous broadcasts. The keys can be:
      * point_strings: a list of bad point strings.
      * namespaces: a list of bad namespaces.
      * cancel: a list of tuples. Each tuple contains the keys of a bad
        setting.
    """
    # If cancel_settings defined, only clear specific broadcasts
    cancel_keys_list = self._settings_to_keys_list(cancel_settings)

    # Clear broadcasts
    modified_settings = []
    with self.lock:
        for point_string, point_string_settings in self.broadcasts.items():
            if point_strings and point_string not in point_strings:
                continue
            for namespace, namespace_settings in (
                    point_string_settings.items()):
                if namespaces and namespace not in namespaces:
                    continue
                stuff_stack = [([], namespace_settings)]
                while stuff_stack:
                    keys, stuff = stuff_stack.pop()
                    for key, value in stuff.items():
                        if isinstance(value, dict):
                            stuff_stack.append((keys + [key], value))
                        elif (not cancel_keys_list or
                                keys + [key] in cancel_keys_list):
                            stuff[key] = None
                            setting = {key: value}
                            for rkey in reversed(keys):
                                setting = {rkey: setting}
                            modified_settings.append(
                                (point_string, namespace, setting))

    # Prune any empty branches
    bad_options = self._get_bad_options(
        self._prune(), point_strings, namespaces, cancel_keys_list)

    # Log the broadcast
    self.suite_db_mgr.put_broadcast(modified_settings, is_cancel=True)
    LOG.info(
        get_broadcast_change_report(modified_settings, is_cancel=True))
    if bad_options:
        LOG.error(get_broadcast_bad_options_report(bad_options))
    return (modified_settings, bad_options)

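# Illustrative sketch (editor's addition, not Cylc source): clear_broadcast()
# walks each namespace's nested settings dict with an explicit stack, nulls
# matching leaves, and re-nests each removed leaf so the caller gets one
# {section: {...: {key: old_value}}} record per removal. Standalone rendering:
def clear_leaves(settings, cancel_keys_list=None):
    """Null matching leaves; return each removal as a one-leaf nested dict."""
    removed = []
    stack = [([], settings)]
    while stack:
        keys, node = stack.pop()
        for key, value in list(node.items()):
            if isinstance(value, dict):
                stack.append((keys + [key], value))
            elif not cancel_keys_list or keys + [key] in cancel_keys_list:
                node[key] = None
                leaf = {key: value}
                for rkey in reversed(keys):  # re-nest from leaf back to root
                    leaf = {rkey: leaf}
                removed.append(leaf)
    return removed

# clear_leaves({"environment": {"FOO": "1"}}, [["environment", "FOO"]])
# -> [{"environment": {"FOO": "1"}}]
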
def _run_event_handlers_callback(proc_ctx, abort_on_error=False):
    """Callback on completion of a suite event handler."""
    if proc_ctx.ret_code:
        msg = '%s EVENT HANDLER FAILED' % proc_ctx.cmd_key[1]
        LOG.error(str(proc_ctx))
        LOG.error(msg)
        if abort_on_error:
            raise SuiteEventError(msg)
    else:
        LOG.info(str(proc_ctx))

def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits."""
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)

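# Illustrative sketch (editor's addition): each status line consumed above has
# the form <prefix><timestamp>|<cycle>/<name>/<submit_num>|<status>, parsed
# with split("|", 2) then split(os.sep, 2). The prefix value below is a
# stand-in, not the real OUT_PREFIX_SUMMARY; POSIX os.sep is assumed.
import os

prefix = "[TASK JOB SUMMARY]"
line = prefix + "2019-01-01T00:00:00Z|20190101T0000Z/foo/01|0\n"

body = line[len(prefix):].strip()
timestamp, path, status = body.split("|", 2)
point, name, submit_num = path.split(os.sep, 2)
assert (point, name, submit_num) == ("20190101T0000Z", "foo", "01")
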
def _run_event_custom_handlers(self, config, ctx):
    """Helper for "run_event_handlers", custom event handlers."""
    # Look for event handlers
    # 1. Handlers for specific event
    # 2. General handlers
    handlers = self.get_events_conf(config, '%s handler' % ctx.event)
    if not handlers and (
            ctx.event in self.get_events_conf(config, 'handler events', [])):
        handlers = self.get_events_conf(config, 'handlers')
    if not handlers:
        return
    for i, handler in enumerate(handlers):
        cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
        # Handler command may be a string for substitution
        abort_on_error = self.get_events_conf(
            config, 'abort if %s handler fails' % ctx.event)
        try:
            handler_data = {
                'event': quote(ctx.event),
                'message': quote(ctx.reason),
                'suite': quote(ctx.suite),
                'suite_uuid': quote(str(ctx.uuid_str)),
            }
            if config.cfg['meta']:
                for key, value in config.cfg['meta'].items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s bad template: %s" % (cmd_key, exc)
            LOG.error(message)
            if abort_on_error:
                raise SuiteEventError(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s'" % (
                handler, ctx.event, ctx.suite, ctx.reason)
        proc_ctx = SubProcContext(
            cmd_key, cmd, env=dict(os.environ), shell=True)
        if abort_on_error or self.proc_pool.closed:
            # Run command in foreground if abort on failure is set or if
            # process pool is closed
            self.proc_pool.run_command(proc_ctx)
            self._run_event_handlers_callback(
                proc_ctx, abort_on_error=abort_on_error)
        else:
            # Run command using process pool otherwise
            self.proc_pool.put_command(
                proc_ctx, self._run_event_handlers_callback)

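# Illustrative sketch (editor's addition; handler strings are made up): event
# handler commands are %-style templates expanded against handler_data; if
# nothing substitutes, the unchanged command falls back to the "classic
# interface" of positional arguments.
try:
    from shlex import quote  # Python 3
except ImportError:
    from pipes import quote  # Python 2

handler_data = {'event': quote('shutdown'), 'suite': quote('my.suite')}

templated = "notify.sh --event=%(event)s --suite=%(suite)s" % handler_data
# -> "notify.sh --event=shutdown --suite=my.suite"

classic = "notify.sh"
assert classic % handler_data == classic  # no substitution -> classic args
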
def restart_upgrade(self):
    """Vacuum/upgrade runtime DB on restart."""
    # Backward compat, upgrade database with state file if necessary
    suite_run_d = os.path.dirname(os.path.dirname(self.pub_path))
    old_pri_db_path = os.path.join(
        suite_run_d, 'state', CylcSuiteDAO.OLD_DB_FILE_BASE_NAME)
    old_pri_db_path_611 = os.path.join(
        suite_run_d, CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[0])
    old_state_file_path = os.path.join(suite_run_d, "state", "state")
    if (os.path.exists(old_pri_db_path) and
            os.path.exists(old_state_file_path) and
            not os.path.exists(self.pri_path)):
        # Upgrade pre-6.11.X runtime database + state file
        copy(old_pri_db_path, self.pri_path)
        pri_dao = self.get_pri_dao()
        pri_dao.upgrade_with_state_file(old_state_file_path)
        target = os.path.join(suite_run_d, "state.tar.gz")
        cmd = ["tar", "-C", suite_run_d, "-czf", target, "state"]
        if call(cmd, stdin=open(os.devnull)) == 0:
            rmtree(os.path.join(suite_run_d, "state"), ignore_errors=True)
        else:
            try:
                os.unlink(os.path.join(suite_run_d, "state.tar.gz"))
            except OSError:
                pass
            LOG.error("cannot tar-gzip + remove old state/ directory")
        # Remove old files as well
        try:
            os.unlink(os.path.join(suite_run_d, "cylc-suite-env"))
        except OSError:
            pass
    elif (os.path.exists(old_pri_db_path_611) and
            not os.path.exists(self.pri_path)):
        # Upgrade 6.11.X runtime database
        os.rename(old_pri_db_path_611, self.pri_path)
        pri_dao = self.get_pri_dao()
        pri_dao.upgrade_from_611()
        # Remove old files as well
        for name in [
                CylcSuiteDAO.OLD_DB_FILE_BASE_NAME_611[1],
                "cylc-suite-env"]:
            try:
                os.unlink(os.path.join(suite_run_d, name))
            except OSError:
                pass
    else:
        pri_dao = self.get_pri_dao()
        pri_dao.upgrade_pickle_to_json()
    # Vacuum the primary/private database file
    pri_dao.vacuum()
    pri_dao.close()

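# Illustrative sketch (editor's addition, not Cylc source): the pre-6.11
# branch archives the old "state" directory before deleting it; on tar
# failure it removes the partial archive and keeps the directory. The same
# pattern, standalone:
import os
from shutil import rmtree
from subprocess import call

def archive_then_remove(parent_dir, name="state"):
    """Archive `name` under parent_dir as .tar.gz, then remove it."""
    target = os.path.join(parent_dir, name + ".tar.gz")
    if call(["tar", "-C", parent_dir, "-czf", target, name],
            stdin=open(os.devnull)) == 0:
        rmtree(os.path.join(parent_dir, name), ignore_errors=True)
        return True
    try:
        os.unlink(target)  # don't leave a partial archive behind
    except OSError:
        pass
    return False
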
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status
        LOG.debug(proc_ctx)
        self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
    else:
        # Bad status
        LOG.error(proc_ctx)
        self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
            TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
            proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)

def __init__(self, suite):
    # Suite only needed for back-compat with old clients (see below):
    self.suite = suite
    self.engine = None
    self.port = None

    # Figure out the ports we are allowed to use.
    self.ok_ports = glbl_cfg().get(['suite servers', 'run ports'])
    random.shuffle(self.ok_ports)

    comms_options = glbl_cfg().get(['communication', 'options'])

    # HTTP Digest Auth uses MD5 - pretty secure in this use case.
    # Extending it with extra algorithms is allowed, but won't be
    # supported by most browsers. requests and urllib2 are OK though.
    self.hash_algorithm = "MD5"
    if "SHA1" in comms_options:
        # Note 'SHA' rather than 'SHA1'.
        self.hash_algorithm = "SHA"

    self.srv_files_mgr = SuiteSrvFilesManager()
    self.comms_method = glbl_cfg().get(['communication', 'method'])
    self.get_ha1 = cherrypy.lib.auth_digest.get_ha1_dict_plain(
        {
            'cylc': self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_PASSPHRASE,
                suite, content=True),
            'anon': NO_PASSPHRASE
        },
        algorithm=self.hash_algorithm)
    if self.comms_method == 'http':
        self.cert = None
        self.pkey = None
    else:  # if self.comms_method in [None, 'https']:
        try:
            self.cert = self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_SSL_CERT, suite)
            self.pkey = self.srv_files_mgr.get_auth_item(
                self.srv_files_mgr.FILE_BASE_SSL_PEM, suite)
        except SuiteServiceFileError:
            LOG.error("no HTTPS/OpenSSL support. Aborting...")
            raise CylcError(
                "No HTTPS support. "
                "Configure user's global.rc to use HTTP.")
    self.start()

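# Illustrative sketch (editor's addition; the realm string is hypothetical):
# HTTP Digest Auth never sends the password itself. The server only needs
# HA1 = H(username:realm:password) (RFC 2617), which cherrypy's
# get_ha1_dict_plain() derives from the plain-password dict above:
import hashlib

def ha1(username, realm, password, algorithm=hashlib.md5):
    """HA1 component of HTTP Digest Auth."""
    joined = '%s:%s:%s' % (username, realm, password)
    return algorithm(joined.encode()).hexdigest()

# e.g. ha1('cylc', 'cylc-suite', 'the-passphrase')
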
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits."""
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    tasks = {}
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    for itask in itasks:
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
        # Something is very wrong here
        # Fallback to use "job_log_dirs" list to report the problem
        job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
        for job_log_dir in job_log_dirs:
            point, name, submit_num = job_log_dir.split(os.sep, 2)
            itask = tasks[(point, name, submit_num)]
            out += (self.batch_sys_mgr.OUT_PREFIX_SUMMARY +
                    "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)

def load(self):
    """Load or reload configuration from files."""
    self.sparse.clear()
    self.dense.clear()
    LOG.debug("Loading site/user global config files")
    conf_path_str = os.getenv("CYLC_CONF_PATH")
    if conf_path_str is None:
        # CYLC_CONF_PATH not defined, use default locations.
        for conf_dir_1, conf_dir_2, conf_type in [
                (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
                 upgrader.SITE_CONFIG),
                (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
                 upgrader.USER_CONFIG)]:
            fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
            fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
            if os.access(fname1, os.F_OK | os.R_OK):
                fname = fname1
            elif os.access(fname2, os.F_OK | os.R_OK):
                fname = fname2
            else:
                continue
            try:
                self.loadcfg(fname, conf_type)
            except ParsecError as exc:
                if conf_type == upgrader.SITE_CONFIG:
                    # Warn on bad site file (users can't fix it).
                    LOG.warning(
                        'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                else:
                    # Abort on bad user file (users can fix it).
                    LOG.error('bad %s %s', conf_type, fname)
                    raise
                break
    elif conf_path_str:
        # CYLC_CONF_PATH defined with a value
        for path in conf_path_str.split(os.pathsep):
            fname = os.path.join(path, self.CONF_BASE)
            if os.access(fname, os.F_OK | os.R_OK):
                self.loadcfg(fname, upgrader.USER_CONFIG)
    # (OK if no global.rc is found, just use system defaults).
    self.transform()

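# Illustrative sketch (editor's addition; paths are made up): when
# CYLC_CONF_PATH is set, each os.pathsep-separated entry is a directory
# searched for CONF_BASE ("global.rc" in Cylc 7):
import os

conf_path_str = os.pathsep.join(
    ["/etc/cylc/site", os.path.expanduser("~/.cylc")])
for path in conf_path_str.split(os.pathsep):
    fname = os.path.join(path, "global.rc")
    if os.access(fname, os.F_OK | os.R_OK):
        print("would load", fname)  # loaded as USER_CONFIG in load() above
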
def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
    """Callback when "cylc remote-init" exits"""
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    if proc_ctx.ret_code == 0:
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[(host, owner)] = status
                return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT, (host, owner),
        ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED

def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if a job host
        # selection command causes a submission failure; in that case,
        # just send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)

def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes."""
    if proc_ctx.ret_code:
        LOG.error(proc_ctx)
    else:
        LOG.debug(proc_ctx)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            LOG.exception(exc)

def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers."""
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask, 'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                'suite_uuid': quote(str(self.uuid_str)),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = TaskActionTimer(
            CustomTaskEventHandlerContext(
                key1, self.HANDLER_CUSTOM, cmd),
            retry_delays)
