def __init__(self, config, settings, conn, queue, system, application_type):
    """
    Read the application's configuration and build its collaborators:
    the Zookeeper client, the process client, the actions and the work
    manager.

    :type config: dict (xml)
    :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
    :type conn: multiprocessing.Connection
    :type queue: zoom.agent.entities.unique_queue.UniqueQueue
    :type system: zoom.common.types.PlatformType
    :type application_type: zoom.common.types.ApplicationType
    :raises ValueError: if `system` is neither PlatformType.LINUX nor
        PlatformType.WINDOWS
    """
    self.config = config
    self._settings = settings
    self.name = verify_attribute(self.config, 'id', none_allowed=False)
    self._log = logging.getLogger('sent.{0}.app'.format(self.name))

    # informational attributes
    self._host = platform.node().upper()
    self._fqdn = socket.getfqdn()
    self._system = system
    self._predicates = list()
    self._running = True  # used to manually stop the run loop
    self._prev_state = None
    self._actions = dict()  # placeholder until _init_actions runs below
    self._env = os.environ.get('EnvironmentToUse', 'Staging')
    self._apptype = application_type

    # tool-like attributes
    self.listener_lock = Lock()
    self._action_queue = queue
    self._mode = ApplicationMode(ApplicationMode.MANUAL)
    self._state = ThreadSafeObject(ApplicationState.OK)
    self._trigger_time = ''  # Default to empty string for comparison
    self._login_user = '******'  # Default to Zoom
    self._run_check_mode = False
    self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                        none_allowed=True)

    self._paths = self._init_paths(self.config, settings, application_type)

    # clients
    client_kwargs = dict(hosts=ZK_CONN_STRING,
                         handler=SequentialThreadingHandler())
    if self._system == PlatformType.LINUX:
        # only the Linux client gets a dedicated kazoo logger
        client_kwargs['logger'] = \
            logging.getLogger('kazoo.app.{0}'.format(self.name))
    elif self._system != PlatformType.WINDOWS:
        # Without this guard self.zkclient would never be assigned and the
        # add_listener call below would fail with an AttributeError.
        raise ValueError('Unsupported platform type: {0}'
                         .format(self._system))
    self.zkclient = KazooClient(**client_kwargs)
    self.zkclient.add_listener(self._zk_listener)

    self._proc_client = self._init_proc_client(self.config,
                                               settings,
                                               application_type)

    self._actions = self._init_actions(settings)
    self._work_manager = self._init_work_manager(self._action_queue, conn)
class Application(object):
    """
    Service object to represent a deployed service.

    It owns a Zookeeper client, a process client that starts/stops the
    actual process, a set of actions built from the config, and a work
    manager that executes queued tasks against those actions or against
    the public methods of this class.
    """
    def __init__(self, config, settings, conn, queue, system, application_type):
        """
        :type config: dict (xml)
        :type settings: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        :type conn: multiprocessing.Connection
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :raises ValueError: if `system` is neither PlatformType.LINUX nor
            PlatformType.WINDOWS
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))

        # informational attributes
        self._host = platform.node().upper()
        self._fqdn = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # placeholder until _init_actions runs below
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(ApplicationMode.MANUAL)
        self._state = ThreadSafeObject(ApplicationState.OK)
        self._trigger_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        self._paths = self._init_paths(self.config, settings, application_type)

        # clients
        client_kwargs = dict(hosts=ZK_CONN_STRING,
                             handler=SequentialThreadingHandler())
        if self._system == PlatformType.LINUX:
            # only the Linux client gets a dedicated kazoo logger
            client_kwargs['logger'] = \
                logging.getLogger('kazoo.app.{0}'.format(self.name))
        elif self._system != PlatformType.WINDOWS:
            # Without this guard self.zkclient would never be assigned and
            # the add_listener call below would raise an AttributeError.
            raise ValueError('Unsupported platform type: {0}'
                             .format(self._system))
        self.zkclient = KazooClient(**client_kwargs)
        self.zkclient.add_listener(self._zk_listener)

        self._proc_client = self._init_proc_client(self.config,
                                                   settings,
                                                   application_type)

        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue, conn)

    @property
    def app_details(self):
        """
        Snapshot of this application's details, as written to Zookeeper.

        :rtype: dict
        """
        return {'name': self.name,
                'host': self._host,
                'fqdn': self._fqdn,
                'platform': self._system,
                'mode': self._mode.value,
                'state': self._state.value,
                'trigger_time': self._trigger_time,
                'login_user': self._login_user}

    def run(self):
        """
        - Start the zookeeper client
        - Start every configured action
        - Read the global mode
        - Idle until terminate() is called, then uninitialize
        """
        self.zkclient.start()
        # make all action objects start processing predicates
        self._log.info('Starting to process Actions.')
        # explicit loop: map() used only for side effects is lazy on Python 3
        for action in self._actions.values():
            action.start()

        self._check_mode()  # get global mode AFTER starting actions

        while self._running:
            sleep(5)

        self.uninitialize()

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add entry to the state tree.

        Does nothing if the node already exists or if the 'register' action
        is not ready.
        """
        if not self.zkclient.exists(self._paths['zk_state_path']):
            if self._action_is_ready('register'):
                self._log.info('Registering %s in state tree.' % self.name)
                self.zkclient.create(self._paths['zk_state_path'],
                                     ephemeral=True,
                                     makepath=True)

                # resolve any pager duty alarms
                self._create_alert_node(AlertActionType.RESOLVE,
                                        AlertReason.RESOLVED)
                # reset restart counters, etc
                self._proc_client.reset_counters()

                self._state.set_value(ApplicationState.OK)
                self._update_agent_node_with_app_details()

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """Remove entry from state tree (if the 'unregister' action is ready)"""
        if self._action_is_ready('unregister'):
            self._log.info('Un-registering %s from state tree.' % self.name)
            self.zkclient.delete(self._paths['zk_state_path'])

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client. The work manager and all actions are stopped
        first.
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        for action in self._actions.values():  # stop actions
            action.stop()
        self.zkclient.stop()
        self.zkclient.close()

    @time_this
    def start(self, **kwargs):
        """
        Start actual process
        :param kwargs: passed from zoom.handlers.control_agent_handlers
            Keys read here: 'reset' (default True), 'pause' (default False),
            'pd_enabled' (default True), 'login_user' (default 'Zoom').
        :return: 0 when the start is refused, otherwise the result of the
            process client's start()
        """
        # Same check as self.notify() but needed when start action is
        # called after process crashes and all predicates are met when on Auto
        # NOTE(review): notify() treats `not ran_stop` as "crashed", while the
        # message below says the app was stopped with Zoom - confirm intent.
        if not self._proc_client.restart_logic.ran_stop \
                and self._apptype == ApplicationType.APPLICATION:
            self._log.info('Not starting. App was stopped with Zoom.')
            return 0
        else:
            self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        pd_enabled = kwargs.get('pd_enabled', True)

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STARTING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        if result == 0:
            self._state.set_value(ApplicationState.OK)
        else:
            self._state.set_value(ApplicationState.ERROR)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        self._update_agent_node_with_app_details()

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop actual process
        :param kwargs: passed from zoom.handlers.control_agent_handlers
            Keys read here: 'reset' (default True), 'pause' (default False),
            'login_user' (default 'Zoom'), 'argument' (default 'false').
            All kwargs are also forwarded to the process client's stop().
        :return: the result of the process client's stop()
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._trigger_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')
        self._state.set_value(ApplicationState.STOPPING)
        self._update_agent_node_with_app_details()

        result = self._proc_client.stop(**kwargs)

        # a non-zero result only counts as an error when 'argument' is 'false'
        if result != 0 and kwargs.get('argument', 'false') == 'false':
            self._state.set_value(ApplicationState.ERROR)
        else:
            self._state.set_value(ApplicationState.OK)

        sleep(5)  # give everything time to catch up

        self._update_agent_node_with_app_details()
        return result

    def restart(self, **kwargs):
        """
        Clear the action queue, then queue stop, unregister and start.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        self._action_queue.append_unique(Task('stop', kwargs=kwargs))
        self._action_queue.append_unique(Task('unregister'))
        self._action_queue.append_unique(Task('start', kwargs=kwargs))

    def dep_restart(self, **kwargs):
        """Queue a 'start_if_ready' task and flag start() to re-check mode."""
        self._run_check_mode = True  # only used in self.start()
        self._action_queue.append(Task('start_if_ready', pipe=False))

    def start_if_ready(self):
        """Start now if the 'start' action is ready, else queue 'react'."""
        if self._action_is_ready('start'):
            self.start()
        else:
            self._action_queue.append(Task('react', pipe=False))

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch the application to MANUAL mode and publish the change.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch the application to AUTO mode and publish the change.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        """
        self._mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        self._update_agent_node_with_app_details()
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification to zookeeper that a dependency has gone down.

        :param kwargs: 'pd_enabled' (default True) controls whether an alert
            node is created.
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return

        pd_enabled = kwargs.get('pd_enabled', True)

        if not self._action_is_ready('notify'):
            self._log.info('notify action not defined or not ready.')
            return

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            self._update_agent_node_with_app_details()
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.CRASHED)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

    def terminate(self):
        """Terminate child thread/process by ending the run() loop."""
        self._running = False

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met
        :type action_name: str
        :type allow_undefined: bool
        :param allow_undefined: value to return when the action is not
            configured at all
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if action is None:
            return allow_undefined
        return action.ready

    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Also detects a config conflict (another host registered under the
        same base path), in which case the application is terminated.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        base_path = self._paths['zk_state_base']

        if self._running and not self.zkclient.exists(base_path):
            # makepath so missing parents do not fail the create,
            # consistent with register()
            self.zkclient.create(base_path, makepath=True)

        data, _ = self.zkclient.get(base_path)

        try:
            agent_apps = json.loads(data)
        except (TypeError, ValueError):
            # empty or non-JSON node content
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error('There is a config conflict with {0}. Updates '
                            'will no longer be sent until it is resolved.'
                            .format(other_host))
            self._state.set_value(ApplicationState.CONFIG_ERROR)

        # make sure data is the most recent
        details = self.app_details
        if details != agent_apps:
            self.zkclient.set(base_path, json.dumps(details))
            self._log.debug('Registering app data {0}'.format(details))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                base_path,
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.

        The state base path comes from the 'registrationpath' config
        attribute when present, otherwise from ZK_STATE_PATH/atype/name.

        :rtype: dict
        """
        paths = dict()
        registrationpath = verify_attribute(config, 'registrationpath',
                                            none_allowed=True)

        if registrationpath is not None:
            paths['zk_state_base'] = registrationpath
        else:
            paths['zk_state_base'] = \
                self._pathjoin(settings.get('ZK_STATE_PATH'), atype, self.name)

        paths['zk_state_path'] = \
            self._pathjoin(paths['zk_state_base'], self._host)
        paths['zk_config_path'] = \
            self._pathjoin(settings.get('ZK_CONFIG_PATH'), atype, self.name)
        paths['zk_agent_path'] = \
            self._pathjoin(settings.get('ZK_AGENT_STATE_PATH'), self._host)

        return paths

    def _init_proc_client(self, config, settings, atype):
        """Create the process client."""
        command = verify_attribute(config, 'command', none_allowed=True)
        script = verify_attribute(config, 'script', none_allowed=True)
        restartmax = verify_attribute(config, 'restartmax', none_allowed=True,
                                      cast=int)

        if restartmax is None:
            self._log.info('Restartmax not specified. Assuming 3.')
            restartmax = 3

        g_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             command=command,
                             script=script,
                             apptype=atype,
                             system=self._system,
                             restart_logic=RestartLogic(restartmax),
                             graphite_metric_names=g_names,
                             settings=settings)

    def _init_actions(self, settings):
        """
        Create the actions described in the config via ActionFactory.

        :rtype: dict
        """
        action_factory = ActionFactory(component=self,
                                       zkclient=self.zkclient,
                                       proc_client=self._proc_client,
                                       action_queue=self._action_queue,
                                       mode=self._mode,
                                       system=self._system,
                                       pred_list=self._predicates,
                                       settings=settings)
        return action_factory.create(self.config)

    def _init_work_manager(self, queue, pipe):
        """
        Build and start the work manager.

        Work names map to an action's run() when the action exists, else to
        the method of the same name on this class (for names listed in the
        ALLOWED_WORK setting).

        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        # actions have additional logic, so use those if available
        acceptable_work = dict(
            (name, action.run) for name, action in self._actions.items())

        # if action is not available, add the method from Application
        for work in self._settings.get('ALLOWED_WORK', []):
            if work in acceptable_work:
                self._log.debug('Method {0} already assigned to action.'
                                .format(work))
            elif hasattr(self, work):
                acceptable_work[work] = getattr(self, work)
            else:
                self._log.error('Class has no method {0}'.format(work))

        manager = WorkManager(self.name, queue, pipe, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Check global run mode for the agents.
        Re-registers itself as the watch on the mode node.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        modepath = self._pathjoin(self._settings.get('ZK_GLOBAL_PATH'), 'mode')

        try:
            data, _ = self.zkclient.get(modepath, watch=self._check_mode)
            j = json.loads(data)
            self._log.info('Getting mode from Zookeeper from path: {0}'.
                           format(modepath))
            self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL)))
            self._log.info('Setting mode to "{0}"'.format(self._mode))
            self._update_agent_node_with_app_details()
        except NoNodeError:
            self._log.info('ZK path {0} does not exist. Assuming mode "manual"'
                           .format(modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths. Uses string joining if it is a Windows
        box.

        :rtype: str
        :raises ValueError: if the platform is neither LINUX nor WINDOWS
            (previously this silently returned None)
        """
        if self._system == PlatformType.LINUX:
            return os.path.join(*args)
        elif self._system == PlatformType.WINDOWS:
            return '/'.join(args)
        raise ValueError('Unsupported platform type: {0}'
                         .format(self._system))

    def _get_graphite_metric_names(self):
        """
        Derive graphite metric names from the state base path: the part of
        the path after ZK_STATE_PATH, with '/' replaced by '.'.

        If the state base path does not sit under ZK_STATE_PATH (e.g. a
        custom registrationpath), the whole path is used instead of failing
        with an IndexError.

        :rtype: dict
        """
        state_base = self._paths.get('zk_state_base')
        prefix = self._settings.get('ZK_STATE_PATH') + '/'

        pieces = state_base.split(prefix, 1)
        if len(pieces) > 1:
            type_path = pieces[1]
        else:
            type_path = state_base.strip('/')

        type_metric = type_path.replace('/', '.')
        result_path = self._settings.get('GRAPHITE_RESULT_METRIC')
        runtime_path = self._settings.get('GRAPHITE_RUNTIME_METRIC')

        return {
            "result": result_path.format(type_metric),
            "runtime": runtime_path.format(type_metric)
        }

    def _get_current_time(self):
        """Return the local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        """
        Build the payload stored in an alert node.

        :rtype: dict
        """
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name, self._host),
            "description": ('Sentinel Error: Application {0} {1} on host {2}.'
                            .format(self.name, reason, self._host)),
            "details": ('Sentinel Error: Application {0} {1} on host {2}.\n'
                        'Review the application log and contact the appropriate'
                        ' development group.'
                        .format(self.name, reason, self._host))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm.
        Only done when the current environment is listed in the
        PAGERDUTY_ENABLED_ENVIRONMENTS setting; otherwise it is just logged.

        :type alert_action: zoom.common.types.AlertActionType
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert_path = self._pathjoin(
            self._settings.get('ZK_ALERT_PATH'),
            alert_details['incident_key'].replace('/', '.')
        )

        # default to no environments so a missing setting disables alerting
        # instead of raising a TypeError on the membership test
        enabled_envs = self._settings.get('PAGERDUTY_ENABLED_ENVIRONMENTS', [])

        if self._env in enabled_envs:
            self._log.info('Creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))

            payload = json.dumps(alert_details)
            if self.zkclient.exists(alert_path):
                self.zkclient.set(alert_path, value=payload)
            else:
                self.zkclient.create(alert_path, value=payload)
        else:
            self._log.info('Not creating alert "{0}" node for env: {1}'
                           .format(alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.
        """
        if self._running:
            self._log.info('Application listener callback triggered')
            for action in self._actions.values():  # stop actions
                action.stop()
            self._actions.clear()
            self._predicates = []
            self._actions = self._init_actions(self._settings)
            for predicate in self._predicates:  # reset predicates
                predicate.reset()
            for action in self._actions.values():  # start actions
                action.start()
            self._check_mode()
            self._log.info('Application listener callback complete!')
        else:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to Zookeeper
        changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state changes.

        Any transition into CONNECTED from a known previous state triggers a
        reset; the very first CONNECTED does not. Unrecognised transitions
        are logged and do NOT update the remembered previous state.
        """
        # (previous, new) transitions that need no action
        ignored_transitions = (
            (KazooState.CONNECTED, KazooState.SUSPENDED),
            (KazooState.CONNECTED, KazooState.LOST),
            (KazooState.SUSPENDED, KazooState.LOST),
        )
        try:
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(self._prev_state, state))
            if state == KazooState.CONNECTED:
                if self._prev_state is not None:
                    self.zkclient.handler.spawn(
                        self._reset_after_connection_loss)
            elif (self._prev_state, state) not in ignored_transitions:
                self._log.info('Zookeeper Connection in unknown state: {0}'
                               .format(state))
                return
            self._prev_state = state

        except Exception:
            self._log.exception('An uncaught exception has occurred')

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2}, actions={3})"
                .format(self.__class__.__name__,
                        self.name,
                        self._mode,
                        self._actions.keys())
                )
def __init__(self, config, settings, queue, system, application_type,
             cancel_flag):
    """
    Read the application's configuration and build its collaborators:
    restart logic, Zookeeper client, process client, actions and work
    manager.

    :type config: dict (xml)
    :type settings: dict
    :type queue: zoom.agent.entities.unique_queue.UniqueQueue
    :type system: zoom.common.types.PlatformType
    :type application_type: zoom.common.types.ApplicationType
    :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
    """
    # callback handed to the mode, state and restart-count holders
    publish_details = self._update_agent_node_with_app_details

    self.config = config
    self._settings = settings
    self.name = verify_attribute(self.config, 'id', none_allowed=False)
    self._log = logging.getLogger('sent.{0}.app'.format(self.name))

    # ---- informational attributes ----
    self._host = socket.getfqdn()
    self._system = system
    self._predicates = list()
    self._running = True  # flipped to False to end the run loop
    self._prev_state = None
    self._actions = dict()  # placeholder until _init_actions runs below
    self._env = os.environ.get('EnvironmentToUse', 'Staging')
    self._apptype = application_type
    self._restart_on_crash = verify_attribute(
        self.config, 'restart_on_crash', none_allowed=True)
    self._post_stop_sleep = verify_attribute(
        self.config, 'post_stop_sleep',
        none_allowed=True, cast=int, default=5)

    # ---- tool-like attributes ----
    self.listener_lock = Lock()
    self._action_queue = queue
    self._mode = ApplicationMode(ApplicationMode.MANUAL,
                                 callback=publish_details)
    self._state = ThreadSafeObject(ApplicationState.OK,
                                   callback=publish_details)
    self._start_stop_time = ''  # empty string so it can be compared
    self._login_user = '******'  # Default to Zoom
    self._user_set_in_react = False
    self._run_check_mode = False
    self._pd_svc_key = verify_attribute(
        config, 'pagerduty_service', none_allowed=True)

    max_restarts = verify_attribute(
        config, 'restartmax', none_allowed=True, cast=int, default=3)
    self._rl = RestartLogic(self.name, max_restarts,
                            count_callback=publish_details)

    self._read_only = False

    self._paths = self._init_paths(self.config, settings, application_type)

    # ---- clients ----
    kazoo_logger = logging.getLogger('kazoo.app.{0}'.format(self.name))
    self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                timeout=60.0,
                                handler=SequentialThreadingHandler(),
                                logger=kazoo_logger)
    self.zkclient.add_listener(self._zk_listener)

    self._proc_client = self._init_proc_client(
        self.config, application_type, cancel_flag)

    self._actions = self._init_actions(settings)
    self._work_manager = self._init_work_manager(self._action_queue)
class Application(object): """ Service object to represent an deployed service. """ def __init__(self, config, settings, queue, system, application_type, cancel_flag): """ :type config: dict (xml) :type settings: dict :type queue: zoom.agent.entities.unique_queue.UniqueQueue :type system: zoom.common.types.PlatformType :type application_type: zoom.common.types.ApplicationType :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject """ self.config = config self._settings = settings self.name = verify_attribute(self.config, 'id', none_allowed=False) self._log = logging.getLogger('sent.{0}.app'.format(self.name)) # informational attributes self._host = socket.getfqdn() self._system = system self._predicates = list() self._running = True # used to manually stop the run loop self._prev_state = None self._actions = dict() # created in _reset_watches on zk connect self._env = os.environ.get('EnvironmentToUse', 'Staging') self._apptype = application_type self._restart_on_crash = \ verify_attribute(self.config, 'restart_on_crash', none_allowed=True) self._post_stop_sleep = verify_attribute(self.config, 'post_stop_sleep', none_allowed=True, cast=int, default=5) # tool-like attributes self.listener_lock = Lock() self._action_queue = queue self._mode = ApplicationMode( ApplicationMode.MANUAL, callback=self._update_agent_node_with_app_details) self._state = ThreadSafeObject( ApplicationState.OK, callback=self._update_agent_node_with_app_details) self._start_stop_time = '' # Default to empty string for comparison self._login_user = '******' # Default to Zoom self._user_set_in_react = False self._run_check_mode = False self._pd_svc_key = verify_attribute(config, 'pagerduty_service', none_allowed=True) restartmax = verify_attribute(config, 'restartmax', none_allowed=True, cast=int, default=3) self._rl = RestartLogic( self.name, restartmax, count_callback=self._update_agent_node_with_app_details) self._read_only = False self._paths = self._init_paths(self.config, 
settings, application_type) # clients self.zkclient = KazooClient( hosts=get_zk_conn_string(), timeout=60.0, handler=SequentialThreadingHandler(), logger=logging.getLogger('kazoo.app.{0}'.format(self.name))) self.zkclient.add_listener(self._zk_listener) self._proc_client = self._init_proc_client(self.config, application_type, cancel_flag) self._actions = self._init_actions(settings) self._work_manager = self._init_work_manager(self._action_queue) def app_details(self): return {'name': self.name, 'host': self._host, 'platform': self._system, 'mode': self._mode.value, 'state': self._state.value, 'start_stop_time': self._start_stop_time, 'login_user': self._login_user, 'read_only': self._read_only, 'restart_count': self._rl.count} def run(self): """ - Start the zookeeper client - Check for already running instances. - Start main loop, periodically checking whether the process has failed. """ try: self.zkclient.start() # make all action objects start processing predicates self._log.info('Starting to process Actions.') map(lambda x: x.start(), self._actions.values()) # start actions started = all([i.started for i in self._actions.values()]) if not started: self._log.critical('All actions are not started!') else: self._log.info('All actions started.'.format(started)) self._check_mode() # get global mode AFTER starting actions while self._running: sleep(5) self.uninitialize() except Exception as ex: self._log.critical('There was an exception in the main loop. ' 'In a bad state. ({0})'.format(ex)) @catch_exception(NodeExistsError) @connected def register(self, **kwargs): """ Add entry to the state tree """ action_name = kwargs.get('action_name', 'register') if not self.zkclient.exists(self._paths['zk_state_path']): if self._action_is_ready(action_name): self._log.info('Registering %s in state tree.' 
% self.name) self.zkclient.create(self._paths['zk_state_path'], ephemeral=True, makepath=True) # resolve any pager duty alarms self._create_alert_node(AlertActionType.RESOLVE, AlertReason.RESOLVED) # reset restart counters, etc self._proc_client.reset_counters() self._state.set_value(ApplicationState.STARTED) else: self._log.info('Action {0} is not ready. Not registering.' .format(action_name)) else: self._log.info('Already registered (node exists).') return 0 @catch_exception(NoNodeError) @connected def unregister(self, **kwargs): """Remove entry from state tree""" action_name = kwargs.get('action_name', 'unregister') if self._action_is_ready(action_name): self._log.info('Un-registering %s from state tree.' % self.name) self.zkclient.delete(self._paths['zk_state_path']) return 0 @catch_exception(RuntimeError) def uninitialize(self): """ Gracefully stop this Zookeeper session, then free any resentinels held by the client. """ self._log.info('Stopping Zookeeper client') self._work_manager.stop() map(lambda x: x.stop(), self._actions.values()) # stop actions del self._predicates[:] # make sure we delete old predicates self.zkclient.stop() self.zkclient.close() return 0 @time_this def start(self, **kwargs): """ Start actual process :param kwargs: passed from zoom.handlers.control_agent_handlers """ # Restart from UI: ran_stop=True, stay_down=False # Stop from UI: ran_stop=True, stay_down=True # Crash: ran_stop=False, stay_down=False if self._proc_client.restart_logic.ran_stop \ and self._proc_client.restart_logic.stay_down \ and self._apptype == ApplicationType.APPLICATION: self._log.info('Not starting. App was stopped with Zoom.') # set to OK just in case we're staggered self._state.set_value(ApplicationState.OK) return 0 elif self._proc_client.restart_logic.crashed and \ not self._restart_on_crash: self._log.info('Not starting. 
The application has crashed.') self._state.set_value(ApplicationState.NOTIFY) return 0 else: self._log.debug('Start allowed.') if kwargs.get('reset', True): self._proc_client.reset_counters() if kwargs.get('pause', False): self.ignore() pd_enabled = kwargs.get('pd_enabled', True) self._start_stop_time = self._get_current_time() # set login user if not set in react if not self._user_set_in_react: self._login_user = kwargs.get('login_user', 'Zoom') self._state.set_value(ApplicationState.STARTING) result = self._proc_client.start() if self._run_check_mode: # Reset to global mode if restart with dep self._check_mode() self._run_check_mode = False if result == 0 or result == ApplicationStatus.CANCELLED: self._state.set_value(ApplicationState.STARTED) else: self._state.set_value(ApplicationState.ERROR) if pd_enabled: self._create_alert_node(AlertActionType.TRIGGER, AlertReason.FAILEDTOSTART) else: self._log.debug('PD is disabled, not sending alert.') return result @time_this def stop(self, **kwargs): """ Stop actual process :param kwargs: Passed from: zoom.www.handlers.control_agent_handler.ControlAgentHandler, zoom.agent.action.action.Action """ if kwargs.get('reset', True): self._proc_client.reset_counters() if kwargs.get('pause', False): self.ignore() self._start_stop_time = self._get_current_time() self._login_user = kwargs.get('login_user', 'Zoom') self._state.set_value(ApplicationState.STOPPING) result = self._proc_client.stop(**kwargs) if result != ApplicationStatus.CANCELLED: # give everything time to catch up, not sure why anymore... self._log.info('Sleeping for the configured {0}s after stop.' 
.format(self._post_stop_sleep)) sleep(self._post_stop_sleep) # reset this value back to False self._user_set_in_react = False if result == ApplicationStatus.CANCELLED: self._state.set_value(ApplicationState.STOPPED) elif result != 0: self._state.set_value(ApplicationState.ERROR) else: self._state.set_value(ApplicationState.STOPPED) return result def status(self): """ Log out the status of each configured action. :rtype: str """ out = '\n' out += '#' * 40 + ' STATUS ' + '#' * 40 out += '\n{0}'.format(self) out += '\n' for i in self._actions.values(): out += '\n{0}'.format(i.status) out += '\n' out += '#' * 40 + ' STATUS ' + '#' * 40 out += '\n' self._log.info(out) return out def restart(self, **kwargs): """ :param kwargs: passed from zoom.handlers.control_agent_handlers """ # if not self._action_is_ready('restart', allow_undefined=True): # self._log.info('Restart action not ready.') # return self._log.info('Running Restart. Queuing stop, unregister, start.') self._action_queue.clear() self._action_queue.append_unique(Task('stop', kwargs=kwargs)) self._action_queue.append_unique(Task('unregister')) self._action_queue.append_unique(Task('start', kwargs=kwargs)) return 0 def dep_restart(self, **kwargs): self._run_check_mode = True # only used in self.start() self._action_queue.append(Task('start_if_ready', kwargs=kwargs)) return 0 def start_if_ready(self, **kwargs): if self._action_is_ready('start'): self.start(**kwargs) # if start action doesn't exist, a.k.a. 
read only elif self._actions.get('start', None) is None: self.start(**kwargs) else: self._action_queue.append(Task('react', kwargs=kwargs)) return 0 @time_this @connected def ignore(self, **kwargs): """ :param kwargs: passed from zoom.handlers.control_agent_handlers """ self._mode.set_value(ApplicationMode.MANUAL) self._log.info('Mode is now "{0}"'.format(self._mode)) return 0 @time_this @connected def react(self, **kwargs): """ :param kwargs: passed from zoom.handlers.control_agent_handlers """ self._mode.set_value(ApplicationMode.AUTO) self._log.info('Mode is now "{0}"'.format(self._mode)) # when react is called through "restart with dependencies" command self._user_set_in_react = True self._login_user = kwargs.get('login_user', 'Zoom') return 0 @time_this @connected def notify(self, **kwargs): """ Send notification based on arbitrary predicates """ action_name = kwargs.get('action_name', 'notify') pd_enabled = kwargs.get('pd_enabled', True) pd_reason = kwargs.get('pd_reason', None) if pd_reason is None: pd_reason = AlertReason.CRASHED if not self._action_is_ready(action_name): self._log.info('notify action not defined or not ready.') return 1 self._state.set_value(ApplicationState.NOTIFY) if pd_enabled: self._create_alert_node(AlertActionType.TRIGGER, pd_reason) else: self._log.debug('PD is disabled, not sending alert.') return 0 @time_this @connected def ensure_running(self, **kwargs): """ Essentially a clone of `notify`, but tailored for process monitoring. """ # Application failed to start. 
Already sent PD alert if self._state == ApplicationState.ERROR: return 1 action_name = kwargs.get('action_name', 'ensure_running') pd_enabled = kwargs.get('pd_enabled', True) pd_reason = kwargs.get('pd_reason', None) if pd_reason is None: pd_reason = AlertReason.CRASHED if not self._action_is_ready(action_name): self._log.info('notify action not defined or not ready.') return if not self._proc_client.restart_logic.ran_stop: # the application has crashed self._state.set_value(ApplicationState.NOTIFY) if pd_enabled: self._create_alert_node(AlertActionType.TRIGGER, pd_reason) else: self._log.debug('PD is disabled, not sending alert.') else: self._log.debug("Service shut down gracefully") return 0 def terminate(self): """Terminate child thread/process""" self._running = False return 0 def _action_is_ready(self, action_name, allow_undefined=False): """ Check if a configured action's predicates are met :type action_name: str :type allow_undefined: bool :rtype: bool """ action = self._actions.get(action_name, None) if allow_undefined: if action is None: return True return action is not None and action.ready @catch_exception(NoNodeError) @connected def _update_agent_node_with_app_details(self, event=None): """ Register app data with the agent in the state tree. :type event: kazoo.protocol.states.WatchedEvent or None """ if self._running and \ not self.zkclient.exists(self._paths['zk_state_base']): self.zkclient.create(self._paths['zk_state_base'], makepath=True) data, stat = self.zkclient.get(self._paths['zk_state_base']) try: agent_apps = json.loads(data) except ValueError: agent_apps = dict() # check for config conflict other_host = agent_apps.get('host') if other_host is not None and self._host != other_host: self._log.error('There is a config conflict with {0}. Updates ' 'will no longer be sent until it is resolved.' 
.format(other_host)) self._state.set_value(ApplicationState.CONFIG_ERROR, run_callback=False) # make sure data is the most recent if self.app_details() != agent_apps: self.zkclient.set(self._paths['zk_state_base'], json.dumps(self.app_details())) self._log.debug('Registering app data {0}' .format(self.app_details())) # set watch if self._state != ApplicationState.CONFIG_ERROR: self.zkclient.get( self._paths['zk_state_base'], watch=self._update_agent_node_with_app_details) else: self._log.error('Shutting down because of config error.') self.terminate() def _init_paths(self, config, settings, atype): """ :rtype: dict """ paths = dict() paths['zk_state_base'] = verify_attribute( config, 'registrationpath', none_allowed=True, default=self._pathjoin(settings.get('zookeeper', {}).get('state'), atype, self.name) ) paths['zk_state_path'] = \ self._pathjoin(paths['zk_state_base'], self._host) paths['zk_config_path'] = \ self._pathjoin(settings.get('zookeeper', {}).get('config'), atype, self.name) paths['zk_agent_path'] = \ self._pathjoin(settings.get('zookeeper', {}).get('agent_state'), self._host) return paths def _init_proc_client(self, config, atype, cancel_flag): """Create the process client.""" start_cmd = verify_attribute(config, 'start_cmd', none_allowed=True) stop_cmd = verify_attribute(config, 'stop_cmd', none_allowed=True) status_cmd = verify_attribute(config, 'status_cmd', none_allowed=True) script = verify_attribute(config, 'script', none_allowed=True) g_names = self._get_graphite_metric_names() return ProcessClient(name=self.name, start_cmd=start_cmd, stop_cmd=stop_cmd, status_cmd=status_cmd, script=script, apptype=atype, restart_logic=self._rl, graphite_metric_names=g_names, cancel_flag=cancel_flag) def _init_actions(self, settings): """ :rtype: dict """ action_factory = ActionFactory(component=self, zkclient=self.zkclient, proc_client=self._proc_client, action_queue=self._action_queue, mode=self._mode, system=self._system, pred_list=self._predicates, 
app_state=self._state, settings=settings) actions = action_factory.create(self.config) self._determine_read_only(actions) return actions def _determine_read_only(self, actions): start_action = actions.get('start', None) if start_action is None: self._read_only = True elif start_action.disabled is True: self._read_only = True else: self._read_only = False def _init_work_manager(self, queue): """ :rtype: zoom.agent.entities.work_manager.WorkManager """ acceptable_work = dict() # actions have additional logic, so use those if available for k, v in self._actions.iteritems(): acceptable_work[k] = v.run # if action is not available, add public methods for attribute in [a for a in dir(self) if not a.startswith('_')]: obj = getattr(self, attribute) if hasattr(obj, '__call__'): if attribute not in acceptable_work: acceptable_work[attribute] = obj else: self._log.debug('Method {0} already assigned to action.' .format(attribute)) manager = WorkManager(self.name, queue, acceptable_work) manager.start() return manager @connected def _check_mode(self, event=None): """ Check global run mode for the agents. :type event: kazoo.protocol.states.WatchedEvent or None """ global_path = self._settings.get('zookeeper', {}).get('global_config') if global_path is None: self._log.warning('Received no global config path. Zoom will be ' 'unable to change the global mode.') return modepath = self._pathjoin(global_path, 'mode') try: data, stat = self.zkclient.get(modepath, watch=self._check_mode) j = json.loads(data) self._log.info('Getting mode from Zookeeper from path: {0}'. format(modepath)) self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL))) self._log.info('Setting mode to "{0}"'.format(self._mode)) except NoNodeError: self._log.info('ZK path {0} does not exist. Assuming mode "manual"' .format(modepath)) except Exception: self._log.exception('An uncaught exception has occurred.') def _pathjoin(self, *args): """ Helper function to join paths. 
Uses string joining if it is a Windows box. :rtype: str """ if self._system == PlatformType.LINUX: return os.path.join(*args) elif self._system == PlatformType.WINDOWS: return '/'.join(args) def _get_graphite_metric_names(self): """ splits the state path at 'application' and returns the latter index :rtype: dict """ names = {"result": None, "runtime": None, "updown": None} type_path = self._paths.get('zk_state_base')\ .split(self._settings.get('zookeeper', {}).get('state') + '/', 1)[1] type_metric = type_path.replace('/', '.') graphite = self._settings.get('graphite') if graphite is not None: result_path = str(graphite.get('result')) runtime_path = str(graphite.get('runtime')) updown_path = str(graphite.get('updown')) names["result"] = result_path.format(type_metric) names["runtime"] = runtime_path.format(type_metric) names["updown"] = updown_path.format(type_metric) return names def _get_current_time(self): return datetime.now().strftime("%Y-%m-%d %H:%M:%S") def _get_alert_details(self, alert_action, reason): return { "action": alert_action, "service_key": self._pd_svc_key, "incident_key": self._pathjoin('sentinel', self.name, self._host), "description": ('Sentinel Error: name={0}, host={1}, issue="{2}".' .format(self.name, self._host, reason)), "details": ('Sentinel Error: name={0}, host={1}, issue="{2}".\n' 'Review the application log and contact the appropriate' ' development group.' .format(self.name, self._host, reason)) } @catch_exception(NoNodeError) @connected def _create_alert_node(self, alert_action, reason): """ Create Node in ZooKeeper that will result in a PagerDuty alarm :type alert_action: zoom.common.types.AlertActionType """ alert_details = self._get_alert_details(alert_action, reason) # path example: /foo/sentinel.bar.baz.HOSTFOO alert = self._settings.get('zookeeper', {}).get('alert') if alert is None: self._log.warning('Was given no alert path. 
This sentinel will be ' 'unable to forward alerts to Zoom.') return alert_path = self._pathjoin(alert, re.sub('/', '.', alert_details['incident_key'])) if self._env in self._settings.get('pagerduty', {}).get('enabled_environments', []): self._log.info('Creating alert "{0}" node for env: {1}' .format(alert_action, self._env)) if self.zkclient.exists(alert_path): self.zkclient.set(alert_path, value=json.dumps(alert_details)) else: self.zkclient.create(alert_path, value=json.dumps(alert_details)) else: self._log.info('Not creating alert "{0}" node for env: {1}' .format(alert_action, self._env)) self._log.info('Would have created path {0}'.format(alert_path)) @catch_exception(Exception, traceback=True) @run_only_one('listener_lock') def _reset_after_connection_loss(self): """ Recreates all actions and predicates after connection loss. Recheck the mode and allowed instances. """ if self._running: self._log.info('Application listener callback triggered') map(lambda x: x.stop(), self._actions.values()) # stop actions self._actions.clear() self._predicates = [] self._actions = self._init_actions(self._settings) map(lambda x: x.reset(), self._predicates) # reset predicates map(lambda x: x.start(), self._actions.values()) # start actions self._check_mode() self._log.info('Application listener callback complete!') else: self._log.info('The daemon has called for termination. ' 'Not trying to reset after connection loss.') def _zk_listener(self, state): """ The callback function that runs when the connection state to Zookeeper changes. Either passes or immediately spawns a new thread that resets any watches, etc., so that it can listen to future connection state changes. 
""" try: self._log.info('Zookeeper Connection went from {0} to {1}' .format(self._prev_state, state)) if self._prev_state is None and state == KazooState.CONNECTED: pass elif self._prev_state == KazooState.LOST and state == KazooState.CONNECTED: self.zkclient.handler.spawn(self._reset_after_connection_loss) elif self._prev_state == KazooState.CONNECTED and state == KazooState.SUSPENDED: pass elif self._prev_state == KazooState.CONNECTED and state == KazooState.LOST: pass elif self._prev_state == KazooState.SUSPENDED and state == KazooState.LOST: pass elif self._prev_state == KazooState.SUSPENDED and state == KazooState.CONNECTED: self.zkclient.handler.spawn(self._reset_after_connection_loss) elif state == KazooState.CONNECTED: self.zkclient.handler.spawn(self._reset_after_connection_loss) else: self._log.info('Zookeeper Connection in unknown state: {0}' .format(state)) return self._prev_state = state except Exception as ex: self._log.exception('An uncaught exception has occurred in the ' 'listener: {0}'.format(ex)) def __str__(self): return self.__repr__() def __repr__(self): return ("{0}(name={1}, runmode={2})" .format(self.__class__.__name__, self.name, self._mode))
def __init__(self, config, settings, queue, system, application_type,
             cancel_flag):
    """
    Populate the application from its config and wire up its collaborators:
    the Zookeeper client, the process client, the actions and the work
    manager.

    :type config: dict (xml)
    :type settings: dict
    :type queue: zoom.agent.entities.unique_queue.UniqueQueue
    :type system: zoom.common.types.PlatformType
    :type application_type: zoom.common.types.ApplicationType
    :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
    """
    self.config = config
    self._settings = settings
    self.name = verify_attribute(self.config, 'id', none_allowed=False)
    self._log = logging.getLogger('sent.{0}.app'.format(self.name))

    # mode, state and restart-count changes all republish the app details
    publish_details = self._update_agent_node_with_app_details

    # informational attributes
    self._host = socket.getfqdn()
    self._system = system
    self._predicates = list()
    self._running = True  # used to manually stop the run loop
    self._prev_state = None
    self._actions = dict()  # placeholder, replaced by _init_actions() below
    self._env = os.environ.get('EnvironmentToUse', 'Staging')
    self._apptype = application_type
    self._restart_on_crash = verify_attribute(
        self.config, 'restart_on_crash', none_allowed=True)
    self._post_stop_sleep = verify_attribute(
        self.config, 'post_stop_sleep',
        none_allowed=True, cast=int, default=5)

    # tool-like attributes
    self.listener_lock = Lock()
    self._action_queue = queue
    self._mode = ApplicationMode(ApplicationMode.MANUAL,
                                 callback=publish_details)
    self._state = ThreadSafeObject(ApplicationState.OK,
                                   callback=publish_details)
    self._start_stop_time = ''  # Default to empty string for comparison
    self._login_user = '******'  # Default to Zoom
    self._user_set_in_react = False
    self._run_check_mode = False
    self._pd_svc_key = verify_attribute(
        config, 'pagerduty_service', none_allowed=True)

    max_restarts = verify_attribute(
        config, 'restartmax', none_allowed=True, cast=int, default=3)
    self._rl = RestartLogic(self.name, max_restarts,
                            count_callback=publish_details)

    self._read_only = False

    # needs self._host and self._system, both assigned above
    self._paths = self._init_paths(self.config, settings, application_type)

    # clients
    zk_hosts = get_zk_conn_string()
    zk_handler = SequentialThreadingHandler()
    zk_logger = logging.getLogger('kazoo.app.{0}'.format(self.name))
    self.zkclient = KazooClient(hosts=zk_hosts,
                                timeout=60.0,
                                handler=zk_handler,
                                logger=zk_logger)
    self.zkclient.add_listener(self._zk_listener)

    # the process client needs self._rl and self._paths; the actions need
    # the process client; the work manager needs the actions
    self._proc_client = self._init_proc_client(
        self.config, application_type, cancel_flag)
    self._actions = self._init_actions(settings)
    self._work_manager = self._init_work_manager(self._action_queue)
class Application(object):
    """
    Service object to represent a deployed service.

    Owns the Zookeeper client, the process client, the configured actions
    and the work manager for one application. Every public callable on the
    instance is offered to the work manager as acceptable work (see
    `_init_work_manager`).
    """

    def __init__(self, config, settings, queue, system, application_type,
                 cancel_flag):
        """
        :type config: dict (xml)
        :type settings: dict
        :type queue: zoom.agent.entities.unique_queue.UniqueQueue
        :type system: zoom.common.types.PlatformType
        :type application_type: zoom.common.types.ApplicationType
        :type cancel_flag: zoom.agent.entities.thread_safe_object.ThreadSafeObject
        """
        self.config = config
        self._settings = settings
        self.name = verify_attribute(self.config, 'id', none_allowed=False)
        self._log = logging.getLogger('sent.{0}.app'.format(self.name))

        # informational attributes
        self._host = socket.getfqdn()
        self._system = system
        self._predicates = list()
        self._running = True  # used to manually stop the run loop
        self._prev_state = None
        self._actions = dict()  # placeholder, replaced by _init_actions()
        self._env = os.environ.get('EnvironmentToUse', 'Staging')
        self._apptype = application_type
        self._restart_on_crash = \
            verify_attribute(self.config, 'restart_on_crash',
                             none_allowed=True)
        self._post_stop_sleep = verify_attribute(self.config,
                                                 'post_stop_sleep',
                                                 none_allowed=True,
                                                 cast=int,
                                                 default=5)

        # tool-like attributes
        self.listener_lock = Lock()
        self._action_queue = queue
        self._mode = ApplicationMode(
            ApplicationMode.MANUAL,
            callback=self._update_agent_node_with_app_details)
        self._state = ThreadSafeObject(
            ApplicationState.OK,
            callback=self._update_agent_node_with_app_details)
        self._start_stop_time = ''  # Default to empty string for comparison
        self._login_user = '******'  # Default to Zoom
        self._user_set_in_react = False
        self._run_check_mode = False
        self._pd_svc_key = verify_attribute(config, 'pagerduty_service',
                                            none_allowed=True)

        restartmax = verify_attribute(config, 'restartmax',
                                      none_allowed=True, cast=int, default=3)
        self._rl = RestartLogic(
            self.name,
            restartmax,
            count_callback=self._update_agent_node_with_app_details)

        self._read_only = False

        # requires self._host and self._system
        self._paths = self._init_paths(self.config, settings,
                                       application_type)

        # clients
        self.zkclient = KazooClient(
            hosts=get_zk_conn_string(),
            timeout=60.0,
            handler=SequentialThreadingHandler(),
            logger=logging.getLogger('kazoo.app.{0}'.format(self.name)))
        self.zkclient.add_listener(self._zk_listener)

        # order matters: proc client needs _rl and _paths, actions need the
        # proc client, and the work manager needs the actions
        self._proc_client = self._init_proc_client(self.config,
                                                   application_type,
                                                   cancel_flag)
        self._actions = self._init_actions(settings)
        self._work_manager = self._init_work_manager(self._action_queue)

    def app_details(self):
        """
        Snapshot of the application's current attributes, as published to
        the state tree by `_update_agent_node_with_app_details`.

        :rtype: dict
        """
        return {
            'name': self.name,
            'host': self._host,
            'platform': self._system,
            'mode': self._mode.value,
            'state': self._state.value,
            'start_stop_time': self._start_stop_time,
            'login_user': self._login_user,
            'read_only': self._read_only,
            'restart_count': self._rl.count
        }

    def run(self):
        """
        - Start the zookeeper client
        - Start every configured action
        - Check the global mode
        - Idle until `terminate` clears the running flag, then uninitialize

        Any exception is logged (with traceback) rather than propagated.
        """
        try:
            self.zkclient.start()
            # make all action objects start processing predicates
            self._log.info('Starting to process Actions.')
            # explicit loop: map() is lazy on Python 3 and would start nothing
            for action in self._actions.values():
                action.start()

            if all(action.started for action in self._actions.values()):
                self._log.info('All actions started.')
            else:
                self._log.critical('All actions are not started!')

            self._check_mode()  # get global mode AFTER starting actions

            while self._running:
                sleep(5)

            self.uninitialize()
        except Exception as ex:
            self._log.critical('There was an exception in the main loop. '
                               'In a bad state. ({0})'.format(ex),
                               exc_info=True)

    @catch_exception(NodeExistsError)
    @connected
    def register(self, **kwargs):
        """
        Add entry to the state tree

        :param kwargs: may carry 'action_name' (default 'register'), the
            action whose readiness gates the registration
        :rtype: int
        """
        action_name = kwargs.get('action_name', 'register')

        if self.zkclient.exists(self._paths['zk_state_path']):
            self._log.info('Already registered (node exists).')
            return 0

        if not self._action_is_ready(action_name):
            self._log.info('Action {0} is not ready. Not registering.'
                           .format(action_name))
            return 0

        self._log.info('Registering %s in state tree.' % self.name)
        self.zkclient.create(self._paths['zk_state_path'],
                             ephemeral=True,
                             makepath=True)

        # resolve any pager duty alarms
        self._create_alert_node(AlertActionType.RESOLVE,
                                AlertReason.RESOLVED)
        # reset restart counters, etc
        self._proc_client.reset_counters()
        self._state.set_value(ApplicationState.STARTED)
        return 0

    @catch_exception(NoNodeError)
    @connected
    def unregister(self, **kwargs):
        """
        Remove entry from state tree

        :param kwargs: may carry 'action_name' (default 'unregister')
        :rtype: int
        """
        action_name = kwargs.get('action_name', 'unregister')

        if self._action_is_ready(action_name):
            self._log.info('Un-registering %s from state tree.' % self.name)
            self.zkclient.delete(self._paths['zk_state_path'])

        return 0

    @catch_exception(RuntimeError)
    def uninitialize(self):
        """
        Gracefully stop this Zookeeper session, then free any resources
        held by the client.

        :rtype: int
        """
        self._log.info('Stopping Zookeeper client')
        self._work_manager.stop()
        for action in self._actions.values():  # stop actions
            action.stop()
        del self._predicates[:]  # make sure we delete old predicates
        self.zkclient.stop()
        self.zkclient.close()
        return 0

    @time_this
    def start(self, **kwargs):
        """
        Start actual process

        :param kwargs: passed from zoom.handlers.control_agent_handlers.
            Keys read here: 'reset' (default True), 'pause' (default False),
            'pd_enabled' (default True), 'login_user' (default 'Zoom').
        :return: 0 when the start is refused, otherwise the process
            client's start result
        """
        restart_logic = self._proc_client.restart_logic

        # Restart from UI: ran_stop=True, stay_down=False
        # Stop from UI: ran_stop=True, stay_down=True
        # Crash: ran_stop=False, stay_down=False
        if restart_logic.ran_stop \
                and restart_logic.stay_down \
                and self._apptype == ApplicationType.APPLICATION:
            self._log.info('Not starting. App was stopped with Zoom.')
            # set to OK just in case we're staggered
            self._state.set_value(ApplicationState.OK)
            return 0
        elif restart_logic.crashed and not self._restart_on_crash:
            self._log.info('Not starting. The application has crashed.')
            self._state.set_value(ApplicationState.NOTIFY)
            return 0
        else:
            self._log.debug('Start allowed.')

        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        pd_enabled = kwargs.get('pd_enabled', True)

        self._start_stop_time = self._get_current_time()
        # set login user if not set in react
        if not self._user_set_in_react:
            self._login_user = kwargs.get('login_user', 'Zoom')

        self._state.set_value(ApplicationState.STARTING)
        result = self._proc_client.start()

        if self._run_check_mode:  # Reset to global mode if restart with dep
            self._check_mode()
            self._run_check_mode = False

        if result == 0 or result == ApplicationStatus.CANCELLED:
            self._state.set_value(ApplicationState.STARTED)
        else:
            self._state.set_value(ApplicationState.ERROR)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER,
                                        AlertReason.FAILEDTOSTART)
            else:
                self._log.debug('PD is disabled, not sending alert.')

        return result

    @time_this
    def stop(self, **kwargs):
        """
        Stop actual process

        :param kwargs: Passed from:
            zoom.www.handlers.control_agent_handler.ControlAgentHandler,
            zoom.agent.action.action.Action
            Keys read here: 'reset' (default True), 'pause' (default False),
            'login_user' (default 'Zoom'). All kwargs are forwarded to the
            process client's stop().
        :return: the process client's stop result
        """
        if kwargs.get('reset', True):
            self._proc_client.reset_counters()
        if kwargs.get('pause', False):
            self.ignore()

        self._start_stop_time = self._get_current_time()
        self._login_user = kwargs.get('login_user', 'Zoom')

        self._state.set_value(ApplicationState.STOPPING)
        result = self._proc_client.stop(**kwargs)

        if result != ApplicationStatus.CANCELLED:
            # give everything time to catch up, not sure why anymore...
            self._log.info('Sleeping for the configured {0}s after stop.'
                           .format(self._post_stop_sleep))
            sleep(self._post_stop_sleep)

        # reset this value back to False
        self._user_set_in_react = False

        # a cancelled stop is reported as STOPPED, same as a clean one
        if result == ApplicationStatus.CANCELLED or result == 0:
            self._state.set_value(ApplicationState.STOPPED)
        else:
            self._state.set_value(ApplicationState.ERROR)

        return result

    def status(self):
        """
        Log out the status of each configured action.

        :rtype: str
        """
        banner = '#' * 40 + ' STATUS ' + '#' * 40
        parts = ['\n', banner, '\n{0}'.format(self), '\n']
        parts.extend('\n{0}'.format(action.status)
                     for action in self._actions.values())
        parts.extend(['\n', banner, '\n'])

        out = ''.join(parts)
        self._log.info(out)
        return out

    def restart(self, **kwargs):
        """
        Clear the action queue, then queue stop, unregister and start.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :rtype: int
        """
        self._log.info('Running Restart. Queuing stop, unregister, start.')
        self._action_queue.clear()
        self._action_queue.append_unique(Task('stop', kwargs=kwargs))
        self._action_queue.append_unique(Task('unregister'))
        self._action_queue.append_unique(Task('start', kwargs=kwargs))
        return 0

    def dep_restart(self, **kwargs):
        """
        Queue a 'start_if_ready' task and flag the next start() to re-check
        the global mode.

        :rtype: int
        """
        self._run_check_mode = True  # only used in self.start()
        self._action_queue.append(Task('start_if_ready', kwargs=kwargs))
        return 0

    def start_if_ready(self, **kwargs):
        """
        Run the start action if it is ready, start directly if no start
        action is configured, otherwise queue a 'react' task.

        :rtype: int
        """
        start_action = self._actions.get('start', None)

        if start_action is not None and start_action.ready:
            start_action.run(**kwargs)
        # if start action doesn't exist, a.k.a. read only
        elif start_action is None:
            self.start(**kwargs)
        else:
            self._action_queue.append(Task('react', kwargs=kwargs))
        return 0

    @time_this
    @connected
    def ignore(self, **kwargs):
        """
        Switch the application to manual mode.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :rtype: int
        """
        self._mode.set_value(ApplicationMode.MANUAL)
        self._log.info('Mode is now "{0}"'.format(self._mode))
        return 0

    @time_this
    @connected
    def react(self, **kwargs):
        """
        Switch the application to auto mode and record the login user.

        :param kwargs: passed from zoom.handlers.control_agent_handlers
        :rtype: int
        """
        self._mode.set_value(ApplicationMode.AUTO)
        self._log.info('Mode is now "{0}"'.format(self._mode))

        # when react is called through "restart with dependencies" command
        self._user_set_in_react = True
        self._login_user = kwargs.get('login_user', 'Zoom')
        return 0

    @time_this
    @connected
    def notify(self, **kwargs):
        """
        Send notification based on arbitrary predicates

        :param kwargs: keys read here: 'action_name' (default 'notify'),
            'pd_enabled' (default True), 'pd_reason' (default
            AlertReason.CRASHED)
        :return: 1 if the action is undefined or not ready, else 0
        """
        action_name = kwargs.get('action_name', 'notify')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)

        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('notify action not defined or not ready.')
            return 1

        self._state.set_value(ApplicationState.NOTIFY)
        if pd_enabled:
            self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
        else:
            self._log.debug('PD is disabled, not sending alert.')

        return 0

    @time_this
    @connected
    def ensure_running(self, **kwargs):
        """
        Essentially a clone of `notify`, but tailored for process
        monitoring: only alerts when the process went away without a stop
        having been run.

        :param kwargs: keys read here: 'action_name' (default
            'ensure_running'), 'pd_enabled' (default True), 'pd_reason'
            (default AlertReason.CRASHED)
        :return: 1 if the app is already in ERROR or the action is
            undefined/not ready, else 0
        """
        # Application failed to start. Already sent PD alert
        if self._state == ApplicationState.ERROR:
            return 1

        action_name = kwargs.get('action_name', 'ensure_running')
        pd_enabled = kwargs.get('pd_enabled', True)
        pd_reason = kwargs.get('pd_reason', None)

        if pd_reason is None:
            pd_reason = AlertReason.CRASHED

        if not self._action_is_ready(action_name):
            self._log.info('{0} action not defined or not ready.'
                           .format(action_name))
            # same failure code as notify() instead of an implicit None
            return 1

        if not self._proc_client.restart_logic.ran_stop:
            # the application has crashed
            self._state.set_value(ApplicationState.NOTIFY)
            if pd_enabled:
                self._create_alert_node(AlertActionType.TRIGGER, pd_reason)
            else:
                self._log.debug('PD is disabled, not sending alert.')
        else:
            self._log.debug("Service shut down gracefully")

        return 0

    def terminate(self):
        """Terminate child thread/process"""
        self._running = False
        return 0

    def _action_is_ready(self, action_name, allow_undefined=False):
        """
        Check if a configured action's predicates are met

        :type action_name: str
        :param allow_undefined: treat an action that is not configured as
            ready
        :type allow_undefined: bool
        :rtype: bool
        """
        action = self._actions.get(action_name, None)
        if action is None:
            return bool(allow_undefined)
        return action.ready

    @catch_exception(NoNodeError)
    @connected
    def _update_agent_node_with_app_details(self, event=None):
        """
        Register app data with the agent in the state tree.

        Also detects a config conflict (the node already carries another
        host's data), in which case the app is put in CONFIG_ERROR and
        terminated instead of re-arming the watch.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        base_path = self._paths['zk_state_base']

        if self._running and not self.zkclient.exists(base_path):
            self.zkclient.create(base_path, makepath=True)

        data, stat = self.zkclient.get(base_path)

        try:
            agent_apps = json.loads(data)
        except ValueError:
            agent_apps = dict()

        # check for config conflict
        other_host = agent_apps.get('host')
        if other_host is not None and self._host != other_host:
            self._log.error(
                'There is a config conflict with {0}. Updates '
                'will no longer be sent until it is resolved.'.format(
                    other_host))
            # run_callback=False: the callback is this very method
            self._state.set_value(ApplicationState.CONFIG_ERROR,
                                  run_callback=False)

        # make sure data is the most recent; take one snapshot so the value
        # compared, written and logged is the same
        details = self.app_details()
        if details != agent_apps:
            self.zkclient.set(base_path, json.dumps(details))
            self._log.debug('Registering app data {0}'.format(details))

        # set watch
        if self._state != ApplicationState.CONFIG_ERROR:
            self.zkclient.get(
                base_path,
                watch=self._update_agent_node_with_app_details)
        else:
            self._log.error('Shutting down because of config error.')
            self.terminate()

    def _init_paths(self, config, settings, atype):
        """
        Build the Zookeeper paths used by this application.

        :return: dict with keys 'zk_state_base', 'zk_state_path',
            'zk_config_path' and 'zk_agent_path'
        :rtype: dict
        """
        zk_settings = settings.get('zookeeper', {})
        paths = dict()
        # an explicit 'registrationpath' in the config overrides the default
        paths['zk_state_base'] = verify_attribute(
            config,
            'registrationpath',
            none_allowed=True,
            default=self._pathjoin(zk_settings.get('state'), atype,
                                   self.name))

        paths['zk_state_path'] = \
            self._pathjoin(paths['zk_state_base'], self._host)
        paths['zk_config_path'] = \
            self._pathjoin(zk_settings.get('config'), atype, self.name)
        paths['zk_agent_path'] = \
            self._pathjoin(zk_settings.get('agent_state'), self._host)

        return paths

    def _init_proc_client(self, config, atype, cancel_flag):
        """Create the process client."""
        start_cmd = verify_attribute(config, 'start_cmd', none_allowed=True)
        stop_cmd = verify_attribute(config, 'stop_cmd', none_allowed=True)
        status_cmd = verify_attribute(config, 'status_cmd',
                                      none_allowed=True)
        script = verify_attribute(config, 'script', none_allowed=True)

        g_names = self._get_graphite_metric_names()

        return ProcessClient(name=self.name,
                             start_cmd=start_cmd,
                             stop_cmd=stop_cmd,
                             status_cmd=status_cmd,
                             script=script,
                             apptype=atype,
                             restart_logic=self._rl,
                             graphite_metric_names=g_names,
                             cancel_flag=cancel_flag)

    def _init_actions(self, settings):
        """
        Create the actions from the config and refresh the read-only flag.

        :rtype: dict
        """
        action_factory = ActionFactory(component=self,
                                       zkclient=self.zkclient,
                                       proc_client=self._proc_client,
                                       action_queue=self._action_queue,
                                       mode=self._mode,
                                       system=self._system,
                                       pred_list=self._predicates,
                                       app_state=self._state,
                                       settings=settings)

        actions = action_factory.create(self.config)
        self._determine_read_only(actions)
        return actions

    def _determine_read_only(self, actions):
        """
        Set `self._read_only` from the 'start' and 'restart' actions.

        The app is read-only when neither action is defined, or when every
        one that is defined is disabled.

        :type actions: dict
        """
        candidates = (actions.get('start', None),
                      actions.get('restart', None))
        defined = [action for action in candidates if action]

        if not defined:
            self._log.warning(
                'Sentinel config contains neither start nor restart '
                'predicates, assuming readonly')
            self._read_only = True
            return

        self._read_only = all(action.disabled for action in defined)

    def _init_work_manager(self, queue):
        """
        Build and start the work manager. Configured actions take
        precedence; public methods fill in any name without an action.

        :rtype: zoom.agent.entities.work_manager.WorkManager
        """
        acceptable_work = dict()

        # actions have additional logic, so use those if available
        for k, v in self._actions.items():
            acceptable_work[k] = v.run

        # if action is not available, add public methods
        for attribute in [a for a in dir(self) if not a.startswith('_')]:
            obj = getattr(self, attribute)
            if hasattr(obj, '__call__'):
                if attribute not in acceptable_work:
                    acceptable_work[attribute] = obj
                else:
                    self._log.debug(
                        'Method {0} already assigned to action.'.format(
                            attribute))

        manager = WorkManager(self.name, queue, acceptable_work)
        manager.start()
        return manager

    @connected
    def _check_mode(self, event=None):
        """
        Check global run mode for the agents, and leave a watch on the mode
        node so this method runs again when it changes.

        :type event: kazoo.protocol.states.WatchedEvent or None
        """
        global_path = self._settings.get('zookeeper',
                                         {}).get('global_config')
        if global_path is None:
            self._log.warning('Received no global config path. Zoom will be '
                              'unable to change the global mode.')
            return

        modepath = self._pathjoin(global_path, 'mode')

        try:
            data, stat = self.zkclient.get(modepath, watch=self._check_mode)
            j = json.loads(data)
            self._log.info(
                'Getting mode from Zookeeper from path: {0}'.format(
                    modepath))
            self._mode.set_value(str(j.get(u'mode', ApplicationMode.MANUAL)))
            self._log.info('Setting mode to "{0}"'.format(self._mode))
        except NoNodeError:
            self._log.info(
                'ZK path {0} does not exist. Assuming mode "manual"'.format(
                    modepath))
        except Exception:
            self._log.exception('An uncaught exception has occurred.')

    def _pathjoin(self, *args):
        """
        Helper function to join paths. Uses string joining if it is a
        Windows box.

        NOTE: returns None for any platform other than LINUX or WINDOWS.

        :rtype: str
        """
        if self._system == PlatformType.LINUX:
            return os.path.join(*args)
        elif self._system == PlatformType.WINDOWS:
            return '/'.join(args)

    def _get_graphite_metric_names(self):
        """
        Derive the graphite metric names from the state path: the part of
        'zk_state_base' after the configured zookeeper state prefix, with
        '/' replaced by '.', is formatted into each graphite template.

        :return: dict with keys 'result', 'runtime' and 'updown'; values
            stay None when no 'graphite' settings are present
        :rtype: dict
        """
        names = {"result": None, "runtime": None, "updown": None}

        state_prefix = self._settings.get('zookeeper', {}).get('state') + '/'
        type_path = self._paths.get('zk_state_base').split(state_prefix,
                                                           1)[1]
        type_metric = type_path.replace('/', '.')

        graphite = self._settings.get('graphite')
        if graphite is not None:
            for key in ('result', 'runtime', 'updown'):
                template = str(graphite.get(key))
                names[key] = template.format(type_metric)

        return names

    def _get_current_time(self):
        """
        Current local time formatted as 'YYYY-MM-DD HH:MM:SS'.

        :rtype: str
        """
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def _get_alert_details(self, alert_action, reason):
        """
        Build the payload stored in the alert node.

        :type alert_action: zoom.common.types.AlertActionType
        :param reason: text placed in the description and details
        :rtype: dict
        """
        return {
            "action": alert_action,
            "service_key": self._pd_svc_key,
            "incident_key": self._pathjoin('sentinel', self.name,
                                           self._host),
            "description":
                ('Sentinel Error: name={0}, host={1}, issue="{2}".'.format(
                    self.name, self._host, reason)),
            "details":
                ('Sentinel Error: name={0}, host={1}, issue="{2}".\n'
                 'Review the application log and contact the appropriate'
                 ' development group.'.format(self.name, self._host,
                                              reason))
        }

    @catch_exception(NoNodeError)
    @connected
    def _create_alert_node(self, alert_action, reason):
        """
        Create Node in ZooKeeper that will result in a PagerDuty alarm.
        The node is only written when the current environment is listed in
        the pagerduty 'enabled_environments' setting.

        :type alert_action: zoom.common.types.AlertActionType
        :param reason: text placed in the alert description
        """
        alert_details = self._get_alert_details(alert_action, reason)
        # path example: /foo/sentinel.bar.baz.HOSTFOO
        alert = self._settings.get('zookeeper', {}).get('alert')
        if alert is None:
            self._log.warning('Was given no alert path. This sentinel will '
                              'be unable to forward alerts to Zoom.')
            return

        alert_path = self._pathjoin(
            alert, alert_details['incident_key'].replace('/', '.'))

        enabled_envs = self._settings.get('pagerduty', {}).get(
            'enabled_environments', [])
        if self._env in enabled_envs:
            self._log.info('Creating alert "{0}" node for env: {1}'.format(
                alert_action, self._env))

            payload = json.dumps(alert_details)
            if self.zkclient.exists(alert_path):
                self.zkclient.set(alert_path, value=payload)
            else:
                self.zkclient.create(alert_path, value=payload)
        else:
            self._log.info(
                'Not creating alert "{0}" node for env: {1}'.format(
                    alert_action, self._env))
            self._log.info('Would have created path {0}'.format(alert_path))

    @catch_exception(Exception, traceback=True)
    @run_only_one('listener_lock')
    def _reset_after_connection_loss(self):
        """
        Recreates all actions and predicates after connection loss.
        Recheck the mode and allowed instances.
        """
        if not self._running:
            self._log.info('The daemon has called for termination. '
                           'Not trying to reset after connection loss.')
            return

        self._log.info('Application listener callback triggered')
        # explicit loops: map() is lazy on Python 3 and would do nothing
        for action in self._actions.values():  # stop actions
            action.stop()
        self._actions.clear()
        self._predicates = []
        self._actions = self._init_actions(self._settings)
        for predicate in self._predicates:  # reset predicates
            predicate.reset()
        for action in self._actions.values():  # start actions
            action.start()
        self._check_mode()
        self._log.info('Application listener callback complete!')

    def _zk_listener(self, state):
        """
        The callback function that runs when the connection state to
        Zookeeper changes.
        Either passes or immediately spawns a new thread that resets any
        watches, etc., so that it can listen to future connection state
        changes.

        :param state: the new connection state (a KazooState value)
        """
        # transitions that need no action beyond being recorded
        benign = (
            (KazooState.CONNECTED, KazooState.SUSPENDED),
            (KazooState.CONNECTED, KazooState.LOST),
            (KazooState.SUSPENDED, KazooState.LOST),
        )
        try:
            previous = self._prev_state
            self._log.info('Zookeeper Connection went from {0} to {1}'
                           .format(previous, state))

            if state == KazooState.CONNECTED:
                # the very first connect has nothing to reset
                if previous is not None:
                    self.zkclient.handler.spawn(
                        self._reset_after_connection_loss)
            elif (previous, state) in benign:
                pass
            else:
                self._log.info(
                    'Zookeeper Connection in unknown state: {0}'.format(
                        state))
                # unknown transitions are not recorded as previous state
                return

            self._prev_state = state

        except Exception as ex:
            self._log.exception('An uncaught exception has occurred in the '
                                'listener: {0}'.format(ex))

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return ("{0}(name={1}, runmode={2})".format(self.__class__.__name__,
                                                    self.name, self._mode))