def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # main cycle while True: time.sleep(5.0) if not self.is_running: break try: context.inc_action_id() for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() try: if context.objects.root_object: context.inc_action_id() self.talk_to_cloud( root_object=context.objects.root_object.definition) else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e
def check_bridge(self): """ Check containers threads, restart if some failed """ if self.bridge.ready and self.bridge.exception: context.log.debug('bridge exception: %s' % self.bridge.exception) self.bridge = gevent.spawn(Bridge().start)
def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # main cycle while True: time.sleep(5.0) if not self.is_running: break try: context.inc_action_id() for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() try: if context.objects.root_object: if context.objects.root_object.definition and context.objects.root_object.definition_healthy: context.inc_action_id() self.talk_to_cloud(root_object=context.objects.root_object.definition) else: context.log.error('Problem with root object definition, agent stopping') self.stop() else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e
class Supervisor(object): """ Agent supervisor Starts dedicated threads for each manager. """ # TODO: Unify the manager init and supervision process (object managers vs. bridge) MANAGER_CLASS = '%sManager' MANAGER_MODULE = 'amplify.agent.managers.%s.%s' def __init__(self, foreground=False, debug=False): """ Supervisor constructor :param foreground: bool run in foreground if True :param debug: bool run in debug mode if True """ # daemon specific self.stdin_path = '/dev/null' if foreground or debug: self.stdout_path = '/dev/stdout' self.stderr_path = '/dev/stderr' else: self.stdout_path = '/dev/null' self.stderr_path = '/dev/null' self.pidfile_path = context.app_config['daemon']['pid'] self.pidfile_timeout = 1 # init self.object_managers = {} self.object_manager_order = ['system', 'nginx', 'status', 'api'] self.external_object_manager_types = [] self.external_managers = {} self.external_modules = [] self.bridge = None self.bridge_object = None self.start_time = int(time.time()) self.last_cloud_talk_time = 0 self.last_cloud_talk_restart = 0 self.cloud_talk_fails = 0 self.cloud_talk_delay = 0 self.is_running = True # debug mode parameters self.debug_mode = debug self.debug_mode_time = 300 # five minutes def init_object_managers(self): """ Tries to load and create all internal object managers specified in config """ for object_type in self.object_manager_order: try: object_manager_classname = self.MANAGER_CLASS % object_type.title( ) manager_class = loader.import_class( self.MANAGER_MODULE % (object_type, object_manager_classname)) # copy object configs if object_type in self.object_managers: object_configs = copy.copy( self.object_managers[object_type].object_configs) else: object_configs = None self.object_managers[object_type] = manager_class( object_configs=object_configs) context.log.debug('loaded "%s" object manager from %s' % (object_type, manager_class)) except: context.log.error('failed to load %s object manager' % object_type, exc_info=True) def load_ext_managers(self): """ Tries to load and create all ext managers to be run during primary event loop. """ import pkgutil import inspect import amplify.ext as extensions from amplify.agent.common.util.configtypes import boolean from amplify.ext.abstract.manager import (AbstractExtManager, ExtObjectManager) from amplify.ext.abstract.config import AbstractExtConfig base_prefix = extensions.__name__ + '.' # 'amplify.ext.' def enabled_extension(modname): # not defined in the config if modname not in context.app_config.get('extensions', {}): return False # not enabled if not boolean( context.app_config.get('extensions', {}).get( modname, False)): return False # not enabled in backend if not context.capabilities[modname]: return False return True def _recursive_manager_init(inspected_package, prefix=base_prefix, top_mod=None): """ Takes a package and iterates all of the modules. If it's a module (e.g. not a package), it will look for ObjectManager class definitions and add and instance of it to the object_managers store. :param inspected_package: Package :return: List Module names inspected by the call """ passed_top_mod = top_mod is not None module_paths = dict() # iter all modules in package for _, modname, ispkg in pkgutil.iter_modules( inspected_package.__path__): # add module name as the start of an inner modpath new_mod_path = {modname: None} # set the top_mod if not passed from outer scope if not passed_top_mod: top_mod = modname # don't scan modules that aren't enabled if not enabled_extension(top_mod): context.log.debug('ignored "%s" module during ext scan' % top_mod) continue current_loc = prefix + modname # import module mod = __import__(current_loc, fromlist='dummy') if ispkg: # if it is another package, recursively call this function current_prefix = current_loc + '.' recursive_mod_path = _recursive_manager_init( mod, prefix=current_prefix, top_mod=top_mod) # add the mod path the recursive function walked to current new_mod_path[modname] = recursive_mod_path else: # otherwise if it is a module walk the objects to find ObjectManagers for obj in mod.__dict__.itervalues(): # if it is a class defintion if inspect.isclass(obj): # and it is a subclass of ObjectManager (but not # ObjectManager itself) if (issubclass(obj, ExtObjectManager) and obj.__name__ not in ExtObjectManager.__name__): # add to object_managers self.object_managers[obj.type] = obj() self.external_object_manager_types.append( obj.type) context.log.debug( 'loaded "%s" object manager from %s' % (obj.type, obj)) # or it is a subclass of AbstractManager (but not # AbstractManager itself or an ObjectManager) elif (issubclass(obj, AbstractExtManager) and obj.__name__ not in (AbstractExtManager.__name__, ExtObjectManager.__name__)): # add to external_managers self.external_managers[obj.name] = obj context.log.debug( 'loaded "%s" manager from %s' % (obj.name, obj)) # or it is a subclass of AbstractConfig (but not # AbstractConfig itself) elif (issubclass(obj, AbstractExtConfig) and obj.__name__ != AbstractExtConfig.__name__): # check that the extension is enabled in the config # add to ConfigTank config = obj() context.app_config.add(config) context.log.debug( 'loaded "%s" extension config from %s' % (obj.ext, obj)) # add now completed modpath walk to return module_paths.update(new_mod_path) return module_paths # start the recursive loading process... _recursive_manager_init(extensions) def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() # load ext managers self.load_ext_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # register exit handlers atexit.register(self.stop_everything) atexit.register(self.bridge_object.flush_metrics) # main cycle while True: time.sleep(5.0) # stop if was running in debug mode for more than five minutes if self.debug_mode: elapsed_time = int(time.time()) - self.start_time if elapsed_time > self.debug_mode_time: self.stop() else: print "Agent is running in debug mode, %s seconds to go..." % ( self.debug_mode_time - elapsed_time) if not self.is_running: break try: context.inc_action_id() # run internal object managers for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() # run external object managers external_object_managers = filter( lambda x: x not in self.object_manager_order, self.object_managers.keys()) for object_manager_name in external_object_managers: object_manager = self.object_managers[object_manager_name] object_manager.run() # manage external regular managers self.manage_external_managers() # talk to cloud try: if context.objects.root_object: if context.objects.root_object.definition and context.objects.root_object.definition_healthy: context.inc_action_id() self.talk_to_cloud(root_object=context.objects. root_object.definition) else: context.log.error( 'Problem with root object definition, agent stopping' ) self.stop() else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e def stop(self): """ Dummy for python daemon """ self.is_running = False def stop_everything(self): """ Stops all managers, collectors, etc :return: """ # stop internal managers for object_manager_name in reversed(self.object_manager_order): object_manager = self.object_managers[object_manager_name] object_manager.stop() # stop other managers ext_managers = filter( lambda name: name not in self.object_manager_order, self.object_managers.keys()) for object_manager_name in ext_managers: object_manager = self.object_managers[object_manager_name] object_manager.stop() # log agent stopped event context.log.info('agent stopped, version=%s pid=%s uuid=%s' % (context.version, context.pid, context.uuid)) def talk_to_cloud(self, root_object=None, force=False, initial=False): """ Asks cloud for config, object configs, filters, etc Applies gathered data to objects and agent config :param root_object: {} definition dict of a top object :param force: bool will skip time check :param initial: bool first run """ now = int(time.time()) if not force and (now <= (self.last_cloud_talk_time + context.app_config['cloud']['talk_interval'] + self.cloud_talk_delay) or now < context.backpressure_time): return # Handle root_object before explicitly initializing a root object if not root_object: root_object = get_root_definition() # talk to cloud try: # reset the cloud talk counter to avoid sending new requests every 5.0 seconds self.last_cloud_talk_time = int(time.time()) cloud_response = CloudResponse( context.http_client.post('agent/', data=root_object)) if self.cloud_talk_delay: self.cloud_talk_fails = 0 self.cloud_talk_delay = 0 context.log.debug( 'successful cloud connect, reset cloud talk delay') except Exception as e: if isinstance(e, HTTPError) and e.response.status_code == 503: backpressure_error = HTTP503Error(e) context.backpressure_time = int(time.time() + backpressure_error.delay) context.log.debug( 'back pressure delay %s added (next talk: %s)' % (backpressure_error.delay, context.backpressure_time)) else: self.cloud_talk_fails += 1 self.cloud_talk_delay = exponential_delay( self.cloud_talk_fails) context.log.debug( 'cloud talk delay set to %s (fails: %s)' % (self.cloud_talk_delay, self.cloud_talk_fails)) context.log.error('could not connect to cloud', exc_info=True) raise AmplifyCriticalException() # check agent version status if context.version_semver <= cloud_response.versions.obsolete: context.log.error( 'agent is obsolete - cloud will refuse updates until it is updated (version: %s, current: %s)' % (tuple_to_version(context.version_semver), tuple_to_version(cloud_response.versions.current))) self.stop() elif context.version_semver <= cloud_response.versions.old: context.log.warn( 'agent is old - update is recommended (version: %s, current: %s)' % (tuple_to_version(context.version_semver), tuple_to_version(cloud_response.versions.current))) # set capabilities for name, status in cloud_response.capabilities.iteritems(): name = ''.join([char.lower() for char in name if char.isalpha()]) context.capabilities[name] = status # update special object configs and filters changed_object_managers = set() matched_object_configs = set() for obj in cloud_response.objects: object_manager = self.object_managers.get(obj.type) if object_manager is None: continue if obj.id in object_manager.object_configs: matched_object_configs.add(obj.id) if object_manager.object_configs.get(obj.id, {}) != obj.config: context.log.info( 'object config has changed. now "%s" %s is running with: %s' % (obj.type, obj.id, pprint.pformat(obj.config))) object_manager.object_configs[obj.id] = obj.config changed_object_managers.add(obj.type) matched_object_configs.add(obj.id) # purge obsoleted object configs for object_type, object_manager in self.object_managers.iteritems(): for obj_id in object_manager.object_configs.keys(): if obj_id not in matched_object_configs: context.log.debug( 'object config has changed. now "%s" %s is running with default settings' % (object_type, obj_id)) del object_manager.object_configs[obj_id] changed_object_managers.add(object_type) # don't change api_url if a custom url was set by the user in the agent config if context.freeze_api_url: cloud_response.config.get('cloud', {}).pop('api_url', None) # global config changes def _recursive_dict_match_only_existing(kwargs1, kwargs2): for k, v1 in kwargs1.iteritems(): if isinstance(v1, dict): v2 = kwargs2.get(k, {}) if not isinstance(v2, dict): return False if not _recursive_dict_match_only_existing( v1, kwargs2.get(k, {})): return False else: if v1 != kwargs2.get(str(k)): return False return True config_changed = not _recursive_dict_match_only_existing( cloud_response.config, context.app_config.default) # apply new config context.app_config.apply(cloud_response.config, target=0) # perform restarts if config_changed or len(changed_object_managers) > 0: context.cloud_restart = True if self.bridge_object: self.bridge_object.flush_metrics() if config_changed: context.log.debug( 'app config has changed. now running with: %s' % pprint.pformat(context.app_config.config)) context.http_client.update_cloud_url() if self.object_managers: for object_manager_name in reversed( self.object_manager_order): object_manager = self.object_managers[ object_manager_name] object_manager.stop() for object_manager_name in self.external_object_manager_types: object_manager = self.object_managers[ object_manager_name] object_manager.stop() for name in self.external_managers.keys(): attr_string = '%s_manager' % name thread = getattr(self, attr_string, None) if thread is not None: thread.kill() elif len(changed_object_managers) > 0: context.log.debug('obj configs changed. changed managers: %s' % list(changed_object_managers)) for obj_type in changed_object_managers: self.object_managers[obj_type].stop() if not initial: self.init_object_managers() self.load_ext_managers() self.last_cloud_talk_restart = int(time.time()) context.cloud_restart = False def check_bridge(self): """ Check containers threads, restart if some failed """ if self.bridge.ready and self.bridge.exception: context.log.debug('bridge exception: %s' % self.bridge.exception) self.bridge = gevent.spawn(Bridge().start) def manage_external_managers(self): """ Check external managers, start/restart them if needed """ for name, manager_cls in self.external_managers.iteritems(): attr_string = '%s_manager' % name thread = getattr(self, attr_string, None) if thread is None: # start and set the manager context.log.debug('starting "%s" external manager' % manager_cls.__name__) setattr(self, attr_string, gevent.spawn(manager_cls().start)) elif thread.dead: # manager was stopped (or thread killed) context.log.debug('starting "%s" external manager after stop' % manager_cls.__name__) setattr(self, attr_string, gevent.spawn(manager_cls().start)) elif thread.ready and thread.exception: context.log.debug('restarting "%s" external manager' % manager_cls.__name__) # restart crashed managers setattr(self, attr_string, gevent.spawn(manager_cls().start))
def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() # load ext managers self.load_ext_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # register exit handlers atexit.register(self.stop_everything) atexit.register(self.bridge_object.flush_metrics) # main cycle while True: time.sleep(5.0) # stop if was running in debug mode for more than five minutes if self.debug_mode: elapsed_time = int(time.time()) - self.start_time if elapsed_time > self.debug_mode_time: self.stop() else: print "Agent is running in debug mode, %s seconds to go..." % ( self.debug_mode_time - elapsed_time) if not self.is_running: break try: context.inc_action_id() # run internal object managers for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() # run external object managers external_object_managers = filter( lambda x: x not in self.object_manager_order, self.object_managers.keys()) for object_manager_name in external_object_managers: object_manager = self.object_managers[object_manager_name] object_manager.run() # manage external regular managers self.manage_external_managers() # talk to cloud try: if context.objects.root_object: if context.objects.root_object.definition and context.objects.root_object.definition_healthy: context.inc_action_id() self.talk_to_cloud(root_object=context.objects. root_object.definition) else: context.log.error( 'Problem with root object definition, agent stopping' ) self.stop() else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e
class Supervisor(object): """ Agent supervisor Starts dedicated threads for each manager. """ # TODO: Unify the manager init and supervision process (object managers vs. bridge) MANAGER_CLASS = '%sManager' MANAGER_MODULE = 'amplify.agent.managers.%s.%s' def __init__(self, foreground=False): # daemon specific self.stdin_path = '/dev/null' if foreground: self.stdout_path = '/dev/stdout' self.stderr_path = '/dev/stderr' else: self.stdout_path = '/dev/null' self.stderr_path = '/dev/null' self.pidfile_path = context.app_config['daemon']['pid'] self.pidfile_timeout = 1 # init self.object_managers = {} self.object_manager_order = ['system', 'nginx', 'plus'] self.bridge = None self.bridge_object = None self.start_time = int(time.time()) self.last_cloud_talk_time = 0 self.is_running = True def init_object_managers(self): """ Tries to load and create all object managers specified in config """ object_managers_from_local_config = context.app_config['containers'] for object_type in self.object_manager_order: try: object_manager_classname = self.MANAGER_CLASS % object_type.title() manager_class = loader.import_class(self.MANAGER_MODULE % (object_type, object_manager_classname)) # copy object configs if object_type in self.object_managers: object_configs = copy.copy(self.object_managers[object_type].object_configs) else: object_configs = None self.object_managers[object_type] = manager_class( object_configs=object_configs ) context.log.debug('loaded "%s" object manager from %s' % (object_type, manager_class)) except: context.log.error('failed to load %s object manager' % object_type, exc_info=True) def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # main cycle while True: time.sleep(5.0) if not self.is_running: break try: context.inc_action_id() for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() try: if context.objects.root_object: context.inc_action_id() self.talk_to_cloud(root_object=context.objects.root_object.definition) else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e def stop(self): self.is_running = False if self.bridge_object: self.bridge_object.flush_metrics() for object_manager_name in reversed(self.object_manager_order): object_manager = self.object_managers[object_manager_name] object_manager.stop() def talk_to_cloud(self, root_object=None, force=False, initial=False): """ Asks cloud for config, object configs, filters, etc Applies gathered data to objects and agent config :param root_object: {} definition dict of a top object :param force: bool will skip time check :param initial: bool first run """ now = int(time.time()) if not force and now <= self.last_cloud_talk_time + context.app_config['cloud']['talk_interval']: return # talk to cloud try: self.last_cloud_talk_time = int(time.time()) cloud_response = CloudResponse( context.http_client.post('agent/', data=root_object) ) except: context.log.error('could not connect to cloud', exc_info=True) raise AmplifyCriticalException() # check agent version status if context.version_major <= float(cloud_response.versions.obsolete): context.log.error( 'agent is obsolete - cloud will refuse updates until it is updated (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current) ) self.stop() elif context.version_major <= float(cloud_response.versions.old): context.log.warn( 'agent is old - update is recommended (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current) ) # update special object configs and filters changed_object_managers = set() matched_object_configs = set() for obj in cloud_response.objects: object_manager = self.object_managers.get(obj.type) if not object_manager: continue if obj.id in object_manager.object_configs: matched_object_configs.add(obj.id) if object_manager.object_configs.get(obj.id, {}) != obj.config: context.log.info( 'object config has changed. now "%s" %s is running with: %s' % (obj.type, obj.id, pprint.pformat(obj.config)) ) object_manager.object_configs[obj.id] = obj.config changed_object_managers.add(obj.type) matched_object_configs.add(obj.id) # purge obsoleted object configs for object_type, object_manager in self.object_managers.iteritems(): for obj_id in object_manager.object_configs.keys(): if obj_id not in matched_object_configs: context.log.debug( 'object config has changed. now "%s" %s is running with default settings' % (object_type, obj_id) ) del object_manager.object_configs[obj_id] changed_object_managers.add(object_type) # global config changes config_changed = context.app_config.apply(cloud_response.config) # perform restarts if config_changed or len(changed_object_managers) > 0: context.cloud_restart = True if self.bridge_object: self.bridge_object.flush_metrics() if config_changed: context.log.debug( 'app config has changed. now running with: %s' % pprint.pformat(context.app_config.config) ) context.http_client.update_cloud_url() if self.object_managers: for object_manager_name in reversed(self.object_manager_order): object_manager = self.object_managers[object_manager_name] object_manager.stop() elif len(changed_object_managers) > 0: for obj_type in changed_object_managers: self.object_managers[obj_type].stop() if not initial: self.init_object_managers() context.cloud_restart = False def check_bridge(self): """ Check containers threads, restart if some failed """ if self.bridge.ready and self.bridge.exception: context.log.debug('bridge exception: %s' % self.bridge.exception) self.bridge = gevent.spawn(Bridge().start)
class Supervisor(object): """ Agent supervisor Starts dedicated threads for each manager. """ # TODO: Unify the manager init and supervision process (object managers vs. bridge) MANAGER_CLASS = '%sManager' MANAGER_MODULE = 'amplify.agent.managers.%s.%s' def __init__(self, foreground=False): # daemon specific self.stdin_path = '/dev/null' if foreground: self.stdout_path = '/dev/stdout' self.stderr_path = '/dev/stderr' else: self.stdout_path = '/dev/null' self.stderr_path = '/dev/null' self.pidfile_path = context.app_config['daemon']['pid'] self.pidfile_timeout = 1 # init self.object_managers = {} self.object_manager_order = ['system', 'nginx', 'plus'] self.bridge = None self.bridge_object = None self.start_time = int(time.time()) self.last_cloud_talk_time = 0 self.is_running = True def init_object_managers(self): """ Tries to load and create all object managers specified in config """ object_managers_from_local_config = context.app_config['containers'] for object_type in self.object_manager_order: try: object_manager_classname = self.MANAGER_CLASS % object_type.title( ) manager_class = loader.import_class( self.MANAGER_MODULE % (object_type, object_manager_classname)) # copy object configs if object_type in self.object_managers: object_configs = copy.copy( self.object_managers[object_type].object_configs) else: object_configs = None self.object_managers[object_type] = manager_class( object_configs=object_configs) context.log.debug('loaded "%s" object manager from %s' % (object_type, manager_class)) except: context.log.error('failed to load %s object manager' % object_type, exc_info=True) def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # main cycle while True: time.sleep(5.0) if not self.is_running: break try: context.inc_action_id() for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() try: if context.objects.root_object: context.inc_action_id() self.talk_to_cloud( root_object=context.objects.root_object.definition) else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e def stop(self): self.is_running = False if self.bridge_object: self.bridge_object.flush_metrics() for object_manager_name in reversed(self.object_manager_order): object_manager = self.object_managers[object_manager_name] object_manager.stop() def talk_to_cloud(self, root_object=None, force=False, initial=False): """ Asks cloud for config, object configs, filters, etc Applies gathered data to objects and agent config :param root_object: {} definition dict of a top object :param force: bool will skip time check :param initial: bool first run """ now = int(time.time()) if not force and now <= self.last_cloud_talk_time + context.app_config[ 'cloud']['talk_interval']: return # talk to cloud try: cloud_response = CloudResponse( context.http_client.post('agent/', data=root_object)) except: context.log.error('could not connect to cloud', exc_info=True) raise AmplifyCriticalException() # check agent version status if context.version_major <= float(cloud_response.versions.obsolete): context.log.error( 'agent is obsolete - cloud will refuse updates until it is updated (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current)) self.stop() elif context.version_major <= float(cloud_response.versions.old): context.log.warn( 'agent is old - update is recommended (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current)) # update special object configs and filters changed_object_managers = set() matched_object_configs = set() for obj in cloud_response.objects: object_manager = self.object_managers.get(obj.type) if not object_manager: continue if obj.id in object_manager.object_configs: matched_object_configs.add(obj.id) if object_manager.object_configs.get(obj.id, {}) != obj.config: context.log.info( 'object config has changed. now "%s" %s is running with: %s' % (obj.type, obj.id, pprint.pformat(obj.config))) object_manager.object_configs[obj.id] = obj.config changed_object_managers.add(obj.type) matched_object_configs.add(obj.id) # purge obsoleted object configs for object_type, object_manager in self.object_managers.iteritems(): for obj_id in object_manager.object_configs.keys(): if obj_id not in matched_object_configs: context.log.debug( 'object config has changed. now "%s" %s is running with default settings' % (object_type, obj_id)) del object_manager.object_configs[obj_id] changed_object_managers.add(object_type) # global config changes config_changed = context.app_config.apply(cloud_response.config) # perform restarts if config_changed or len(changed_object_managers) > 0: context.cloud_restart = True if self.bridge_object: self.bridge_object.flush_metrics() if config_changed: context.log.debug( 'app config has changed. now running with: %s' % pprint.pformat(context.app_config.config)) context.http_client.update_cloud_url() if self.object_managers: for object_manager_name in reversed( self.object_manager_order): object_manager = self.object_managers[ object_manager_name] object_manager.stop() elif len(changed_object_managers) > 0: for obj_type in changed_object_managers: self.object_managers[obj_type].stop() if not initial: self.init_object_managers() context.cloud_restart = False self.last_cloud_talk_time = int(time.time()) def check_bridge(self): """ Check containers threads, restart if some failed """ if self.bridge.ready and self.bridge.exception: context.log.debug('bridge exception: %s' % self.bridge.exception) self.bridge = gevent.spawn(Bridge().start)
class Supervisor(object): """ Agent supervisor Starts dedicated threads for each manager. """ # TODO: Unify the manager init and supervision process (object managers vs. bridge) MANAGER_CLASS = '%sManager' MANAGER_MODULE = 'amplify.agent.managers.%s.%s' def __init__(self, foreground=False): # daemon specific self.stdin_path = '/dev/null' if foreground: self.stdout_path = '/dev/stdout' self.stderr_path = '/dev/stderr' else: self.stdout_path = '/dev/null' self.stderr_path = '/dev/null' self.pidfile_path = context.app_config['daemon']['pid'] self.pidfile_timeout = 1 # init self.object_managers = {} self.object_manager_order = ['system', 'nginx', 'plus'] self.bridge = None self.bridge_object = None self.start_time = int(time.time()) self.last_cloud_talk_time = 0 self.cloud_talk_fails = 0 self.cloud_talk_delay = 0 self.is_running = True def init_object_managers(self): """ Tries to load and create all internal object managers specified in config """ for object_type in self.object_manager_order: try: object_manager_classname = self.MANAGER_CLASS % object_type.title( ) manager_class = loader.import_class( self.MANAGER_MODULE % (object_type, object_manager_classname)) # copy object configs if object_type in self.object_managers: object_configs = copy.copy( self.object_managers[object_type].object_configs) else: object_configs = None self.object_managers[object_type] = manager_class( object_configs=object_configs) context.log.debug('loaded "%s" object manager from %s' % (object_type, manager_class)) except: context.log.error('failed to load %s object manager' % object_type, exc_info=True) def load_ext_managers(self): """ Tries to load and create all ext managers to be run during primary event loop. """ import pkgutil import inspect import amplify.ext as extensions from amplify.agent.managers.abstract import ObjectManager from amplify.agent.common.util.configtypes import boolean base_prefix = extensions.__name__ + '.' # 'amplify.ext.' def _recursive_manager_init(inspected_package, prefix=base_prefix): """ Takes a package and iterates all of the modules. If it's a module (e.g. not a package), it will look for ObjectManager class definitions and add and instance of it to the object_managers store. :param inspected_package: Package """ # iter all modules in package for _, modname, ispkg in pkgutil.iter_modules( inspected_package.__path__): current_loc = prefix + modname # import module mod = __import__(current_loc, fromlist='dummy') if ispkg: # if it is another package, recursively call this function current_prefix = current_loc + '.' _recursive_manager_init(mod, prefix=current_prefix) else: # otherwise if it is a module walk the objects to find ObjectManagers for obj in mod.__dict__.itervalues(): # if it is a class defintion if inspect.isclass(obj): # and it is a subclass of ObjectManager (but not ObjectManager itself) if issubclass( obj, ObjectManager ) and obj.__name__ != ObjectManager.__name__: # check that the extension is enabled in the config if obj.ext in context.app_config.get('extensions', {}) and \ boolean(context.app_config.get('extensions', {}).get(obj.ext, False)): # add to object_managers self.object_managers[obj.type] = obj() context.log.debug( 'loaded "%s" object manager from %s' % (obj.type, obj)) else: context.log.debug( 'ignored "%s" object manager from %s' % (obj.type, obj)) _recursive_manager_init( extensions) # start the recursive loading process def run(self): # get correct pid context.set_pid() # set thread name current_thread().name = 'supervisor' # get initial config from cloud self.talk_to_cloud(initial=True) # init object managers self.init_object_managers() # load ext managers self.load_ext_managers() if not self.object_managers: context.log.error('no object managers configured, stopping') return # run bridge manager self.bridge_object = Bridge() self.bridge = spawn(self.bridge_object.start) # main cycle while True: time.sleep(5.0) if not self.is_running: break try: context.inc_action_id() # run internal objects for object_manager_name in self.object_manager_order: object_manager = self.object_managers[object_manager_name] object_manager.run() # run exeternal objects external_managers = filter( lambda x: x not in self.object_manager_order, self.object_managers.keys()) for object_manager_name in external_managers: object_manager = self.object_managers[object_manager_name] object_manager.run() # talk to cloud try: if context.objects.root_object: if context.objects.root_object.definition and context.objects.root_object.definition_healthy: context.inc_action_id() self.talk_to_cloud(root_object=context.objects. root_object.definition) else: context.log.error( 'Problem with root object definition, agent stopping' ) self.stop() else: pass # context.default_log.debug('No root object defined during supervisor main run') except AmplifyCriticalException: pass self.check_bridge() except OSError as e: if e.errno == 12: # OSError errno 12 is a memory error (unable to allocate, out of memory, etc.) context.log.error('OSError: [Errno %s] %s' % (e.errno, e.message), exc_info=True) continue else: raise e def stop(self): self.is_running = False if self.bridge_object: self.bridge_object.flush_metrics() for object_manager_name in reversed(self.object_manager_order): object_manager = self.object_managers[object_manager_name] object_manager.stop() def talk_to_cloud(self, root_object=None, force=False, initial=False): """ Asks cloud for config, object configs, filters, etc Applies gathered data to objects and agent config :param root_object: {} definition dict of a top object :param force: bool will skip time check :param initial: bool first run """ now = int(time.time()) if not force and (now <= (self.last_cloud_talk_time + context.app_config['cloud']['talk_interval'] + self.cloud_talk_delay) or now < context.backpressure_time): return # Handle root_object before explicitly initializing a root object if not root_object: root_object = get_root_definition() # talk to cloud try: # reset the cloud talk counter to avoid sending new requests every 5.0 seconds self.last_cloud_talk_time = int(time.time()) cloud_response = CloudResponse( context.http_client.post('agent/', data=root_object)) if self.cloud_talk_delay: self.cloud_talk_fails = 0 self.cloud_talk_delay = 0 context.log.debug( 'successful cloud connect, reset cloud talk delay') except Exception as e: if isinstance(e, HTTPError) and e.response.status_code == 503: backpressure_error = HTTP503Error(e) context.backpressure_time = int(time.time() + backpressure_error.delay) context.log.debug( 'back pressure delay %s added (next talk: %s)' % (backpressure_error.delay, context.backpressure_time)) else: self.cloud_talk_fails += 1 self.cloud_talk_delay = exponential_delay( self.cloud_talk_fails) context.log.debug( 'cloud talk delay set to %s (fails: %s)' % (self.cloud_talk_delay, self.cloud_talk_fails)) context.log.error('could not connect to cloud', exc_info=True) raise AmplifyCriticalException() # check agent version status if context.version_major <= float(cloud_response.versions.obsolete): context.log.error( 'agent is obsolete - cloud will refuse updates until it is updated (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current)) self.stop() elif context.version_major <= float(cloud_response.versions.old): context.log.warn( 'agent is old - update is recommended (version: %s, current: %s)' % (context.version_major, cloud_response.versions.current)) # update special object configs and filters changed_object_managers = set() matched_object_configs = set() for obj in cloud_response.objects: object_manager = self.object_managers.get(obj.type) if not object_manager: continue if obj.id in object_manager.object_configs: matched_object_configs.add(obj.id) if object_manager.object_configs.get(obj.id, {}) != obj.config: context.log.info( 'object config has changed. now "%s" %s is running with: %s' % (obj.type, obj.id, pprint.pformat(obj.config))) object_manager.object_configs[obj.id] = obj.config changed_object_managers.add(obj.type) matched_object_configs.add(obj.id) # purge obsoleted object configs for object_type, object_manager in self.object_managers.iteritems(): for obj_id in object_manager.object_configs.keys(): if obj_id not in matched_object_configs: context.log.debug( 'object config has changed. now "%s" %s is running with default settings' % (object_type, obj_id)) del object_manager.object_configs[obj_id] changed_object_managers.add(object_type) # don't change api_url if a custom url was set by the user in the agent config if context.freeze_api_url: cloud_response.config.get('cloud', {}).pop('api_url', None) # global config changes config_changed = context.app_config.apply(cloud_response.config) # perform restarts if config_changed or len(changed_object_managers) > 0: context.cloud_restart = True if self.bridge_object: self.bridge_object.flush_metrics() if config_changed: context.log.debug( 'app config has changed. now running with: %s' % pprint.pformat(context.app_config.config)) context.http_client.update_cloud_url() if self.object_managers: for object_manager_name in reversed( self.object_manager_order): object_manager = self.object_managers[ object_manager_name] object_manager.stop() elif len(changed_object_managers) > 0: for obj_type in changed_object_managers: self.object_managers[obj_type].stop() if not initial: self.init_object_managers() context.cloud_restart = False def check_bridge(self): """ Check containers threads, restart if some failed """ if self.bridge.ready and self.bridge.exception: context.log.debug('bridge exception: %s' % self.bridge.exception) self.bridge = gevent.spawn(Bridge().start)