class HighAvailabilityAgent(ResourceAgent):
    """Agent to manage high availability processes

    on_init() reads the ``highavailability`` section of the agent config,
    builds a HighAvailabilityCore from it and schedules the core's
    ``apply_policy`` via ``looping_call`` every ``policy_interval``.
    """

    def __init__(self):
        log.debug("HighAvailabilityAgent init")
        ResourceAgent.__init__(self)
        # Both are populated by on_init(). They are pre-set here because
        # on_init() returns early when HighAvailabilityCore is unavailable,
        # and on_quit() must still be able to run in that case.
        self.core = None
        self.policy_thread = None

    def on_init(self):
        """Build the HA core from config and start the periodic policy call.

        Only logs an error and returns when HighAvailabilityCore is falsy.

        Raises:
            Exception: if no policy name is configured, or the configured
                name is not a key of ``policy.policy_map``.
        """
        if not HighAvailabilityCore:
            msg = "HighAvailabilityCore isn't available. Use production.cfg buildout"
            log.error(msg)
            return
        log.debug("HighAvailabilityCore Pyon on_init")

        policy_name = self.CFG.get_safe("highavailability.policy.name")
        if policy_name is None:
            msg = "HA service requires a policy name at CFG.highavailability.policy.name"
            raise Exception(msg)
        try:
            # policy_map is looked up with the lower-cased name.
            self.policy = policy.policy_map[policy_name.lower()]
        except KeyError:
            raise Exception("HA Service doesn't support '%s' policy" % policy_name)

        self.policy_interval = self.CFG.get_safe(
            "highavailability.policy.interval", DEFAULT_INTERVAL)

        cfg = self.CFG.get_safe("highavailability")
        pds = self.CFG.get_safe("highavailability.process_dispatchers", [])
        process_spec = self.CFG.get_safe("highavailability.process_spec")

        # TODO: Allow other core class?
        self.core = HighAvailabilityCore(
            cfg, ProcessDispatcherSimpleAPIClient, pds, process_spec, self.policy)
        self.policy_thread = looping_call(
            self.policy_interval, self.core.apply_policy)

    def on_quit(self):
        """Stop the periodic policy call, if one was ever started."""
        if self.policy_thread is not None:
            self.policy_thread.kill(block=True, timeout=3)

    def rcmd_reconfigure_policy(self, new_policy):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy: parameters of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy)

    def rcmd_status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        return self.core.status()

    def rcmd_dump(self):
        """Service operation: Return whatever the core's ``dump()`` produces."""
        return self.core.dump()
class HighAvailabilityAgent(SimpleResourceAgent):
    """Agent to manage high availability processes

    on_init() wires together a Service resource, an HAProcessControl and a
    HighAvailabilityCore from the ``highavailability`` config section.
    on_start() spawns a single greenlet that applies the policy whenever
    ``policy_event`` is set or ``policy_interval`` seconds elapse.
    """

    def __init__(self):
        SimpleResourceAgent.__init__(self)
        # Everything below is populated by on_init()/on_start(). Pre-setting
        # the attributes lets on_quit() run safely even when on_init()
        # returned early or on_start() was never reached.
        self.dashi_handler = None
        self.service_id = None
        self.policy_thread = None
        self.policy_event = None
        self.control = None
        self.core = None

    def on_init(self):
        """Read configuration and construct the control, core and dashi handler.

        Only logs an error and returns when HighAvailabilityCore is falsy.

        Raises:
            Exception: if the number of configured process dispatchers is not
                exactly one, if the process definition or the policy name
                cannot be resolved, or if dashi messaging is enabled but its
                name/URI cannot be determined.
        """
        if not HighAvailabilityCore:
            msg = "HighAvailabilityCore isn't available. Use autolaunch.cfg buildout"
            log.error(msg)
            return

        cfg = self.CFG.get_safe("highavailability")

        # use default PD name as the sole PD if none are provided in config
        self.pds = self.CFG.get_safe("highavailability.process_dispatchers",
                                     [ProcessDispatcherService.name])
        if len(self.pds) != 1:
            raise Exception("HA Service doesn't support multiple Process Dispatchers")

        self.process_definition_id, self.process_definition = \
            self._get_process_definition()

        self.process_configuration = self.CFG.get_safe(
            "highavailability.process_configuration")
        aggregator_config = _get_aggregator_config(self.CFG)

        self.service_id, self.service_name = self._register_service()
        self.policy_event = Event()

        # A policy already stored on the Service resource takes precedence
        # over config. The truthiness test also covers a stored policy of
        # None, which would otherwise fail on .get().
        stored_policy = self._stored_policy
        if stored_policy:
            policy_name = stored_policy.get('name')
            policy_parameters = stored_policy.get('parameters')
        else:
            policy_name = self.CFG.get_safe("highavailability.policy.name")
            policy_parameters = self.CFG.get_safe(
                "highavailability.policy.parameters")
        self._validate_policy_name(policy_name)
        self.policy_name = policy_name.lower()
        self.policy_parameters = policy_parameters

        self.policy_interval = self.CFG.get_safe(
            "highavailability.policy.interval", DEFAULT_INTERVAL)

        self.logprefix = "HA Agent (%s): " % self.service_name

        # The control is handed policy_event.set as its callback, so it can
        # wake the policy loop.
        self.control = HAProcessControl(self.pds[0],
                                        self.container.resource_registry,
                                        self.service_id,
                                        self.policy_event.set,
                                        logprefix=self.logprefix)

        self.core = HighAvailabilityCore(
            cfg, self.control, self.pds, self.policy_name,
            process_definition_id=self.process_definition_id,
            parameters=self.policy_parameters,
            process_configuration=self.process_configuration,
            aggregator_config=aggregator_config,
            name=self.service_name)

        dashi_messaging = self.CFG.get_safe("highavailability.dashi_messaging", False)
        if dashi_messaging:
            dashi_name = self.CFG.get_safe("highavailability.dashi_name")
            if not dashi_name:
                raise Exception("dashi_name unknown")

            dashi_uri = self.CFG.get_safe("highavailability.dashi_uri")
            if not dashi_uri:
                # No explicit URI: build one from the container's AMQP settings.
                rabbit_host = self.CFG.get_safe("server.amqp.host")
                rabbit_user = self.CFG.get_safe("server.amqp.username")
                rabbit_pass = self.CFG.get_safe("server.amqp.password")
                if not (rabbit_host and rabbit_user and rabbit_pass):
                    raise Exception("cannot form dashi URI")
                dashi_uri = "amqp://%s:%s@%s/" % (rabbit_user, rabbit_pass,
                                                  rabbit_host)

            dashi_exchange = self.CFG.get_safe("highavailability.dashi_exchange")
            if not dashi_exchange:
                dashi_exchange = get_sys_name()

            self.dashi_handler = HADashiHandler(self, dashi_name, dashi_uri,
                                                dashi_exchange)
        else:
            self.dashi_handler = None

    def _get_process_definition(self):
        """Resolve the managed process definition from config.

        A configured ``process_definition_id`` is read through the process
        dispatcher; otherwise ``process_definition_name`` is looked up in the
        resource registry and must match exactly one resource.

        @return: (process_definition_id, definition)
        Raises:
            Exception: if neither id nor name is configured, or the name
                matches zero or several definitions.
        """
        process_definition_id = self.CFG.get_safe(
            "highavailability.process_definition_id")
        process_definition_name = self.CFG.get_safe(
            "highavailability.process_definition_name")

        if process_definition_id:
            pd_name = self.pds[0]
            pd = ProcessDispatcherServiceClient(to_name=pd_name)
            definition = pd.read_process_definition(process_definition_id)

        elif process_definition_name:
            definitions, _ = self.container.resource_registry.find_resources(
                restype="ProcessDefinition", name=process_definition_name)
            if len(definitions) == 0:
                raise Exception("Process definition with name '%s' not found" %
                                process_definition_name)
            elif len(definitions) > 1:
                raise Exception("multiple process definitions found with name '%s'" %
                                process_definition_name)
            definition = definitions[0]
            process_definition_id = definition._id

        else:
            raise Exception("HA Agent requires either process definition ID or name")

        return process_definition_id, definition

    def on_start(self):
        """Start dashi (if configured), the control, and the policy greenlet."""
        if self.dashi_handler:
            self.dashi_handler.start()

        self.control.start()

        # override the core's list of currently managed processes. This is to support
        # restart of an HAAgent.
        self.core.set_managed_upids(self.control.get_managed_upids())

        self.policy_thread = gevent.spawn(self._policy_thread_loop)

        # kickstart the policy once. future invocations will happen via event callbacks.
        self.policy_event.set()

    def on_quit(self):
        """Stop the control, the policy greenlet and dashi, skipping any that never started."""
        if self.control is not None:
            self.control.stop()
        if self.policy_thread is not None:
            self.policy_thread.kill(block=True, timeout=3)
        if self.dashi_handler:
            self.dashi_handler.stop()

        # DL: do we ever want to remove this object?
        #self._unregister_service()

    def _register_service(self):
        """Find or create the Service resource named after the process definition.

        An existing Service with that name is reused (the first one if there
        are several). Otherwise a new one is created and associated with the
        same-named ServiceDefinition, when one exists.

        @return: (service_id, service_name)
        """
        definition = self.process_definition
        registry = self.container.resource_registry

        existing_services, _ = registry.find_resources(
            restype="Service", name=definition.name)

        if len(existing_services) > 0:
            if len(existing_services) > 1:
                log.warning("There is more than one service object for %s. Using the first one" % definition.name)
            service_id = existing_services[0]._id
        else:
            svc_obj = Service(name=definition.name, exchange_name=definition.name)
            service_id, _ = registry.create(svc_obj)

            svcdefs, _ = registry.find_resources(
                restype="ServiceDefinition", name=definition.name)

            if svcdefs:
                try:
                    registry.create_association(
                        service_id, "hasServiceDefinition", svcdefs[0]._id)
                except BadRequest:
                    log.warn("Failed to associate %s Service and ServiceDefinition. It probably exists.",
                             definition.name)
            else:
                log.error("Cannot find ServiceDefinition resource for %s",
                          definition.name)

        return service_id, definition.name

    def _unregister_service(self):
        """Delete the Service resource and its associations, if one is registered."""
        if not self.service_id:
            log.error("No service id. Cannot unregister service")
            return
        self.container.resource_registry.delete(self.service_id,
                                                del_associations=True)

    def _policy_thread_loop(self):
        """Single thread runs policy loops, to prevent races
        """
        while True:
            # wait until our event is set, up to policy_interval seconds
            self.policy_event.wait(self.policy_interval)

            if self.policy_event.is_set():
                self.policy_event.clear()
                log.debug("%sapplying policy due to event", self.logprefix)
            else:
                # on a regular basis, we check for the current state of each process.
                # this is essentially a hedge against bugs in the HAAgent, or in the
                # ION events system that could prevent us from seeing state changes
                # of processes.
                log.debug("%sapplying policy due to timer. Reloading process cache first.",
                          self.logprefix)
                try:
                    self.control.reload_processes()
                except (Exception, gevent.Timeout):
                    log.warn("%sFailed to reload processes from PD. Will retry later.",
                             self.logprefix, exc_info=True)

            try:
                self._apply_policy()
            except (Exception, gevent.Timeout):
                log.warn("%sFailed to apply policy. Will retry later.",
                         self.logprefix, exc_info=True)

    def _validate_policy_name(self, policy_name):
        """Raise Exception unless ``policy_name`` is set and is a key of ``policy.policy_map``."""
        if policy_name is None:
            msg = "HA service requires a policy name at CFG.highavailability.policy.name"
            raise Exception(msg)
        try:
            policy.policy_map[policy_name.lower()]
        except KeyError:
            raise Exception("HA Service doesn't support '%s' policy" % policy_name)

    @property
    def _policy_dict(self):
        """The core's current policy as a ``{'name', 'parameters'}`` dict."""
        policy_dict = {
            'name': self.core.policy_type,
            'parameters': self.core.policy.parameters
        }
        return policy_dict

    @property
    def _stored_policy(self):
        """The ``policy`` attribute of the Service resource, read from the registry."""
        service = self.container.resource_registry.read(self.service_id)
        return service.policy

    def _apply_policy(self):
        """Apply the core policy, then mirror state and policy onto the Service resource.

        Failures while updating the Service resource are logged and
        swallowed. Failures from ``core.apply_policy()`` propagate.
        """
        self.core.apply_policy()

        try:
            new_service_state = _core_hastate_to_service_state(self.core.status())
            new_policy = self._policy_dict
            service = self.container.resource_registry.read(self.service_id)

            # Only write back to the registry when something actually changed.
            update_service = False
            if service.state != new_service_state:
                service.state = new_service_state
                update_service = True
            if service.policy != new_policy:
                service.policy = new_policy
                update_service = True

            if update_service:
                self.container.resource_registry.update(service)
        except Exception:
            log.warn("%sProblem when updating Service state", self.logprefix,
                     exc_info=True)

    def rcmd_reconfigure_policy(self, new_policy_params, new_policy_name=None):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy_params: parameters of policy
        @param new_policy_name: name of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy_params, new_policy_name)
        #trigger policy thread to wake up
        self.policy_event.set()

    def rcmd_status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        return self.core.status()

    def rcmd_dump(self):
        """Service operation: Return the core's dump with ``service_id`` added."""
        dump = self.core.dump()
        dump['service_id'] = self.service_id
        return dump
class HighAvailabilityAgent(SimpleResourceAgent):
    """Agent to manage high availability processes

    on_init() wires together a Service resource, an HAProcessControl and a
    HighAvailabilityCore from the ``highavailability`` config section.
    on_start() spawns a single greenlet that applies the policy whenever
    ``policy_event`` is set or ``policy_interval`` seconds elapse, until
    ``_policy_loop_event`` is set by on_quit().
    """

    def __init__(self):
        SimpleResourceAgent.__init__(self)
        # Everything below is populated by on_init()/on_start(). Pre-setting
        # the attributes lets on_quit() run safely even when on_init()
        # returned early or on_start() was never reached.
        self.dashi_handler = None
        self.service_id = None
        self.policy_thread = None
        self.policy_event = None
        self.control = None
        self.core = None
        # Set by on_quit() to ask the policy loop to exit.
        self._policy_loop_event = Event()

    def on_init(self):
        """Read configuration and construct the control, core and dashi handler.

        Only logs an error and returns when HighAvailabilityCore is falsy.

        Raises:
            Exception: if the number of configured process dispatchers is not
                exactly one, if the process definition or the policy name
                cannot be resolved, or if dashi messaging is enabled but its
                name/URI cannot be determined.
        """
        if not HighAvailabilityCore:
            msg = "HighAvailabilityCore isn't available. Use autolaunch.cfg buildout"
            log.error(msg)
            return

        cfg = self.CFG.get_safe("highavailability")

        # use default PD name as the sole PD if none are provided in config
        self.pds = self.CFG.get_safe("highavailability.process_dispatchers",
                                     [ProcessDispatcherService.name])
        if len(self.pds) != 1:
            raise Exception(
                "HA Service doesn't support multiple Process Dispatchers")

        self.process_definition_id, self.process_definition = \
            self._get_process_definition()

        self.process_configuration = self.CFG.get_safe(
            "highavailability.process_configuration")
        aggregator_config = _get_aggregator_config(self.CFG)

        self.service_id, self.service_name = self._register_service()
        self.policy_event = Event()

        # A policy already stored on the Service resource takes precedence
        # over config. The truthiness test also covers a stored policy of
        # None, which would otherwise fail on .get().
        stored_policy = self._stored_policy
        if stored_policy:
            policy_name = stored_policy.get('name')
            policy_parameters = stored_policy.get('parameters')
        else:
            policy_name = self.CFG.get_safe("highavailability.policy.name")
            policy_parameters = self.CFG.get_safe(
                "highavailability.policy.parameters")
        self._validate_policy_name(policy_name)
        self.policy_name = policy_name.lower()
        self.policy_parameters = policy_parameters

        self.policy_interval = self.CFG.get_safe(
            "highavailability.policy.interval", DEFAULT_INTERVAL)

        self.logprefix = "HA Agent (%s): " % self.service_name

        # The control is handed policy_event.set as its callback, so it can
        # wake the policy loop.
        self.control = HAProcessControl(self.pds[0],
                                        self.container.resource_registry,
                                        self.service_id,
                                        self.policy_event.set,
                                        logprefix=self.logprefix)

        self.core = HighAvailabilityCore(
            cfg, self.control, self.pds, self.policy_name,
            process_definition_id=self.process_definition_id,
            parameters=self.policy_parameters,
            process_configuration=self.process_configuration,
            aggregator_config=aggregator_config,
            name=self.service_name)

        dashi_messaging = self.CFG.get_safe("highavailability.dashi_messaging",
                                            False)
        if dashi_messaging:
            dashi_name = self.CFG.get_safe("highavailability.dashi_name")
            if not dashi_name:
                raise Exception("dashi_name unknown")

            dashi_uri = self.CFG.get_safe("highavailability.dashi_uri")
            if not dashi_uri:
                # No explicit URI: build one from the container's AMQP settings.
                rabbit_host = self.CFG.get_safe("server.amqp.host")
                rabbit_user = self.CFG.get_safe("server.amqp.username")
                rabbit_pass = self.CFG.get_safe("server.amqp.password")
                if not (rabbit_host and rabbit_user and rabbit_pass):
                    raise Exception("cannot form dashi URI")
                dashi_uri = "amqp://%s:%s@%s/" % (rabbit_user, rabbit_pass,
                                                  rabbit_host)

            dashi_exchange = self.CFG.get_safe(
                "highavailability.dashi_exchange")
            if not dashi_exchange:
                dashi_exchange = get_sys_name()

            self.dashi_handler = HADashiHandler(self, dashi_name, dashi_uri,
                                                dashi_exchange)
        else:
            self.dashi_handler = None

    def _get_process_definition(self):
        """Resolve the managed process definition from config.

        A configured ``process_definition_id`` is read through the process
        dispatcher; otherwise ``process_definition_name`` is looked up in the
        resource registry and must match exactly one resource.

        @return: (process_definition_id, definition)
        Raises:
            Exception: if neither id nor name is configured, or the name
                matches zero or several definitions.
        """
        process_definition_id = self.CFG.get_safe(
            "highavailability.process_definition_id")
        process_definition_name = self.CFG.get_safe(
            "highavailability.process_definition_name")

        if process_definition_id:
            pd_name = self.pds[0]
            pd = ProcessDispatcherServiceClient(to_name=pd_name)
            definition = pd.read_process_definition(process_definition_id)

        elif process_definition_name:
            definitions, _ = self.container.resource_registry.find_resources(
                restype="ProcessDefinition", name=process_definition_name)
            if len(definitions) == 0:
                raise Exception("Process definition with name '%s' not found" %
                                process_definition_name)
            elif len(definitions) > 1:
                raise Exception(
                    "multiple process definitions found with name '%s'" %
                    process_definition_name)
            definition = definitions[0]
            process_definition_id = definition._id

        else:
            raise Exception(
                "HA Agent requires either process definition ID or name")

        return process_definition_id, definition

    def on_start(self):
        """Start dashi (if configured), the control, and the policy greenlet."""
        if self.dashi_handler:
            self.dashi_handler.start()

        self.control.start()

        # override the core's list of currently managed processes. This is to support
        # restart of an HAAgent.
        self.core.set_managed_upids(self.control.get_managed_upids())

        self.policy_thread = gevent.spawn(self._policy_thread_loop)

        # kickstart the policy once. future invocations will happen via event callbacks.
        self.policy_event.set()

    def on_quit(self):
        """Stop the control, the policy greenlet and dashi, skipping any that never started.

        The policy loop is asked to exit cooperatively first, then killed if
        it has not finished within the join timeout.
        """
        if self.control is not None:
            self.control.stop()

        # Request loop exit, then wake the loop in case it is parked in
        # policy_event.wait(policy_interval). Without the wake-up the join
        # below could take a full policy interval.
        self._policy_loop_event.set()
        if self.policy_event is not None:
            self.policy_event.set()

        if self.policy_thread is not None:
            # Bounded join so a stuck policy run cannot hang shutdown; kill
            # is the fallback for a greenlet that did not exit in time.
            self.policy_thread.join(timeout=3)
            self.policy_thread.kill(block=True, timeout=3)

        if self.dashi_handler:
            self.dashi_handler.stop()

        # DL: do we ever want to remove this object?
        #self._unregister_service()

    def _register_service(self):
        """Find or create the Service resource named after the process definition.

        An existing Service with that name is reused (the first one if there
        are several). Otherwise a new one is created and associated with the
        same-named ServiceDefinition, when one exists.

        @return: (service_id, service_name)
        """
        definition = self.process_definition
        registry = self.container.resource_registry

        existing_services, _ = registry.find_resources(
            restype="Service", name=definition.name)

        if len(existing_services) > 0:
            if len(existing_services) > 1:
                log.warning(
                    "There is more than one service object for %s. Using the first one"
                    % definition.name)
            service_id = existing_services[0]._id
        else:
            svc_obj = Service(name=definition.name,
                              exchange_name=definition.name)
            service_id, _ = registry.create(svc_obj)

            svcdefs, _ = registry.find_resources(
                restype="ServiceDefinition", name=definition.name)

            if svcdefs:
                try:
                    registry.create_association(
                        service_id, "hasServiceDefinition", svcdefs[0]._id)
                except BadRequest:
                    log.warn(
                        "Failed to associate %s Service and ServiceDefinition. It probably exists.",
                        definition.name)
            else:
                log.error("Cannot find ServiceDefinition resource for %s",
                          definition.name)

        return service_id, definition.name

    def _unregister_service(self):
        """Delete the Service resource and its associations, if one is registered."""
        if not self.service_id:
            log.error("No service id. Cannot unregister service")
            return
        self.container.resource_registry.delete(self.service_id,
                                                del_associations=True)

    def _policy_thread_loop(self):
        """Single thread runs policy loops, to prevent races
        """
        while not self._policy_loop_event.wait(timeout=0.1):
            # wait until our event is set, up to policy_interval seconds
            self.policy_event.wait(self.policy_interval)

            # on_quit() sets policy_event purely to wake us up; do not run
            # another policy pass once shutdown has been requested.
            if self._policy_loop_event.is_set():
                break

            if self.policy_event.is_set():
                self.policy_event.clear()
                log.debug("%sapplying policy due to event", self.logprefix)
            else:
                # on a regular basis, we check for the current state of each process.
                # this is essentially a hedge against bugs in the HAAgent, or in the
                # ION events system that could prevent us from seeing state changes
                # of processes.
                log.debug(
                    "%sapplying policy due to timer. Reloading process cache first.",
                    self.logprefix)
                try:
                    self.control.reload_processes()
                except (Exception, gevent.Timeout):
                    log.warn(
                        "%sFailed to reload processes from PD. Will retry later.",
                        self.logprefix, exc_info=True)

            try:
                self._apply_policy()
            except (Exception, gevent.Timeout):
                log.warn("%sFailed to apply policy. Will retry later.",
                         self.logprefix, exc_info=True)

    def _validate_policy_name(self, policy_name):
        """Raise Exception unless ``policy_name`` is set and is a key of ``policy.policy_map``."""
        if policy_name is None:
            msg = "HA service requires a policy name at CFG.highavailability.policy.name"
            raise Exception(msg)
        try:
            policy.policy_map[policy_name.lower()]
        except KeyError:
            raise Exception("HA Service doesn't support '%s' policy" %
                            policy_name)

    @property
    def _policy_dict(self):
        """The core's current policy as a ``{'name', 'parameters'}`` dict."""
        policy_dict = {
            'name': self.core.policy_type,
            'parameters': self.core.policy.parameters
        }
        return policy_dict

    @property
    def _stored_policy(self):
        """The ``policy`` attribute of the Service resource, read from the registry."""
        service = self.container.resource_registry.read(self.service_id)
        return service.policy

    def _apply_policy(self):
        """Apply the core policy, then mirror state and policy onto the Service resource.

        Failures while updating the Service resource are logged and
        swallowed. Failures from ``core.apply_policy()`` propagate.
        """
        self.core.apply_policy()

        try:
            new_service_state = _core_hastate_to_service_state(
                self.core.status())
            new_policy = self._policy_dict
            service = self.container.resource_registry.read(self.service_id)

            # Only write back to the registry when something actually changed.
            update_service = False
            if service.state != new_service_state:
                service.state = new_service_state
                update_service = True
            if service.policy != new_policy:
                service.policy = new_policy
                update_service = True

            if update_service:
                self.container.resource_registry.update(service)
        except Exception:
            log.warn("%sProblem when updating Service state", self.logprefix,
                     exc_info=True)

    def rcmd_reconfigure_policy(self, new_policy_params, new_policy_name=None):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy_params: parameters of policy
        @param new_policy_name: name of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy_params, new_policy_name)
        #trigger policy thread to wake up
        self.policy_event.set()

    def rcmd_status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        return self.core.status()

    def rcmd_dump(self):
        """Service operation: Return the core's dump with ``service_id`` added."""
        dump = self.core.dump()
        dump['service_id'] = self.service_id
        return dump
class HighAvailabilityAgent(SimpleResourceAgent):
    """Agent to manage high availability processes

    on_init() registers a Service resource for the configured process
    definition, builds a HighAvailabilityCore from the ``highavailability``
    config section and schedules the core's ``apply_policy`` via
    ``looping_call`` every ``policy_interval``.
    """

    def __init__(self):
        log.debug("HighAvailabilityAgent init")
        SimpleResourceAgent.__init__(self)
        self.dashi_handler = None
        self.service_id = None
        # Populated by on_init(). Pre-set because on_init() returns early
        # when HighAvailabilityCore is unavailable and on_quit() must still
        # be able to run in that case.
        self.policy_thread = None

    def on_init(self):
        """Read configuration, register the service and construct the core.

        Only logs an error and returns when HighAvailabilityCore is falsy.

        Raises:
            Exception: if no policy name is configured, the name is not a key
                of ``policy.policy_map``, or dashi messaging is enabled but
                its name/URI cannot be determined.
        """
        if not HighAvailabilityCore:
            msg = "HighAvailabilityCore isn't available. Use autolaunch.cfg buildout"
            log.error(msg)
            return
        log.debug("HighAvailabilityCore Pyon on_init")

        policy_name = self.CFG.get_safe("highavailability.policy.name")
        if policy_name is None:
            msg = "HA service requires a policy name at CFG.highavailability.policy.name"
            raise Exception(msg)
        try:
            # policy_map is looked up with the lower-cased name.
            self.policy = policy.policy_map[policy_name.lower()]
        except KeyError:
            raise Exception("HA Service doesn't support '%s' policy" % policy_name)

        policy_parameters = self.CFG.get_safe("highavailability.policy.parameters")

        self.policy_interval = self.CFG.get_safe(
            "highavailability.policy.interval", DEFAULT_INTERVAL)

        cfg = self.CFG.get_safe("highavailability")

        # use default PD name as the sole PD if none are provided in config
        self.pds = self.CFG.get_safe("highavailability.process_dispatchers",
                                     [ProcessDispatcherService.name])

        self.process_definition_id = self.CFG.get_safe(
            "highavailability.process_definition_id")
        self.process_configuration = self.CFG.get_safe(
            "highavailability.process_configuration")
        aggregator_config = self.CFG.get_safe("highavailability.aggregator")

        # May be None: _register_service() returns nothing when it cannot
        # register (missing definition id or no process dispatcher).
        self.service_id = self._register_service()

        # TODO: Allow other core class?
        self.core = HighAvailabilityCore(
            cfg, ProcessDispatcherSimpleAPIClient, self.pds, self.policy,
            process_definition_id=self.process_definition_id,
            parameters=policy_parameters,
            process_configuration=self.process_configuration,
            aggregator_config=aggregator_config,
            pd_client_kwargs={'container': self.container,
                              'service_id': self.service_id})

        self.policy_thread = looping_call(self.policy_interval,
                                          self.core.apply_policy)

        dashi_messaging = self.CFG.get_safe("highavailability.dashi_messaging", False)
        if dashi_messaging:
            dashi_name = self.CFG.get_safe("highavailability.dashi_name")
            if not dashi_name:
                raise Exception("dashi_name unknown")

            dashi_uri = self.CFG.get_safe("highavailability.dashi_uri")
            if not dashi_uri:
                # No explicit URI: build one from the container's AMQP settings.
                rabbit_host = self.CFG.get_safe("server.amqp.host")
                rabbit_user = self.CFG.get_safe("server.amqp.username")
                rabbit_pass = self.CFG.get_safe("server.amqp.password")
                if not (rabbit_host and rabbit_user and rabbit_pass):
                    raise Exception("cannot form dashi URI")
                dashi_uri = "amqp://%s:%s@%s/" % (rabbit_user, rabbit_pass,
                                                  rabbit_host)

            dashi_exchange = self.CFG.get_safe("highavailability.dashi_exchange")
            if not dashi_exchange:
                dashi_exchange = get_sys_name()

            self.dashi_handler = HADashiHandler(self, dashi_name, dashi_uri,
                                                dashi_exchange)
        else:
            self.dashi_handler = None

    def on_start(self):
        """Start the dashi handler, if one was configured."""
        if self.dashi_handler:
            self.dashi_handler.start()

    def on_quit(self):
        """Stop the policy call and dashi (when present), then unregister the service."""
        if self.policy_thread is not None:
            self.policy_thread.kill(block=True, timeout=3)
        if self.dashi_handler:
            self.dashi_handler.stop()

        self._unregister_service()

    def _register_service(self):
        """Find or create the Service resource named after the process definition.

        The definition is read through the first configured process
        dispatcher. An existing Service with that name is reused (the first
        one if there are several). Otherwise a new one is created and
        associated with the same-named ServiceDefinition, when one exists.

        @return: the service id, or None when there is no process definition
            id or no process dispatcher to read it from.
        """
        if not self.process_definition_id:
            log.error("No process definition id. Not registering service")
            return

        if not self.pds:
            log.error("Must have at least one PD available to register a service")
            return

        pd_name = self.pds[0]
        pd = ProcessDispatcherServiceClient(to_name=pd_name)
        definition = pd.read_process_definition(self.process_definition_id)

        registry = self.container.resource_registry
        existing_services, _ = registry.find_resources(
            restype="Service", name=definition.name)

        if len(existing_services) > 0:
            if len(existing_services) > 1:
                log.warning("There is more than one service object for %s. Using the first one" % definition.name)
            service_id = existing_services[0]._id
        else:
            svc_obj = Service(name=definition.name, exchange_name=definition.name)
            service_id, _ = registry.create(svc_obj)

        svcdefs, _ = registry.find_resources(
            restype="ServiceDefinition", name=definition.name)

        if svcdefs:
            registry.create_association(
                service_id, "hasServiceDefinition", svcdefs[0]._id)
        else:
            log.error("Cannot find ServiceDefinition resource for %s",
                      definition.name)

        return service_id

    def _unregister_service(self):
        """Delete the Service resource and its associations, if one is registered."""
        if not self.service_id:
            log.error("No service id. Cannot unregister service")
            return
        self.container.resource_registry.delete(self.service_id,
                                                del_associations=True)

    def rcmd_reconfigure_policy(self, new_policy):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy: parameters of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy)

    def rcmd_status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        return self.core.status()

    def rcmd_dump(self):
        """Service operation: Return the core's dump with ``service_id`` added."""
        dump = self.core.dump()
        dump['service_id'] = self.service_id
        return dump