def test_error_caught(self):
    """Check that the loop keeps calling ``looper`` although it raises.

    ``looper`` raises ``self.raise_this`` on every call; the test passes
    once at least three calls have been recorded.
    """
    self.loop = LoopingCall(self.looper)
    self.raise_this = Exception("too many sandwiches")
    self.loop.start(0)

    with self.condition:
        # Bounded wait (roughly 5 seconds in total). An untimed wait() would
        # hang the whole test run, instead of failing, if the loop ever
        # stopped calling looper after the exception.
        attempts = 0
        while self.calls < 3 and attempts < 50:
            self.condition.wait(0.1)
            attempts += 1

    self.loop.stop()
    self.assertGreaterEqual(self.calls, 3)
def _leader_initialize(self):
    """Performs initialization routines that may require async processing.

    When looping is enabled, makes sure a control loop object exists and
    starts it with a fixed 300 second interval.
    """
    if not self.enable_loop:
        return

    loop = self.control_loop
    if not loop:
        loop = LoopingCall(self._loop_top)
        self.control_loop = loop
    loop.start(300)
def start(self):
    """Run the agent until the message consumer returns.

    Registers ``heartbeat`` as a dashi operation, starts the periodic
    heartbeat loop when ``self.start_beat`` is set, then blocks in
    ``dashi.consume()``. A KeyboardInterrupt is logged and swallowed.
    """
    log.info('EPUAgent starting')

    self.dashi.handle(self.heartbeat)

    self.loop = LoopingCall(self._loop)
    if self.start_beat:
        log.debug('Starting heartbeat loop - %s second interval', self.period)
        self.loop.start(self.period)

    try:
        self.dashi.consume()
    except KeyboardInterrupt:
        log.warning("Caught terminate signal. Exiting")
    else:
        log.info("Exiting normally.")
    finally:
        # Stop the loop this method started so that it does not keep
        # heartbeating after the consumer has gone away.
        if self.start_beat:
            self.loop.stop()
def test_start_stop(self):
    """Starting and immediately stopping a loop still yields one call."""
    looping = LoopingCall(self.looper, 1, hats=True)
    self.loop = looping

    looping.start(1)
    looping.stop()

    with self.condition:
        # the call may not have been recorded yet; wait up to 5 seconds
        if self.calls == 0:
            self.condition.wait(5)
        self.assertEqual(self.calls, 1)
        self.assertLastPassed(1, hats=True)
def test_called(self):
    """A self-stopping loop makes exactly three calls with its arguments."""
    # looper stops the loop by itself once three calls were made
    self.max_calls = 3
    looping = LoopingCall(self.looper, 1, 2, anarg=5)
    self.loop = looping

    # a zero interval makes it not block
    looping.start(0)
    self.assertTrue(self.stopped.wait(5))

    # peek into the looping call and join on its thread, if there is one
    worker = looping.thread
    if worker:
        worker.join()

    self.assertFalse(looping.running)
    self.assertEqual(self.calls, 3)
    for index in range(3):
        self.assertPassed(index, 1, 2, anarg=5)
def start(self):
    """Run the service until the message consumer returns.

    Registers the ``reconfigure_policy`` and ``dump`` operations, starts the
    periodic ``core.apply_policy`` loop and blocks in ``dashi.consume()``.
    A KeyboardInterrupt is logged and swallowed.
    """
    log.info("starting high availability instance %s", self)

    # Set up operations
    self.dashi.handle(self.reconfigure_policy)
    self.dashi.handle(self.dump)

    self.apply_policy_loop = LoopingCall(self.core.apply_policy)
    self.apply_policy_loop.start(self.policy_interval)

    try:
        self.dashi.consume()
    except KeyboardInterrupt:
        log.warning("Caught terminate signal. Bye!")
    else:
        log.info("Exiting normally. Bye!")
    finally:
        # Single stop point: also covers exceptions other than
        # KeyboardInterrupt, which previously left the loop running.
        self.apply_policy_loop.stop()
def _leader_initialize(self):
    """Performs initialization routines that may require async processing.

    Asks the provisioner to dump state for every instance whose state is
    below TERMINATED, then starts the control loop when looping is enabled.
    """
    # to make certain we have the latest records for instances, we request
    # the provisioner to dump state
    live_ids = []
    for owner, domain_id in self.epum_store.list_domains():
        domain = self.epum_store.get_domain(owner, domain_id)
        with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
            live_ids.extend(
                inst.instance_id
                for inst in domain.get_instances()
                if inst.state < InstanceState.TERMINATED)

    if live_ids:
        self.provisioner_client.dump_state(nodes=live_ids)

    # TODO: We need to make a decision about how an engine can be configured
    #       to fire vs. how the decider fires its top-loop. The decider's
    #       granularity controls minimums.
    # WARN: For now the engine-specific "pulse" configuration is ignored.
    if not self.enable_loop:
        return

    loop = self.control_loop
    if not loop:
        loop = LoopingCall(self._loop_top)
        self.control_loop = loop
    loop.start(self.loop_interval)
class EPUAgent(object):
    """Elastic Process Unit (EPU) Agent. Monitors vitals in running VMs.
    """

    def __init__(self, *args, **kwargs):
        """Load configuration and connect to the messaging layer.

        Each keyword argument below takes precedence over the matching
        ``epuagent`` configuration value when it is given and truthy.

        @param heartbeat_dest destination that heartbeats are fired to
        @param node_id id handed to EPUAgentCore
        @param heartbeat_op operation name used when firing heartbeats
        @param period_seconds heartbeat interval, converted to float
        @param start_heartbeat when false, start() does not start the loop
        @param amqp_uri passed through to bootstrap.dashi_connect
        @param supervisor_socket socket of a process supervisor to monitor
        """
        configs = ["epuagent"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        topic = self.CFG.epuagent.get('service_name')
        self.topic = topic or "epu_agent_%s" % uuid.uuid4()

        heartbeat_dest = kwargs.get('heartbeat_dest')
        self.heartbeat_dest = heartbeat_dest or self.CFG.epuagent.heartbeat_dest

        node_id = kwargs.get('node_id')
        self.node_id = node_id or self.CFG.epuagent.node_id

        heartbeat_op = kwargs.get('heartbeat_op')
        self.heartbeat_op = heartbeat_op or self.CFG.epuagent.heartbeat_op

        period = kwargs.get('period_seconds')
        self.period = float(period or self.CFG.epuagent.period_seconds)

        # for testing, allow for not starting heartbeat automatically
        self.start_beat = kwargs.get('start_heartbeat', True)

        amqp_uri = kwargs.get('amqp_uri')

        sock = kwargs.get('supervisor_socket')
        sock = sock or self.CFG.epuagent.get('supervisor_socket')
        if sock:
            log.debug("monitoring a process supervisor at: %s", sock)
            self.supervisor = Supervisor(sock)
        else:
            log.debug("not monitoring process supervisor")
            self.supervisor = None

        self.core = EPUAgentCore(self.node_id, supervisor=self.supervisor)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG, amqp_uri)

    def start(self):
        """Run the agent until the message consumer returns.

        Registers ``heartbeat`` as a dashi operation, starts the periodic
        heartbeat loop when ``self.start_beat`` is set, then blocks in
        ``dashi.consume()``. A KeyboardInterrupt is logged and swallowed.
        """
        log.info('EPUAgent starting')

        self.dashi.handle(self.heartbeat)

        self.loop = LoopingCall(self._loop)
        if self.start_beat:
            log.debug('Starting heartbeat loop - %s second interval', self.period)
            self.loop.start(self.period)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Exiting")
        else:
            log.info("Exiting normally.")
        finally:
            # Stop the loop this method started so that it does not keep
            # heartbeating after the consumer has gone away.
            if self.start_beat:
                self.loop.stop()

    def _loop(self):
        """Loop body: send one heartbeat."""
        return self.heartbeat()

    def heartbeat(self):
        """Fire the current core state to ``heartbeat_dest``.

        Errors are logged rather than raised.
        """
        try:
            state = self.core.get_state()
            self.dashi.fire(self.heartbeat_dest, self.heartbeat_op, heartbeat=state)
        except Exception as e:
            # unhandled exceptions will terminate the LoopingCall
            log.error('Error heartbeating: %s', e, exc_info=True)
class EPUMDecider(object):
    """The decider handles critical sections related to running decision engine cycles.

    In the future it may farm out subtasks to the EPUM workers (EPUMReactor) but currently all
    decision engine activity happens directly via the decider role.

    The instance of the EPUManagementService process that hosts a particular EPUMDecider instance
    might not be the elected decider.  When it is the elected decider, its EPUMDecider instance
    handles that functionality.  When it is not the elected decider, its EPUMDecider instance
    handles being available in the election.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    See: https://confluence.oceanobservatories.org/display/CIDev/EPUManagement+Refactor

    "I hear the voices [...] and I know the speculation.  But I'm the decider, and I decide what is best."
    """

    def __init__(self, epum_store, subscribers, provisioner_client, epum_client, dtrs_client,
                 disable_loop=False, base_provisioner_vars=None, loop_interval=5.0,
                 statsd_cfg=None):
        """
        @param epum_store State abstraction for all domains
        @type epum_store EPUMStore
        @param subscribers A way to signal state changes
        @param provisioner_client A way to launch/destroy VMs
        @param epum_client A way to launch subtasks to EPUM workers (reactor roles)
        @param dtrs_client A way to get information from dtrs
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        @param base_provisioner_vars base vars given to every launch
        @param loop_interval interval handed to the control loop, converted to float
        @param statsd_cfg optional mapping with "host" and "port"; when given, a
               StatsClient is created (a failure to do so is logged, not raised)
        """
        self.epum_store = epum_store
        self.subscribers = subscribers
        self.provisioner_client = provisioner_client
        self.epum_client = epum_client
        self.dtrs_client = dtrs_client

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.loop_interval = float(loop_interval)
        self.is_leader = False

        # these are given to every launch after engine-provided vars are folded in
        self.base_provisioner_vars = base_provisioner_vars

        # The instances of Engine that make the control decisions for each domain
        self.engines = {}

        # the versions of the engine configs currently applied
        self.engine_config_versions = {}

        # The instances of Control (stateful) that are passed to each Engine to get info and execute cmds
        self.controls = {}

        self.statsd_client = None
        if statsd_cfg is not None:
            try:
                host = statsd_cfg["host"]
                port = statsd_cfg["port"]
                log.info("Setting up statsd client with host %s and port %d", host, port)
                self.statsd_client = StatsClient(host, port)
            except Exception:
                # best effort: metrics are optional. Narrowed from a bare
                # except so KeyboardInterrupt/SystemExit still propagate.
                log.exception("Failed to set up statsd client")

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_decider(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the decider leader.

        When block is true, waits until leader dies or is cancelled

        @raise ValueError if block is true but there is no control loop
        """
        log.info("Elected as Decider leader")

        # Mark leadership before the control loop is started: the loop may
        # invoke _loop_top from its own thread as soon as it starts, and
        # that pass bails out while is_leader is still False.
        self.is_leader = True
        try:
            self._leader_initialize()
        except Exception:
            # keep the previous outcome of a failed initialization
            self.is_leader = False
            raise

        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the decider leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """

        # to make certain we have the latest records for instances, we request provisioner to dump state
        instance_ids = []
        for owner, domain_id in self.epum_store.list_domains():
            domain = self.epum_store.get_domain(owner, domain_id)

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                for instance in domain.get_instances():
                    if instance.state < InstanceState.TERMINATED:
                        instance_ids.append(instance.instance_id)

        if instance_ids:
            self.provisioner_client.dump_state(nodes=instance_ids)

        # TODO: We need to make a decision about how an engine can be configured to fire vs. how the
        #       decider fires its top-loop.  The decider's granularity controls minimums.
        # WARN: For now the engine-specific "pulse" configuration is ignored.
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(self.loop_interval)

    def _loop_top(self):
        """Every iteration of the decider loop, the following happens:

        1. Refresh state.  The EPUM worker processes are constantly updating persistence about the
        state of instances.  We do not suffer from efficiency fears here (without evidence).

        2. In particular, refresh the master domain list.  Some may have been created/removed in the meantime.
        Or this could be the first time this decider is the leader and the engine instances need to be
        created.

        3. For each new domain, create an engine instance and initialize it.

        4. For each pre-existing domain that is not marked as removed:
           A. Check if it has been reconfigured in the meantime.  If so, call reconfigure on the engine.
           B. Run decision cycle.
        """
        # NOTE(review): not read anywhere in the visible part of this
        # method; kept in case later code relies on it.
        before = time.time()
        domains = self.epum_store.get_all_domains()

        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        # look for domains that are not active anymore
        active_domains = {}
        for domain in domains:
            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                if domain.is_removed():
                    self._shutdown_domain(domain)
                else:
                    active_domains[domain.key] = domain

                    if domain.key not in self.engines:
                        # New engines (new to this decider instance, at least)
                        try:
                            self._new_engine(domain)
                        except Exception as e:
                            log.error("Error creating engine '%s' for user '%s': %s",
                                      domain.domain_id, domain.owner, str(e), exc_info=True)

        if self.statsd_client is not None:
            try:
                self.statsd_client.gauge("active_domains", len(active_domains))
            except Exception:
                # metrics must never break the decision loop
                log.exception("Failed to submit metrics")

        for key in self.engines:
            # Perhaps in the meantime, the leader connection failed, bail early
            if not self.is_leader:
                return

            domain = active_domains.get(key)
            if not domain:
                continue

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                engine_conf, version = domain.get_versioned_engine_config()
                if version > self.engine_config_versions[key]:
                    try:
                        self.engines[key].reconfigure(self.controls[key], engine_conf)
                        # only record the version once reconfigure succeeded
                        self.engine_config_versions[key] = version
                    except Exception as e:
                        log.error("Error in reconfigure call for user '%s' domain '%s': %s",
                                  domain.owner, domain.domain_id, str(e), exc_info=True)

                self._get_engine_sensor_state(domain)
                engine_state = domain.get_engine_state()

                self._retry_domain_pending_actions(domain, engine_state.instances)

                try:
                    self.engines[key].decide(self.controls[key], engine_state)
                except Exception as e:
                    # TODO: if failure, notify creator
                    # TODO: If initialization fails, the engine won't be added to the list and it will be
                    #       attempted over and over.  There could be a retry limit?  Or just once is enough.
                    log.error("Error in decide call for user '%s' domain '%s': %s",
                              domain.owner, domain.domain_id, str(e), exc_info=True)
class EPUMReaper(object):
    """This process infrequently queries each domain in the datastore. It finds
    VM records in a terminal state past the threshold and removes them.

    The instance of the EPUManagementService process that hosts a particular EPUMReaper instance
    might not be the elected reaper.  When it is the elected reaper, this EPUMReaper instance
    handles that functionality.  When it is not the elected reaper, this EPUMReaper instance
    handles being available in the election.
    """

    def __init__(self, epum_store, record_reaping_max_age, disable_loop=False,
                 loop_interval=300):
        """
        @param epum_store State abstraction for all EPUs
        @param record_reaping_max_age Instance records older than record_reaping_max_age will be deleted
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        @param loop_interval interval handed to the control loop (default 300,
               the value that used to be hard-coded)
        """
        self.epum_store = epum_store
        self.record_reaping_max_age = record_reaping_max_age

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.loop_interval = loop_interval
        self.is_leader = False

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_reaper(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the reaper leader.

        When block is true, joins the control loop thread.

        @raise ValueError if block is true but there is no control loop
        """
        log.info("Elected as Reaper leader")

        # Mark leadership before the control loop is started: the loop may
        # invoke _loop_top from its own thread as soon as it starts, and
        # _loop_top returns immediately while is_leader is still False,
        # which would postpone the first pass by a full interval.
        self.is_leader = True
        try:
            self._leader_initialize()
        except Exception:
            # keep the previous outcome of a failed initialization
            self.is_leader = False
            raise

        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the reaper leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(self.loop_interval)

    def _loop_top(self):
        """Run the reaper loop.

        Every time this runs, each domain is checked for instances in terminal
        states TERMINATED, FAILED, or REJECTED.  They are deleted if they are
        older than self.record_reaping_max_age.
        """
        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        now = time.time()
        terminal_states = (states.TERMINATED, states.FAILED, states.REJECTED)
        domains = self.epum_store.get_all_domains()

        for domain in domains:
            # Leadership may be lost while a pass is in progress; re-check
            # before touching each domain.
            if not self.is_leader:
                return

            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                if domain.is_removed():
                    continue

                for instance in domain.get_instances():
                    log.info("Instance is %s", instance['state'])
                    if instance['state'] not in terminal_states:
                        continue

                    state_time = instance['state_time']
                    if now > state_time + self.record_reaping_max_age:
                        log.info("Removing instance %s with no state change for %f seconds",
                                 instance['instance_id'], now - state_time)
                        domain.remove_instance(instance['instance_id'])
class EPUMDoctor(object):
    """The doctor handles critical sections related to 'pronouncing' a VM instance unhealthy.

    In the future it may farm out subtasks to the EPUM workers (EPUMReactor) but currently all
    health-check activity happens directly via the doctor role.

    The instance of the EPUManagementService process that hosts a particular EPUMDoctor instance
    might not be the elected doctor.  When it is the elected doctor, this EPUMDoctor instance
    handles that functionality.  When it is not the elected doctor, this EPUMDoctor instance
    handles being available in the election.

    See: https://confluence.oceanobservatories.org/display/syseng/CIAD+CEI+OV+Elastic+Computing
    See: https://confluence.oceanobservatories.org/display/CIDev/EPUManagement+Refactor
    """

    def __init__(self, epum_store, notifier, provisioner_client, epum_client,
                 ouagent_client, disable_loop=False, loop_interval=10):
        """
        @param epum_store State abstraction for all EPUs
        @param notifier A way to signal state changes
        @param provisioner_client A way to destroy VMs
        @param epum_client A way to launch subtasks to EPUM workers (reactor roles) (TODO: not sure if needed)
        @param ouagent_client See OUAgent dump_state() in architecture documentation
        @param disable_loop For unit/integration tests, don't run a timed decision loop
        @param loop_interval interval handed to the control loop (default 10,
               the value that used to be hard-coded)
        """
        self.epum_store = epum_store
        self.notifier = notifier
        self.provisioner_client = provisioner_client
        self.epum_client = epum_client
        self.ouagent_client = ouagent_client

        self.control_loop = None
        self.enable_loop = not disable_loop
        self.loop_interval = loop_interval
        self.is_leader = False

        # The instances of HealthMonitor that make the health decisions for each domain
        self.monitors = {}

    def recover(self):
        """Called whenever the whole EPUManagement instance is instantiated.
        """
        # For callbacks: "now_leader()" and "not_leader()"
        self.epum_store.register_doctor(self)

    def now_leader(self, block=False):
        """Called when this instance becomes the doctor leader.

        When block is true, joins the control loop thread.

        @raise ValueError if block is true but there is no control loop
        """
        log.info("Elected as Doctor leader")

        # Mark leadership before the control loop is started: the loop may
        # invoke _loop_top from its own thread as soon as it starts, and
        # _loop_top returns immediately while is_leader is still False.
        self.is_leader = True
        try:
            self._leader_initialize()
        except Exception:
            # keep the previous outcome of a failed initialization
            self.is_leader = False
            raise

        if block:
            if self.control_loop:
                self.control_loop.thread.join()
            else:
                raise ValueError("cannot block without a control loop")

    def not_leader(self):
        """Called when this instance is known not to be the doctor leader.
        """
        if self.control_loop:
            self.control_loop.stop()
            self.control_loop = None
        self.is_leader = False

    def _leader_initialize(self):
        """Performs initialization routines that may require async processing
        """
        if self.enable_loop:
            if not self.control_loop:
                self.control_loop = LoopingCall(self._loop_top)
            self.control_loop.start(self.loop_interval)

    def _loop_top(self, timestamp=None):
        """Run the doctor decider loop.

        Every time this runs, monitors of domains that are no longer active
        are dropped, monitors are created for newly active domains, and each
        remaining monitor's update() is called.

        @param timestamp passed unchanged to every monitor's update()
        """
        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        domains = self.epum_store.get_all_domains()
        active_domains = {}
        for domain in domains:
            with EpuLoggerThreadSpecific(domain=domain.domain_id, user=domain.owner):
                if not domain.is_removed():
                    active_domains[domain.key] = domain

        # Perhaps in the meantime, the leader connection failed, bail early
        if not self.is_leader:
            return

        # Monitors that are not active anymore; iterate over a snapshot of
        # the keys because entries are deleted along the way
        for key in list(self.monitors):
            if key not in active_domains:
                del self.monitors[key]

        # New health monitors (new to this doctor instance, at least)
        new_keys = [key for key in active_domains if key not in self.monitors]
        for domain_key in new_keys:
            try:
                self._new_monitor(active_domains[domain_key])
            except Exception as e:
                log.error("Error creating health monitor for '%s': %s",
                          domain_key, str(e), exc_info=True)

        for domain_key in list(self.monitors):
            # Perhaps in the meantime, the leader connection failed, bail early
            if not self.is_leader:
                return

            try:
                self.monitors[domain_key].update(timestamp)
            except Exception as e:
                log.error("Error in doctor's update call for '%s': %s",
                          domain_key, str(e), exc_info=True)
def test_start_stop_2(self):
    """With ``now=False`` a loop stopped right after start makes no call."""
    looping = LoopingCall(self.looper, 1, hats=True)
    self.loop = looping

    looping.start(1, now=False)
    looping.stop()

    self.assertEqual(self.calls, 0)
class LoopingCallTests(unittest.TestCase):
    """Tests for LoopingCall driven through the ``looper`` callback."""

    def setUp(self):
        self.calls = 0
        self.passed = []
        self.condition = threading.Condition()

        self.loop = None

        # tests can set this to make looper stop itself after a specified
        # number of calls. self.loop must also be set.
        self.max_calls = None

        # tests can set this to make looper raise an exception
        self.raise_this = None

        # when looper kills itself, it will set this event
        self.stopped = threading.Event()

    def tearDown(self):
        if self.loop:
            # peek into loop and make sure thread is joined
            self.loop.stop()
            thread = self.loop.thread
            if thread:
                thread.join()

    def assertPassed(self, index, *args, **kwargs):
        """Assert that call number ``index`` received exactly these arguments."""
        passed_args, passed_kwargs = self.passed[index]
        self.assertEqual(args, passed_args)
        self.assertEqual(kwargs, passed_kwargs)

    def assertLastPassed(self, *args, **kwargs):
        """Assert that the most recent call received exactly these arguments."""
        self.assertPassed(-1, *args, **kwargs)

    def looper(self, *args, **kwargs):
        """Callback handed to LoopingCall.

        Records the call and its arguments, wakes any waiter on
        ``self.condition``, stops the loop once ``max_calls`` is reached and
        finally raises ``raise_this`` when a test has set it.
        """
        with self.condition:
            self.calls += 1
            self.passed.append((args, kwargs))
            self.condition.notify_all()

            if self.max_calls and self.calls >= self.max_calls:
                self.loop.stop()
                self.stopped.set()

        if self.raise_this:
            raise self.raise_this

    def test_start_stop(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1)
        loop.stop()

        with self.condition:
            # the call may not have been recorded yet; wait up to 5 seconds
            if not self.calls:
                self.condition.wait(5)
            self.assertEqual(self.calls, 1)
            self.assertLastPassed(1, hats=True)

    def test_start_stop_2(self):
        self.loop = loop = LoopingCall(self.looper, 1, hats=True)

        loop.start(1, now=False)
        loop.stop()
        self.assertEqual(self.calls, 0)

    def test_called(self):
        # looper will stop itself after 3 calls
        self.max_calls = 3
        self.loop = loop = LoopingCall(self.looper, 1, 2, anarg=5)

        # interval of 0 makes it not block
        loop.start(0)
        self.assertTrue(self.stopped.wait(5))

        # peek into looping call and join on thread
        thread = loop.thread
        if thread:
            thread.join()

        self.assertFalse(loop.running)
        self.assertEqual(self.calls, 3)
        self.assertPassed(0, 1, 2, anarg=5)
        self.assertPassed(1, 1, 2, anarg=5)
        self.assertPassed(2, 1, 2, anarg=5)

    def test_error_caught(self):
        self.loop = LoopingCall(self.looper)
        self.raise_this = Exception("too many sandwiches")

        self.loop.start(0)

        with self.condition:
            # Bounded wait (roughly 5 seconds in total). An untimed wait()
            # would hang the whole test run, instead of failing, if the loop
            # ever stopped calling looper after the exception.
            attempts = 0
            while self.calls < 3 and attempts < 50:
                self.condition.wait(0.1)
                attempts += 1

        self.loop.stop()
        self.assertGreaterEqual(self.calls, 3)
class HighAvailabilityService(object):
    """Messaging front end that runs a HighAvailabilityCore policy loop."""

    def __init__(self, *args, **kwargs):
        """Load configuration, connect to dashi and build the HA core.

        Each keyword argument below takes precedence over the matching
        configuration value when it is given and truthy.

        @param exchange overrides CFG.server.amqp.exchange
        @param service_name topic to connect with
        @param amqp_uri passed through to bootstrap.dashi_connect
        @param sysname passed through to bootstrap.dashi_connect
        @param process_dispatchers handed to the process control and the core
        @param policy_parameters handed to the core as ``parameters``
        @param process_definition_id handed to the core
        @param policy_interval interval of the apply_policy loop
        @raise Exception if the configured policy name is not in policy_map
        """
        configs = ["service", "highavailability"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)

        exchange = kwargs.get('exchange')
        if exchange:
            self.CFG.server.amqp.exchange = exchange

        self.topic = (kwargs.get('service_name') or
                      self.CFG.highavailability.get('service_name') or
                      DEFAULT_TOPIC)

        self.amqp_uri = kwargs.get('amqp_uri') or None
        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG, self.amqp_uri,
                                             sysname=kwargs.get('sysname'))

        process_dispatchers = (kwargs.get('process_dispatchers') or
                               self.CFG.highavailability.processdispatchers)

        policy_name = self.CFG.highavailability.policy.name
        policy_key = policy_name.lower()
        try:
            # the lookup only validates that the policy is known
            policy_map[policy_key]
        except KeyError:
            raise Exception("HA Service doesn't support '%s' policy" % policy_name)
        self.policy = policy_key

        policy_parameters = (kwargs.get('policy_parameters') or
                             self.CFG.highavailability.policy.parameters)

        process_definition_id = (kwargs.get('process_definition_id') or
                                 self.CFG.highavailability.process_definition_id)

        self.policy_interval = (kwargs.get('policy_interval') or
                                self.CFG.highavailability.policy.interval)

        self.control = DashiHAProcessControl(self.dashi, process_dispatchers)

        core = HighAvailabilityCore
        self.core = core(self.CFG.highavailability, self.control,
                         process_dispatchers, self.policy,
                         parameters=policy_parameters,
                         process_definition_id=process_definition_id)

    def start(self):
        """Run the service until the message consumer returns.

        Registers the ``reconfigure_policy`` and ``dump`` operations, starts
        the periodic ``core.apply_policy`` loop and blocks in
        ``dashi.consume()``. A KeyboardInterrupt is logged and swallowed.
        """
        log.info("starting high availability instance %s", self)

        # Set up operations
        self.dashi.handle(self.reconfigure_policy)
        self.dashi.handle(self.dump)

        self.apply_policy_loop = LoopingCall(self.core.apply_policy)
        self.apply_policy_loop.start(self.policy_interval)

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Bye!")
        else:
            log.info("Exiting normally. Bye!")
        finally:
            # Single stop point: also covers exceptions other than
            # KeyboardInterrupt, which previously left the loop running.
            self.apply_policy_loop.stop()

    def stop(self):
        """Cancel consumption and disconnect from dashi."""
        self.dashi.cancel()
        self.dashi.disconnect()

    def reconfigure_policy(self, new_policy):
        """Service operation: Change the parameters of the policy used for service

        @param new_policy: parameters of policy
        @return:
        """
        self.core.reconfigure_policy(new_policy)

    def status(self):
        """Service operation: Get the status of the HA Service

        @return: {PENDING, READY, STEADY, BROKEN}
        """
        # NOTE(review): unlike reconfigure_policy and dump, this is not
        # registered via dashi.handle() in start() -- confirm whether intended.
        return self.core.status()

    def dump(self):
        """Dump state of ha core
        """
        return self.core.dump()