def __init__(self, provisioner_client, engineclass, controller_name,
             conf=None, state=None, store=None):
    """Assemble the controller core: state, health monitor, control, engine.

    :param provisioner_client: kept on ``self.provisioner_client`` and
        handed to ControllerCoreControl.
    :param engineclass: passed to ``EngineLoader().load`` to obtain the
        decision engine.
    :param controller_name: passed to ControllerCoreControl.
    :param conf: optional mapping. PROVISIONER_VARS_KEY supplies the
        provisioner vars; a truthy MONITOR_HEALTH_KEY enables health
        monitoring, with HEALTH_BOOT_KEY / HEALTH_MISSING_KEY /
        HEALTH_ZOMBIE_KEY optionally overriding the monitor timeouts.
        Stored unchanged on ``self.conf``.
    :param state: existing state object to use; when falsy a new
        ControllerCoreState is created.
    :param store: backing store for a newly created state; ignored when
        ``state`` is supplied, defaults to a fresh ControllerStore.
    """
    if state:
        self.state = state
    else:
        self.state = ControllerCoreState(store or ControllerStore())

    prov_vars = None
    health_kwargs = None
    if conf:
        # `in` instead of dict.has_key(): has_key() no longer exists on
        # Python 3 dicts, and the checks below already use `in`.
        if PROVISIONER_VARS_KEY in conf:
            prov_vars = conf[PROVISIONER_VARS_KEY]

        if conf.get(MONITOR_HEALTH_KEY):
            # Only forward the timeouts that were explicitly configured so
            # HealthMonitor's own defaults apply to the rest.
            health_kwargs = {}
            if HEALTH_BOOT_KEY in conf:
                health_kwargs['boot_seconds'] = conf[HEALTH_BOOT_KEY]
            if HEALTH_MISSING_KEY in conf:
                health_kwargs['missing_seconds'] = conf[HEALTH_MISSING_KEY]
            if HEALTH_ZOMBIE_KEY in conf:
                health_kwargs['zombie_seconds'] = conf[HEALTH_ZOMBIE_KEY]
    self.conf = conf

    # health_kwargs stays None unless monitoring was switched on in conf;
    # an empty dict still means "monitor with default timeouts".
    if health_kwargs is not None:
        self.health_monitor = HealthMonitor(self.state, **health_kwargs)
    else:
        self.health_monitor = None

    # There can only ever be one 'reconfigure' or 'decide' engine call run
    # at ANY time.  The 'decide' call is triggered via timed looping call
    # and 'reconfigure' is triggered asynchronously at any moment.
    self.busy = defer.DeferredSemaphore(1)

    self.provisioner_client = provisioner_client

    health_not_checked = self.health_monitor is None
    self.control = ControllerCoreControl(
        provisioner_client, self.state, prov_vars, controller_name,
        health_not_checked=health_not_checked)
    self.engine = EngineLoader().load(engineclass)

    self.control_loop = None
def test_process_error(self):
    """A failed-process heartbeat yields PROCESS_ERROR and keeps its stderr.

    The second error heartbeat omits ``stderr``; the test expects the
    single recorded error to still carry the stderr from the first one.
    """
    timeouts = {'boot_seconds': 10, 'missing_seconds': 5,
                'zombie_seconds': 10, 'init_time': 0}
    self.monitor = HealthMonitor(self.state, **timeouts)

    node_id = str(uuid.uuid4())

    def check_single_error():
        # exactly one error is recorded and it holds the original stderr
        recorded = self.state.instances[node_id].errors
        self.assertEqual(len(recorded), 1)
        self.assertEqual(recorded[0]['stderr'], 'faaaaaail')

    # t=1: instance is running and reports healthy
    self.state.new_fake_instance_state(node_id, InstanceStates.RUNNING, 1)
    yield self.ok_heartbeat(node_id, 1)
    yield self.monitor.update(1)
    self.assertNodeState(InstanceHealthState.OK, node_id)

    # t=5: heartbeat reports a failed process, stderr included
    failed_proc = {
        'name': 'proc1',
        'stderr': 'faaaaaail',
        'state': 100,
        'exitcode': -1,
        'stop_timestamp': 25242,
    }
    failed_procs = [failed_proc]
    yield self.err_heartbeat(node_id, 5, failed_procs)
    yield self.monitor.update(5)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
    check_single_error()

    # t=8: same failure reported again, this time without stderr
    del failed_proc['stderr']
    yield self.err_heartbeat(node_id, 8, failed_procs)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node_id)
    check_single_error()
def test_error(self):
    """An error heartbeat without process details yields MONITOR_ERROR.

    The state is asserted both right after the heartbeat and again after
    the next monitor update.
    """
    health = InstanceHealthState
    self.monitor = HealthMonitor(
        self.state,
        boot_seconds=10,
        missing_seconds=5,
        zombie_seconds=10,
        init_time=0,
    )

    node_id = str(uuid.uuid4())

    # t=1: running instance checks in healthy
    self.state.new_fake_instance_state(node_id, InstanceStates.RUNNING, 1)
    yield self.ok_heartbeat(node_id, 1)
    yield self.monitor.update(1)
    self.assertNodeState(health.OK, node_id)

    # t=5: error heartbeat takes effect before any monitor update
    yield self.err_heartbeat(node_id, 5)
    self.assertNodeState(health.MONITOR_ERROR, node_id)
    recorded = self.state.instances[node_id].errors
    self.assertEqual(len(recorded), 1)
    self.assertEqual(recorded[0], 'faiiiill')

    # the update at the same timestamp must not change the verdict
    yield self.monitor.update(5)
    self.assertNodeState(health.MONITOR_ERROR, node_id)
def test_recovery(self):
    """Check health tracking of instances that predate the monitor.

    The monitor is created with init_time=100 while every fake instance
    reached its IaaS state earlier. The assertions expect the boot and
    missing timeouts to be counted from the init time, not from the
    instances' own timestamps.
    """
    self.monitor = HealthMonitor(self.state, boot_seconds=10,
                                 missing_seconds=5, zombie_seconds=10,
                                 init_time=100)

    nodes = ["n" + str(i + 1) for i in range(7)]
    n1, n2, n3, n4, n5, n6, n7 = nodes

    # set up some instances that reached their iaas_state before the
    # init time (100)

    # this one has been running for well longer than the missing timeout
    # and we will have not received a heartbeat. It shouldn't be marked
    # MISSING until more than 5 seconds after the init_time
    self.state.new_fake_instance_state(n1, InstanceStates.RUNNING, 50,
                                       InstanceHealthState.OK)

    # this has been running for 10 seconds before the init time but we
    # have never received a heartbeat. It should be marked as MISSING
    # after the boot timeout expires, starting from the init time.
    self.state.new_fake_instance_state(n2, InstanceStates.RUNNING, 90,
                                       InstanceHealthState.UNKNOWN)

    # is terminated and nothing should happen
    self.state.new_fake_instance_state(n3, InstanceStates.TERMINATED, 90,
                                       InstanceHealthState.UNKNOWN)

    # this one will get a heartbeat at 110, just before it would be
    # marked MISSING
    self.state.new_fake_instance_state(n4, InstanceStates.RUNNING, 95,
                                       InstanceHealthState.UNKNOWN)

    # this one will get a heartbeat at 105, just before it would be
    # marked MISSING
    self.state.new_fake_instance_state(n5, InstanceStates.RUNNING, 95,
                                       InstanceHealthState.OK)

    # this instance was already marked as errored before the recovery
    self.state.new_fake_instance_state(n6, InstanceStates.RUNNING, 95,
                                       InstanceHealthState.PROCESS_ERROR)

    # this instance was a ZOMBIE, it should be initially marked back as
    # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
    self.state.new_fake_instance_state(n7, InstanceStates.TERMINATED, 80,
                                       InstanceHealthState.ZOMBIE)

    yield self.monitor.update(100)
    self.assertNodeState(InstanceHealthState.OK, n1, n5)
    self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

    yield self.monitor.update(105)
    self.assertNodeState(InstanceHealthState.OK, n1, n5)
    self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

    # Heartbeats are yielded, as in the other tests of this monitor, so
    # each one has been fully handled before the next update runs.
    yield self.ok_heartbeat(n5, 105)
    # n7 is terminated: this heartbeat will get it relabeled as a zombie
    yield self.ok_heartbeat(n7, 105)
    yield self.err_heartbeat(n6, 105, procs=['a'])

    yield self.monitor.update(106)
    self.assertNodeState(InstanceHealthState.OK, n5)
    self.assertNodeState(InstanceHealthState.MISSING, n1)
    self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
    self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    yield self.ok_heartbeat(n5, 110)
    yield self.monitor.update(110)
    self.assertNodeState(InstanceHealthState.OK, n5)
    self.assertNodeState(InstanceHealthState.MISSING, n1)
    self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
    self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    yield self.ok_heartbeat(n4, 110)
    yield self.err_heartbeat(n6, 110, procs=['a'])
    yield self.monitor.update(111)
    self.assertNodeState(InstanceHealthState.OK, n5, n4)
    self.assertNodeState(InstanceHealthState.MISSING, n1, n2)
    self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
    self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
    self.assertNodeState(InstanceHealthState.ZOMBIE, n7)
def test_basic(self):
    """Walk three instances through the basic health-state transitions.

    Uses small fake timestamps (not wall-clock time) to step through
    UNKNOWN, OK and MISSING while the instances are running, then ZOMBIE
    and back to UNKNOWN once they are terminated.
    """
    health = InstanceHealthState
    expect = self.assertNodeState
    beat = self.ok_heartbeat

    self.monitor = HealthMonitor(
        self.state,
        boot_seconds=10,
        missing_seconds=5,
        zombie_seconds=10,
        init_time=0,
    )

    nodes = [str(uuid.uuid4()) for _ in range(3)]
    n1, n2, n3 = nodes

    # t=0: all nodes are running but haven't been heard from
    for node_id in nodes:
        self.state.new_fake_instance_state(node_id,
                                           InstanceStates.RUNNING, 0)
    expect(health.UNKNOWN, *nodes)
    yield self.monitor.update(0)
    expect(health.UNKNOWN, *nodes)

    # t=5: still nothing heard, then the first heartbeat arrives for n1
    yield self.monitor.update(5)
    expect(health.UNKNOWN, *nodes)
    yield beat(n1, 5)
    expect(health.OK, n1)

    # t=10: the silent nodes are still only UNKNOWN
    yield self.monitor.update(10)
    expect(health.OK, n1)
    expect(health.UNKNOWN, n2, n3)

    # n1 makes it in under the wire
    yield beat(n1, 10)
    yield beat(n2, 10)

    # t=11: n3 never reported and is now MISSING
    yield self.monitor.update(11)
    expect(health.OK, n1, n2)
    expect(health.MISSING, n3)

    yield beat(n3, 11)
    expect(health.OK, *nodes)

    # ok don't hear from n2 for a while, should go missing
    yield beat(n1, 13)

    yield self.monitor.update(16)
    expect(health.OK, n1, n3)
    expect(health.MISSING, n2)

    yield beat(n2, 16)
    expect(health.OK, *nodes)

    # t=20: roll all nodes to terminated in IaaS
    for node_id in nodes:
        self.state.new_fake_instance_state(node_id,
                                           InstanceStates.TERMINATED, 20)

    # been longer than missing window for n1 but shouldn't matter
    yield self.monitor.update(20)
    expect(health.OK, *nodes)

    yield beat(n1, 30)
    yield self.monitor.update(30)
    # not a zombie yet
    expect(health.OK, *nodes)

    # t=31: a heartbeat from the terminated n1 now makes it a ZOMBIE
    yield self.monitor.update(31)
    expect(health.OK, n1)
    yield beat(n1, 31)
    yield self.monitor.update(31)
    expect(health.ZOMBIE, n1)

    # t=42: with no further heartbeats n1 falls back to UNKNOWN
    yield self.monitor.update(42)
    expect(health.UNKNOWN, n1)